fst commit

2023-06-16 10:42:02 +08:00
parent 5793dd0af6
commit 0136d3c2da
122 changed files with 25173 additions and 25 deletions
--- a/algorithm/common_det/cuda/common_det_cuda_impl.cpp
+++ b/algorithm/common_det/cuda/common_det_cuda_impl.cpp
@@ -0,0 +1,310 @@
+#include "common_det_cuda_impl.h"
+#include <cmath>
+#include <fstream>
+
+#define SV_MODEL_DIR "/SpireCV/models/"
+#define SV_ROOT_DIR "/SpireCV/"
+
+
+#ifdef WITH_CUDA
+#include "yolov7/cuda_utils.h"
+#include "yolov7/logging.h"
+#include "yolov7/utils.h"
+#include "yolov7/preprocess.h"
+#include "yolov7/postprocess.h"
+#include "yolov7/model.h"
+#define TRTCHECK(status)                                 \
+  do                                                     \
+  {                                                      \
+    auto ret = (status);                                 \
+    if (ret != 0)                                        \
+    {                                                    \
+      std::cerr << "Cuda failure: " << ret << std::endl; \
+      abort();                                           \
+    }                                                    \
+  } while (0)
+
+#define DEVICE 0  // GPU id
+#define BATCH_SIZE 1
+#define MAX_IMAGE_INPUT_SIZE_THRESH 3000 * 3000 // ensure it exceed the maximum size in the input images !
+#endif
+
+
+namespace sv {
+
+using namespace cv;
+
+
+#ifdef WITH_CUDA
+using namespace nvinfer1;
+static Logger g_nvlogger;
+const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
+const static int kOutputSize1 = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
+const static int kOutputSize2 = 32 * (640 / 4) * (640 / 4);
+#endif
+
+
+CommonObjectDetectorCUDAImpl::CommonObjectDetectorCUDAImpl()
+{
+#ifdef WITH_CUDA
+  this->_gpu_buffers[0] = nullptr;
+  this->_gpu_buffers[1] = nullptr;
+  this->_gpu_buffers[2] = nullptr;
+  this->_cpu_output_buffer = nullptr;
+  this->_cpu_output_buffer1 = nullptr;
+  this->_cpu_output_buffer2 = nullptr;
+  this->_context = nullptr;
+  this->_engine = nullptr;
+  this->_runtime = nullptr;
+#endif
+}
+
+
+CommonObjectDetectorCUDAImpl::~CommonObjectDetectorCUDAImpl()
+{
+#ifdef WITH_CUDA
+  // Release stream and buffers
+  cudaStreamDestroy(_stream);
+  if (_gpu_buffers[0])
+    CUDA_CHECK(cudaFree(_gpu_buffers[0]));
+  if (_gpu_buffers[1])
+    CUDA_CHECK(cudaFree(_gpu_buffers[1]));
+  if (_gpu_buffers[2])
+    CUDA_CHECK(cudaFree(_gpu_buffers[2]));
+  if (_cpu_output_buffer)
+    delete[] _cpu_output_buffer;
+  if (_cpu_output_buffer1)
+    delete[] _cpu_output_buffer1;
+  if (_cpu_output_buffer2)
+    delete[] _cpu_output_buffer2;
+  cuda_preprocess_destroy();
+  // Destroy the engine
+  if (_context)
+    _context->destroy();
+  if (_engine)
+    _engine->destroy();
+  if (_runtime)
+    _runtime->destroy();
+#endif
+}
+
+
+#ifdef WITH_CUDA
+void infer(IExecutionContext& context, cudaStream_t& stream, void** gpu_buffers, float* output, int batchsize) {
+  context.enqueue(batchsize, gpu_buffers, stream, nullptr);
+  // context.enqueueV2(gpu_buffers, stream, nullptr);
+  CUDA_CHECK(cudaMemcpyAsync(output, gpu_buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+}
+void infer_seg(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* output1, float* output2, int batchSize) {
+  context.enqueue(batchSize, buffers, stream, nullptr);
+  // context.enqueueV2(buffers, stream, nullptr);
+  CUDA_CHECK(cudaMemcpyAsync(output1, buffers[1], batchSize * kOutputSize1 * sizeof(float), cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(output2, buffers[2], batchSize * kOutputSize2 * sizeof(float), cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+}
+void CommonObjectDetectorCUDAImpl::_prepare_buffers(int input_h, int input_w) {
+  assert(this->_engine->getNbBindings() == 2);
+  // In order to bind the buffers, we need to know the names of the input and output tensors.
+  // Note that indices are guaranteed to be less than IEngine::getNbBindings()
+  const int inputIndex = this->_engine->getBindingIndex(kInputTensorName);
+  const int outputIndex = this->_engine->getBindingIndex(kOutputTensorName);
+  assert(inputIndex == 0);
+  assert(outputIndex == 1);
+  // Create GPU buffers on device
+  CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[0]), kBatchSize * 3 * input_h * input_w * sizeof(float)));
+  CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[1]), kBatchSize * kOutputSize * sizeof(float)));
+
+  this->_cpu_output_buffer = new float[kBatchSize * kOutputSize];
+}
+void CommonObjectDetectorCUDAImpl::_prepare_buffers_seg(int input_h, int input_w) {
+  assert(this->_engine->getNbBindings() == 3);
+  // In order to bind the buffers, we need to know the names of the input and output tensors.
+  // Note that indices are guaranteed to be less than IEngine::getNbBindings()
+  const int inputIndex = this->_engine->getBindingIndex(kInputTensorName);
+  const int outputIndex1 = this->_engine->getBindingIndex(kOutputTensorName);
+  const int outputIndex2 = this->_engine->getBindingIndex("proto");
+  assert(inputIndex == 0);
+  assert(outputIndex1 == 1);
+  assert(outputIndex2 == 2);
+
+  // Create GPU buffers on device
+  CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[0]), kBatchSize * 3 * input_h * input_w * sizeof(float)));
+  CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[1]), kBatchSize * kOutputSize1 * sizeof(float)));
+  CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[2]), kBatchSize * kOutputSize2 * sizeof(float)));
+
+  // Alloc CPU buffers
+  this->_cpu_output_buffer1 = new float[kBatchSize * kOutputSize1];
+  this->_cpu_output_buffer2 = new float[kBatchSize * kOutputSize2];
+}
+void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
+  std::ifstream file(engine_name, std::ios::binary);
+  if (!file.good()) {
+    std::cerr << "read " << engine_name << " error!" << std::endl;
+    assert(false);
+  }
+  size_t size = 0;
+  file.seekg(0, file.end);
+  size = file.tellg();
+  file.seekg(0, file.beg);
+  char* serialized_engine = new char[size];
+  assert(serialized_engine);
+  file.read(serialized_engine, size);
+  file.close();
+
+  *runtime = createInferRuntime(g_nvlogger);
+  assert(*runtime);
+  *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
+  assert(*engine);
+  *context = (*engine)->createExecutionContext();
+  assert(*context);
+  delete[] serialized_engine;
+}
+#endif
+
+
+void CommonObjectDetectorCUDAImpl::cudaDetect(
+  CommonObjectDetectorBase* base_,
+  cv::Mat img_,
+  std::vector<float>& boxes_x_,
+  std::vector<float>& boxes_y_,
+  std::vector<float>& boxes_w_,
+  std::vector<float>& boxes_h_,
+  std::vector<int>& boxes_label_,
+  std::vector<float>& boxes_score_,
+  std::vector<cv::Mat>& boxes_seg_
+)
+{
+#ifdef WITH_CUDA
+  int input_h = base_->getInputH();
+  int input_w = base_->getInputW();
+  bool with_segmentation = base_->withSegmentation();
+  double thrs_conf = base_->getThrsConf();
+  double thrs_nms = base_->getThrsNms();
+
+  std::vector<cv::Mat> img_batch;
+  img_batch.push_back(img_);
+  // Preprocess
+  cuda_batch_preprocess(img_batch, this->_gpu_buffers[0], input_w, input_h, this->_stream);
+
+  // Run inference
+  if (with_segmentation)
+  {
+    infer_seg(*this->_context, this->_stream, (void**)this->_gpu_buffers, this->_cpu_output_buffer1, this->_cpu_output_buffer2, kBatchSize);
+  }
+  else
+  {
+    infer(*this->_context, this->_stream, (void**)this->_gpu_buffers, this->_cpu_output_buffer, kBatchSize);
+  }
+
+  // NMS
+  std::vector<std::vector<Detection>> res_batch;
+  if (with_segmentation)
+  {
+    batch_nms(res_batch, this->_cpu_output_buffer1, img_batch.size(), kOutputSize1, thrs_conf, thrs_nms);
+  }
+  else
+  {
+    batch_nms(res_batch, this->_cpu_output_buffer, img_batch.size(), kOutputSize, thrs_conf, thrs_nms);
+  }
+
+  std::vector<Detection> res = res_batch[0];
+  std::vector<cv::Mat> masks;
+  if (with_segmentation)
+  {
+    masks = process_mask(&(this->_cpu_output_buffer2[0]), kOutputSize2, res, input_h, input_w);
+  }
+
+
+
+  for (size_t j = 0; j < res.size(); j++) {
+    cv::Rect r = get_rect(img_, res[j].bbox, input_h, input_w);
+    if (r.x < 0) r.x = 0;
+    if (r.y < 0) r.y = 0;
+    if (r.x + r.width >= img_.cols)  r.width = img_.cols - r.x - 1;
+    if (r.y + r.height >= img_.rows) r.height = img_.rows - r.y - 1;
+    if (r.width > 5 && r.height > 5)
+    {
+      // cv::rectangle(img_show, r, cv::Scalar(0, 0, 255), 2);
+      // cv::putText(img_show, vehiclenames[(int)res[j].class_id], cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 2.2, cv::Scalar(0, 0, 255), 2);
+      boxes_x_.push_back(r.x);
+      boxes_y_.push_back(r.y);
+      boxes_w_.push_back(r.width);
+      boxes_h_.push_back(r.height);
+
+      boxes_label_.push_back((int)res[j].class_id);
+      boxes_score_.push_back(res[j].conf);
+
+      if (with_segmentation)
+      {
+        cv::Mat mask_j = masks[j].clone();
+        boxes_seg_.push_back(mask_j);
+      }
+    }
+  }
+  
+#endif
+}
+
+bool CommonObjectDetectorCUDAImpl::cudaSetup(CommonObjectDetectorBase* base_)
+{
+#ifdef WITH_CUDA
+  std::string dataset = base_->getDataset();
+  int input_h = base_->getInputH();
+  int input_w = base_->getInputW();
+  bool with_segmentation = base_->withSegmentation();
+  double thrs_conf = base_->getThrsConf();
+  double thrs_nms = base_->getThrsNms();
+
+  std::string engine_fn = get_home() + SV_MODEL_DIR + dataset + ".engine";
+  if (input_w == 1280)
+  {
+    engine_fn = get_home() + SV_MODEL_DIR + dataset + "_HD.engine";
+  }
+  if (with_segmentation)
+  {
+    base_->setInputH(640);
+    base_->setInputW(640);
+    engine_fn = get_home() + SV_MODEL_DIR + dataset + "_SEG.engine";
+  }
+  std::cout << "Load: " << engine_fn << std::endl;
+  if (!is_file_exist(engine_fn))
+  {
+    throw std::runtime_error("SpireCV (104) Error loading the CommonObject TensorRT model (File Not Exist)");
+  }
+
+  deserialize_engine(engine_fn, &this->_runtime, &this->_engine, &this->_context);
+  CUDA_CHECK(cudaStreamCreate(&this->_stream));
+
+  // Init CUDA preprocessing
+  cuda_preprocess_init(kMaxInputImageSize);
+
+  if (with_segmentation)
+  {
+    // Prepare cpu and gpu buffers
+    this->_prepare_buffers_seg(input_h, input_w);
+  }
+  else
+  {
+    // Prepare cpu and gpu buffers
+    this->_prepare_buffers(input_h, input_w);
+  }
+  return true;
+#endif
+  return false;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+}
+
--- a/algorithm/common_det/cuda/common_det_cuda_impl.h
+++ b/algorithm/common_det/cuda/common_det_cuda_impl.h
@@ -0,0 +1,58 @@
+#ifndef __SV_COMMON_DET_CUDA__
+#define __SV_COMMON_DET_CUDA__
+
+#include "sv_core.h"
+#include <opencv2/opencv.hpp>
+#include <opencv2/aruco.hpp>
+#include <opencv2/tracking.hpp>
+#include <string>
+#include <chrono>
+
+
+
+#ifdef WITH_CUDA
+#include <NvInfer.h>
+#include <cuda_runtime_api.h>
+#endif
+
+
+
+namespace sv {
+
+
+class CommonObjectDetectorCUDAImpl
+{
+public:
+  CommonObjectDetectorCUDAImpl();
+  ~CommonObjectDetectorCUDAImpl();
+
+  bool cudaSetup(CommonObjectDetectorBase* base_);
+  void cudaDetect(
+    CommonObjectDetectorBase* base_,
+    cv::Mat img_,
+    std::vector<float>& boxes_x_,
+    std::vector<float>& boxes_y_,
+    std::vector<float>& boxes_w_,
+    std::vector<float>& boxes_h_,
+    std::vector<int>& boxes_label_,
+    std::vector<float>& boxes_score_,
+    std::vector<cv::Mat>& boxes_seg_
+  );
+
+#ifdef WITH_CUDA
+  void _prepare_buffers_seg(int input_h, int input_w);
+  void _prepare_buffers(int input_h, int input_w);
+  nvinfer1::IExecutionContext* _context;
+  nvinfer1::IRuntime* _runtime;
+  nvinfer1::ICudaEngine* _engine;
+  cudaStream_t _stream;
+  float* _gpu_buffers[3];
+  float* _cpu_output_buffer;
+  float* _cpu_output_buffer1;
+  float* _cpu_output_buffer2;
+#endif
+};
+
+
+}
+#endif
--- a/algorithm/common_det/cuda/yolov7/calibrator.cpp
+++ b/algorithm/common_det/cuda/yolov7/calibrator.cpp
@@ -0,0 +1,97 @@
+#include "calibrator.h"
+#include "cuda_utils.h"
+#include "utils.h"
+
+#include <iostream>
+#include <iterator>
+#include <fstream>
+#include <opencv2/opencv.hpp>
+#include <opencv2/dnn/dnn.hpp>
+
+static cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
+  int w, h, x, y;
+  float r_w = input_w / (img.cols * 1.0);
+  float r_h = input_h / (img.rows * 1.0);
+  if (r_h > r_w) {
+    w = input_w;
+    h = r_w * img.rows;
+    x = 0;
+    y = (input_h - h) / 2;
+  } else {
+    w = r_h * img.cols;
+    h = input_h;
+    x = (input_w - w) / 2;
+    y = 0;
+  }
+  cv::Mat re(h, w, CV_8UC3);
+  cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
+  cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
+  re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
+  return out;
+}
+
+Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
+    : batchsize_(batchsize),
+      input_w_(input_w),
+      input_h_(input_h),
+      img_idx_(0),
+      img_dir_(img_dir),
+      calib_table_name_(calib_table_name),
+      input_blob_name_(input_blob_name),
+      read_cache_(read_cache) {
+  input_count_ = 3 * input_w * input_h * batchsize;
+  CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
+  read_files_in_dir(img_dir, img_files_);
+}
+
+Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
+  CUDA_CHECK(cudaFree(device_input_));
+}
+
+int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
+  return batchsize_;
+}
+
+bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
+  if (img_idx_ + batchsize_ > (int)img_files_.size()) {
+    return false;
+  }
+
+  std::vector<cv::Mat> input_imgs_;
+  for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
+    std::cout << img_files_[i] << "  " << i << std::endl;
+    cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
+    if (temp.empty()) {
+      std::cerr << "Fatal error: image cannot open!" << std::endl;
+      return false;
+    }
+    cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
+    input_imgs_.push_back(pr_img);
+  }
+  img_idx_ += batchsize_;
+  cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);
+
+  CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
+  assert(!strcmp(names[0], input_blob_name_));
+  bindings[0] = device_input_;
+  return true;
+}
+
+const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
+  std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
+  calib_cache_.clear();
+  std::ifstream input(calib_table_name_, std::ios::binary);
+  input >> std::noskipws;
+  if (read_cache_ && input.good()) {
+    std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_));
+  }
+  length = calib_cache_.size();
+  return length ? calib_cache_.data() : nullptr;
+}
+
+void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
+  std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
+  std::ofstream output(calib_table_name_, std::ios::binary);
+  output.write(reinterpret_cast<const char*>(cache), length);
+}
+
--- a/algorithm/common_det/cuda/yolov7/calibrator.h
+++ b/algorithm/common_det/cuda/yolov7/calibrator.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "macros.h"
+#include <string>
+#include <vector>
+
+//! \class Int8EntropyCalibrator2
+//!
+//! \brief Implements Entropy calibrator 2.
+//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
+//!
+class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
+ public:
+  Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
+
+  virtual ~Int8EntropyCalibrator2();
+  int getBatchSize() const TRT_NOEXCEPT override;
+  bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
+  const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
+  void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
+
+ private:
+  int batchsize_;
+  int input_w_;
+  int input_h_;
+  int img_idx_;
+  std::string img_dir_;
+  std::vector<std::string> img_files_;
+  size_t input_count_;
+  std::string calib_table_name_;
+  const char* input_blob_name_;
+  bool read_cache_;
+  void* device_input_;
+  std::vector<char> calib_cache_;
+};
+
--- a/algorithm/common_det/cuda/yolov7/config.h
+++ b/algorithm/common_det/cuda/yolov7/config.h
@@ -0,0 +1,55 @@
+#pragma once
+
+/* --------------------------------------------------------
+ * These configs are related to tensorrt model, if these are changed,
+ * please re-compile and re-serialize the tensorrt model.
+ * --------------------------------------------------------*/
+
+// For INT8, you need prepare the calibration dataset, please refer to
+// https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5#int8-quantization
+#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
+
+// These are used to define input/output tensor names,
+// you can set them to whatever you want.
+const static char* kInputTensorName = "data";
+const static char* kOutputTensorName = "prob";
+
+// Detection model and Segmentation model' number of classes
+// constexpr static int kNumClass = 80;
+
+// Classfication model's number of classes
+constexpr static int kClsNumClass = 1000;
+
+constexpr static int kBatchSize = 1;
+
+// Yolo's input width and height must by divisible by 32
+// constexpr static int kInputH = 640;
+// constexpr static int kInputW = 640;
+
+// Classfication model's input shape
+constexpr static int kClsInputH = 224;
+constexpr static int kClsInputW = 224;
+
+// Maximum number of output bounding boxes from yololayer plugin.
+// That is maximum number of output bounding boxes before NMS.
+constexpr static int kMaxNumOutputBbox = 1000;
+
+constexpr static int kNumAnchor = 3;
+
+// The bboxes whose confidence is lower than kIgnoreThresh will be ignored in yololayer plugin.
+constexpr static float kIgnoreThresh = 0.1f;
+
+/* --------------------------------------------------------
+ * These configs are NOT related to tensorrt model, if these are changed,
+ * please re-compile, but no need to re-serialize the tensorrt model.
+ * --------------------------------------------------------*/
+
+// NMS overlapping thresh and final detection confidence thresh
+const static float kNmsThresh = 0.45f;
+const static float kConfThresh = 0.5f;
+
+const static int kGpuId = 0;
+
+// If your image size is larger than 4096 * 3112, please increase this value
+const static int kMaxInputImageSize = 4096 * 3112;
+
--- a/algorithm/common_det/cuda/yolov7/cuda_utils.h
+++ b/algorithm/common_det/cuda/yolov7/cuda_utils.h
@@ -0,0 +1,18 @@
+#ifndef TRTX_CUDA_UTILS_H_
+#define TRTX_CUDA_UTILS_H_
+
+#include <cuda_runtime_api.h>
+
+#ifndef CUDA_CHECK
+#define CUDA_CHECK(callstr)\
+    {\
+        cudaError_t error_code = callstr;\
+        if (error_code != cudaSuccess) {\
+            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
+            assert(0);\
+        }\
+    }
+#endif  // CUDA_CHECK
+
+#endif  // TRTX_CUDA_UTILS_H_
+
--- a/algorithm/common_det/cuda/yolov7/logging.h
+++ b/algorithm/common_det/cuda/yolov7/logging.h
@@ -0,0 +1,504 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_LOGGING_H
+#define TENSORRT_LOGGING_H
+
+#include "NvInferRuntimeCommon.h"
+#include <cassert>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include "macros.h"
+
+using Severity = nvinfer1::ILogger::Severity;
+
+class LogStreamConsumerBuffer : public std::stringbuf
+{
+public:
+    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
+        : mOutput(stream)
+        , mPrefix(prefix)
+        , mShouldLog(shouldLog)
+    {
+    }
+
+    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
+        : mOutput(other.mOutput)
+    {
+    }
+
+    ~LogStreamConsumerBuffer()
+    {
+        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
+        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
+        // if the pointer to the beginning is not equal to the pointer to the current position,
+        // call putOutput() to log the output to the stream
+        if (pbase() != pptr())
+        {
+            putOutput();
+        }
+    }
+
+    // synchronizes the stream buffer and returns 0 on success
+    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
+    // resetting the buffer and flushing the stream
+    virtual int sync()
+    {
+        putOutput();
+        return 0;
+    }
+
+    void putOutput()
+    {
+        if (mShouldLog)
+        {
+            // prepend timestamp
+            std::time_t timestamp = std::time(nullptr);
+            tm* tm_local = std::localtime(&timestamp);
+            std::cout << "[";
+            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
+            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
+            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
+            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
+            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
+            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
+            // std::stringbuf::str() gets the string contents of the buffer
+            // insert the buffer contents pre-appended by the appropriate prefix into the stream
+            mOutput << mPrefix << str();
+            // set the buffer to empty
+            str("");
+            // flush the stream
+            mOutput.flush();
+        }
+    }
+
+    void setShouldLog(bool shouldLog)
+    {
+        mShouldLog = shouldLog;
+    }
+
+private:
+    std::ostream& mOutput;
+    std::string mPrefix;
+    bool mShouldLog;
+};
+
+//!
+//! \class LogStreamConsumerBase
+//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
+//!
+class LogStreamConsumerBase
+{
+public:
+    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
+        : mBuffer(stream, prefix, shouldLog)
+    {
+    }
+
+protected:
+    LogStreamConsumerBuffer mBuffer;
+};
+
+//!
+//! \class LogStreamConsumer
+//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
+//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
+//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
+//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
+//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
+//!  Please do not change the order of the parent classes.
+//!
+class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
+{
+public:
+    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
+    //!  Reportable severity determines if the messages are severe enough to be logged.
+    LogStreamConsumer(Severity reportableSeverity, Severity severity)
+        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
+        , std::ostream(&mBuffer) // links the stream buffer with the stream
+        , mShouldLog(severity <= reportableSeverity)
+        , mSeverity(severity)
+    {
+    }
+
+    LogStreamConsumer(LogStreamConsumer&& other)
+        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
+        , std::ostream(&mBuffer) // links the stream buffer with the stream
+        , mShouldLog(other.mShouldLog)
+        , mSeverity(other.mSeverity)
+    {
+    }
+
+    void setReportableSeverity(Severity reportableSeverity)
+    {
+        mShouldLog = mSeverity <= reportableSeverity;
+        mBuffer.setShouldLog(mShouldLog);
+    }
+
+private:
+    static std::ostream& severityOstream(Severity severity)
+    {
+        return severity >= Severity::kINFO ? std::cout : std::cerr;
+    }
+
+    static std::string severityPrefix(Severity severity)
+    {
+        switch (severity)
+        {
+        case Severity::kINTERNAL_ERROR: return "[F] ";
+        case Severity::kERROR: return "[E] ";
+        case Severity::kWARNING: return "[W] ";
+        case Severity::kINFO: return "[I] ";
+        case Severity::kVERBOSE: return "[V] ";
+        default: assert(0); return "";
+        }
+    }
+
+    bool mShouldLog;
+    Severity mSeverity;
+};
+
+//! \class Logger
+//!
+//! \brief Class which manages logging of TensorRT tools and samples
+//!
+//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
+//! and supports logging two types of messages:
+//!
+//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
+//! - Test pass/fail messages
+//!
+//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
+//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
+//!
+//! In the future, this class could be extended to support dumping test results to a file in some standard format
+//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
+//!
+//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
+//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
+//! library and messages coming from the sample.
+//!
+//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
+//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
+//! object.
+
+class Logger : public nvinfer1::ILogger
+{
+public:
+    Logger(Severity severity = Severity::kWARNING)
+        : mReportableSeverity(severity)
+    {
+    }
+
+    //!
+    //! \enum TestResult
+    //! \brief Represents the state of a given test
+    //!
+    enum class TestResult
+    {
+        kRUNNING, //!< The test is running
+        kPASSED,  //!< The test passed
+        kFAILED,  //!< The test failed
+        kWAIVED   //!< The test was waived
+    };
+
+    //!
+    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
+    //! \return The nvinfer1::ILogger associated with this Logger
+    //!
+    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
+    //! we can eliminate the inheritance of Logger from ILogger
+    //!
+    nvinfer1::ILogger& getTRTLogger()
+    {
+        return *this;
+    }
+
+    //!
+    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
+    //!
+    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
+    //! inheritance from nvinfer1::ILogger
+    //!
+    void log(Severity severity, const char* msg) TRT_NOEXCEPT override 
+    {
+        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
+    }
+
+    //!
+    //! \brief Method for controlling the verbosity of logging output
+    //!
+    //! \param severity The logger will only emit messages that have severity of this level or higher.
+    //!
+    void setReportableSeverity(Severity severity)
+    {
+        mReportableSeverity = severity;
+    }
+
+    //!
+    //! \brief Opaque handle that holds logging information for a particular test
+    //!
+    //! This object is an opaque handle to information used by the Logger to print test results.
+    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
+    //! with Logger::reportTest{Start,End}().
+    //!
+    class TestAtom
+    {
+    public:
+        TestAtom(TestAtom&&) = default;
+
+    private:
+        friend class Logger;
+
+        TestAtom(bool started, const std::string& name, const std::string& cmdline)
+            : mStarted(started)
+            , mName(name)
+            , mCmdline(cmdline)
+        {
+        }
+
+        bool mStarted;
+        std::string mName;
+        std::string mCmdline;
+    };
+
+    //!
+    //! \brief Define a test for logging
+    //!
+    //! \param[in] name The name of the test.  This should be a string starting with
+    //!                  "TensorRT" and containing dot-separated strings containing
+    //!                  the characters [A-Za-z0-9_].
+    //!                  For example, "TensorRT.sample_googlenet"
+    //! \param[in] cmdline The command line used to reproduce the test
+    //
+    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
+    //!
+    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
+    {
+        return TestAtom(false, name, cmdline);
+    }
+
+    //!
+    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
+    //!        as input
+    //!
+    //! \param[in] name The name of the test
+    //! \param[in] argc The number of command-line arguments
+    //! \param[in] argv The array of command-line arguments (given as C strings)
+    //!
+    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
+    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
+    {
+        auto cmdline = genCmdlineString(argc, argv);
+        return defineTest(name, cmdline);
+    }
+
+    //!
+    //! \brief Report that a test has started.
+    //!
+    //! \pre reportTestStart() has not been called yet for the given testAtom
+    //!
+    //! \param[in] testAtom The handle to the test that has started
+    //!
+    static void reportTestStart(TestAtom& testAtom)
+    {
+        reportTestResult(testAtom, TestResult::kRUNNING);
+        assert(!testAtom.mStarted);
+        testAtom.mStarted = true;
+    }
+
+    //!
+    //! \brief Report that a test has ended.
+    //!
+    //! \pre reportTestStart() has been called for the given testAtom
+    //!
+    //! \param[in] testAtom The handle to the test that has ended
+    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
+    //!                   TestResult::kFAILED, TestResult::kWAIVED
+    //!
+    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
+    {
+        assert(result != TestResult::kRUNNING);
+        assert(testAtom.mStarted);
+        reportTestResult(testAtom, result);
+    }
+
+    static int reportPass(const TestAtom& testAtom)
+    {
+        reportTestEnd(testAtom, TestResult::kPASSED);
+        return EXIT_SUCCESS;
+    }
+
+    static int reportFail(const TestAtom& testAtom)
+    {
+        reportTestEnd(testAtom, TestResult::kFAILED);
+        return EXIT_FAILURE;
+    }
+
+    static int reportWaive(const TestAtom& testAtom)
+    {
+        reportTestEnd(testAtom, TestResult::kWAIVED);
+        return EXIT_SUCCESS;
+    }
+
+    static int reportTest(const TestAtom& testAtom, bool pass)
+    {
+        return pass ? reportPass(testAtom) : reportFail(testAtom);
+    }
+
+    Severity getReportableSeverity() const
+    {
+        return mReportableSeverity;
+    }
+
+private:
+    //!
+    //! \brief returns an appropriate string for prefixing a log message with the given severity
+    //!
+    static const char* severityPrefix(Severity severity)
+    {
+        switch (severity)
+        {
+        case Severity::kINTERNAL_ERROR: return "[F] ";
+        case Severity::kERROR: return "[E] ";
+        case Severity::kWARNING: return "[W] ";
+        case Severity::kINFO: return "[I] ";
+        case Severity::kVERBOSE: return "[V] ";
+        default: assert(0); return "";
+        }
+    }
+
+    //!
+    //! \brief returns an appropriate string for prefixing a test result message with the given result
+    //!
+    static const char* testResultString(TestResult result)
+    {
+        switch (result)
+        {
+        case TestResult::kRUNNING: return "RUNNING";
+        case TestResult::kPASSED: return "PASSED";
+        case TestResult::kFAILED: return "FAILED";
+        case TestResult::kWAIVED: return "WAIVED";
+        default: assert(0); return "";
+        }
+    }
+
+    //!
+    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
+    //!
+    static std::ostream& severityOstream(Severity severity)
+    {
+        return severity >= Severity::kINFO ? std::cout : std::cerr;
+    }
+
+    //!
+    //! \brief method that implements logging test results
+    //!
+    static void reportTestResult(const TestAtom& testAtom, TestResult result)
+    {
+        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
+                                         << testAtom.mCmdline << std::endl;
+    }
+
+    //!
+    //! \brief generate a command line string from the given (argc, argv) values
+    //!
+    static std::string genCmdlineString(int argc, char const* const* argv)
+    {
+        std::stringstream ss;
+        for (int i = 0; i < argc; i++)
+        {
+            if (i > 0)
+                ss << " ";
+            ss << argv[i];
+        }
+        return ss.str();
+    }
+
+    Severity mReportableSeverity;
+};
+
+namespace
+{
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
+//!
+//! Example usage:
+//!
+//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
+{
+    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
+//!
+//! Example usage:
+//!
+//!     LOG_INFO(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_INFO(const Logger& logger)
+{
+    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
+//!
+//! Example usage:
+//!
+//!     LOG_WARN(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_WARN(const Logger& logger)
+{
+    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
+//!
+//! Example usage:
+//!
+//!     LOG_ERROR(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_ERROR(const Logger& logger)
+{
+    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
+//         ("fatal" severity)
+//!
+//! Example usage:
+//!
+//!     LOG_FATAL(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_FATAL(const Logger& logger)
+{
+    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
+}
+
+} // anonymous namespace
+
+#endif // TENSORRT_LOGGING_H
--- a/algorithm/common_det/cuda/yolov7/macros.h
+++ b/algorithm/common_det/cuda/yolov7/macros.h
@@ -0,0 +1,29 @@
+#ifndef __MACROS_H
+#define __MACROS_H
+
+#include <NvInfer.h>
+
+#ifdef API_EXPORTS
+#if defined(_MSC_VER)
+#define API __declspec(dllexport)
+#else
+#define API __attribute__((visibility("default")))
+#endif
+#else
+
+#if defined(_MSC_VER)
+#define API __declspec(dllimport)
+#else
+#define API
+#endif
+#endif  // API_EXPORTS
+
+#if NV_TENSORRT_MAJOR >= 8
+#define TRT_NOEXCEPT noexcept
+#define TRT_CONST_ENQUEUE const
+#else
+#define TRT_NOEXCEPT
+#define TRT_CONST_ENQUEUE
+#endif
+
+#endif  // __MACROS_H
--- a/algorithm/common_det/cuda/yolov7/model.cpp
+++ b/algorithm/common_det/cuda/yolov7/model.cpp
@@ -0,0 +1,628 @@
+#include "model.h"
+#include "calibrator.h"
+#include "config.h"
+#include "yololayer.h"
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <cassert>
+#include <cmath>
+#include <cstring>
+
+using namespace nvinfer1;
+
+// TensorRT weight files have a simple space delimited format:
+// [type] [size] <data x size in hex>
+static std::map<std::string, Weights> loadWeights(const std::string file) {
+  std::cout << "Loading weights: " << file << std::endl;
+  std::map<std::string, Weights> weightMap;
+
+  // Open weights file
+  std::ifstream input(file);
+  assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");
+
+  // Read number of weight blobs
+  int32_t count;
+  input >> count;
+  assert(count > 0 && "Invalid weight map file.");
+
+  while (count--) {
+    Weights wt{ DataType::kFLOAT, nullptr, 0 };
+    uint32_t size;
+
+    // Read name and type of blob
+    std::string name;
+    input >> name >> std::dec >> size;
+    wt.type = DataType::kFLOAT;
+
+    // Load blob
+    uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
+    for (uint32_t x = 0, y = size; x < y; ++x) {
+      input >> std::hex >> val[x];
+    }
+    wt.values = val;
+
+    wt.count = size;
+    weightMap[name] = wt;
+  }
+
+  return weightMap;
+}
+
+static int get_width(int x, float gw, int divisor = 8) {
+  return int(ceil((x * gw) / divisor)) * divisor;
+}
+
+static int get_depth(int x, float gd) {
+  if (x == 1) return 1;
+  int r = round(x * gd);
+  if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) {
+    --r;
+  }
+  return std::max<int>(r, 1);
+}
+
+static IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
+  float* gamma = (float*)weightMap[lname + ".weight"].values;
+  float* beta = (float*)weightMap[lname + ".bias"].values;
+  float* mean = (float*)weightMap[lname + ".running_mean"].values;
+  float* var = (float*)weightMap[lname + ".running_var"].values;
+  int len = weightMap[lname + ".running_var"].count;
+
+  float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+  for (int i = 0; i < len; i++) {
+    scval[i] = gamma[i] / sqrt(var[i] + eps);
+  }
+  Weights scale{ DataType::kFLOAT, scval, len };
+
+  float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+  for (int i = 0; i < len; i++) {
+    shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
+  }
+  Weights shift{ DataType::kFLOAT, shval, len };
+
+  float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+  for (int i = 0; i < len; i++) {
+    pval[i] = 1.0;
+  }
+  Weights power{ DataType::kFLOAT, pval, len };
+
+  weightMap[lname + ".scale"] = scale;
+  weightMap[lname + ".shift"] = shift;
+  weightMap[lname + ".power"] = power;
+  IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
+  assert(scale_1);
+  return scale_1;
+}
+
+static ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
+  Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
+  int p = ksize / 3;
+  IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts);
+  assert(conv1);
+  conv1->setStrideNd(DimsHW{ s, s });
+  conv1->setPaddingNd(DimsHW{ p, p });
+  conv1->setNbGroups(g);
+  conv1->setName((lname + ".conv").c_str());
+  IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3);
+
+  // silu = x * sigmoid
+  auto sig = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID);
+  assert(sig);
+  auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD);
+  assert(ew);
+  return ew;
+}
+
+static ILayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname, int input_h, int input_w) {
+  ISliceLayer* s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, input_h / 2, input_w / 2 }, Dims3{ 1, 2, 2 });
+  ISliceLayer* s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, input_h / 2, input_w / 2 }, Dims3{ 1, 2, 2 });
+  ISliceLayer* s3 = network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, input_h / 2, input_w / 2 }, Dims3{ 1, 2, 2 });
+  ISliceLayer* s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, input_h / 2, input_w / 2 }, Dims3{ 1, 2, 2 });
+  ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) };
+  auto cat = network->addConcatenation(inputTensors, 4);
+  auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
+  return conv;
+}
+
+static ILayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
+  auto cv1 = convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1");
+  auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2");
+  if (shortcut && c1 == c2) {
+    auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM);
+    return ew;
+  }
+  return cv2;
+}
+
+static ILayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
+  Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
+  int c_ = (int)((float)c2 * e);
+  auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
+  auto cv2 = network->addConvolutionNd(input, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], emptywts);
+  ITensor* y1 = cv1->getOutput(0);
+  for (int i = 0; i < n; i++) {
+    auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
+    y1 = b->getOutput(0);
+  }
+  auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], emptywts);
+
+  ITensor* inputTensors[] = { cv3->getOutput(0), cv2->getOutput(0) };
+  auto cat = network->addConcatenation(inputTensors, 2);
+
+  IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4);
+  auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
+  lr->setAlpha(0.1);
+
+  auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
+  return cv4;
+}
+
+static ILayer* C3(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
+  int c_ = (int)((float)c2 * e);
+  auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
+  auto cv2 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv2");
+  ITensor *y1 = cv1->getOutput(0);
+  for (int i = 0; i < n; i++) {
+    auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
+    y1 = b->getOutput(0);
+  }
+
+  ITensor* inputTensors[] = { y1, cv2->getOutput(0) };
+  auto cat = network->addConcatenation(inputTensors, 2);
+
+  auto cv3 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
+  return cv3;
+}
+
+static ILayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) {
+  int c_ = c1 / 2;
+  auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
+
+  auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k1, k1 });
+  pool1->setPaddingNd(DimsHW{ k1 / 2, k1 / 2 });
+  pool1->setStrideNd(DimsHW{ 1, 1 });
+  auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k2, k2 });
+  pool2->setPaddingNd(DimsHW{ k2 / 2, k2 / 2 });
+  pool2->setStrideNd(DimsHW{ 1, 1 });
+  auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k3, k3 });
+  pool3->setPaddingNd(DimsHW{ k3 / 2, k3 / 2 });
+  pool3->setStrideNd(DimsHW{ 1, 1 });
+
+  ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
+  auto cat = network->addConcatenation(inputTensors, 4);
+
+  auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
+  return cv2;
+}
+
+static ILayer* SPPF(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k, std::string lname) {
+  int c_ = c1 / 2;
+  auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
+
+  auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k, k });
+  pool1->setPaddingNd(DimsHW{ k / 2, k / 2 });
+  pool1->setStrideNd(DimsHW{ 1, 1 });
+  auto pool2 = network->addPoolingNd(*pool1->getOutput(0), PoolingType::kMAX, DimsHW{ k, k });
+  pool2->setPaddingNd(DimsHW{ k / 2, k / 2 });
+  pool2->setStrideNd(DimsHW{ 1, 1 });
+  auto pool3 = network->addPoolingNd(*pool2->getOutput(0), PoolingType::kMAX, DimsHW{ k, k });
+  pool3->setPaddingNd(DimsHW{ k / 2, k / 2 });
+  pool3->setStrideNd(DimsHW{ 1, 1 });
+  ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
+  auto cat = network->addConcatenation(inputTensors, 4);
+  auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
+  return cv2;
+}
+
+static ILayer* Proto(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c_, int c2, std::string lname) {
+  auto cv1 = convBlock(network, weightMap, input, c_, 3, 1, 1, lname + ".cv1");
+
+  auto upsample = network->addResize(*cv1->getOutput(0));
+  assert(upsample);
+  upsample->setResizeMode(ResizeMode::kNEAREST);
+  const float scales[] = {1, 2, 2};
+  upsample->setScales(scales, 3);
+
+  auto cv2 = convBlock(network, weightMap, *upsample->getOutput(0), c_, 3, 1, 1, lname + ".cv2");
+  auto cv3 = convBlock(network, weightMap, *cv2->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
+  assert(cv3);
+  return cv3;
+}
+
+static std::vector<std::vector<float>> getAnchors(std::map<std::string, Weights>& weightMap, std::string lname) {
+  std::vector<std::vector<float>> anchors;
+  Weights wts = weightMap[lname + ".anchor_grid"];
+  int anchor_len = kNumAnchor * 2;
+  for (int i = 0; i < wts.count / anchor_len; i++) {
+    auto *p = (const float*)wts.values + i * anchor_len;
+    std::vector<float> anchor(p, p + anchor_len);
+    anchors.push_back(anchor);
+  }
+  return anchors;
+}
+
+static IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, std::string lname, std::vector<IConvolutionLayer*> dets, int input_h, int input_w, int n_classes, bool is_segmentation = false) {
+  auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
+  auto anchors = getAnchors(weightMap, lname);
+  PluginField plugin_fields[2];
+  int netinfo[5] = {n_classes, input_w, input_h, kMaxNumOutputBbox, (int)is_segmentation};
+  plugin_fields[0].data = netinfo;
+  plugin_fields[0].length = 5;
+  plugin_fields[0].name = "netinfo";
+  plugin_fields[0].type = PluginFieldType::kFLOAT32;
+
+  //load strides from Detect layer
+  assert(weightMap.find(lname + ".strides") != weightMap.end() && "Not found `strides`, please check gen_wts.py!!!");
+  Weights strides = weightMap[lname + ".strides"];
+  auto *p = (const float*)(strides.values);
+  std::vector<int> scales(p, p + strides.count);
+
+  std::vector<YoloKernel> kernels;
+  for (size_t i = 0; i < anchors.size(); i++) {
+    YoloKernel kernel;
+    kernel.width = input_w / scales[i];
+    kernel.height = input_h / scales[i];
+    memcpy(kernel.anchors, &anchors[i][0], anchors[i].size() * sizeof(float));
+    kernels.push_back(kernel);
+  }
+  plugin_fields[1].data = &kernels[0];
+  plugin_fields[1].length = kernels.size();
+  plugin_fields[1].name = "kernels";
+  plugin_fields[1].type = PluginFieldType::kFLOAT32;
+  PluginFieldCollection plugin_data;
+  plugin_data.nbFields = 2;
+  plugin_data.fields = plugin_fields;
+  IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data);
+  std::vector<ITensor*> input_tensors;
+  for (auto det: dets) {
+    input_tensors.push_back(det->getOutput(0));
+  }
+  auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj);
+  return yolo;
+}
+
+ICudaEngine* build_det_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes) {
+  INetworkDefinition* network = builder->createNetworkV2(0U);
+
+  // Create input tensor of shape {3, input_h, input_w}
+  ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, input_h, input_w });
+  assert(data);
+  std::map<std::string, Weights> weightMap = loadWeights(wts_name);
+
+  // Backbone
+  auto conv0 = convBlock(network, weightMap, *data,  get_width(64, gw), 6, 2, 1,  "model.0");
+  assert(conv0);
+  auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
+  auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
+  auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
+  auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
+  auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
+  auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
+  auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
+  auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
+  auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9");
+
+  // Head
+  auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");
+
+  auto upsample11 = network->addResize(*conv10->getOutput(0));
+  assert(upsample11);
+  upsample11->setResizeMode(ResizeMode::kNEAREST);
+  upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());
+
+  ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
+  auto cat12 = network->addConcatenation(inputTensors12, 2);
+  auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
+  auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");
+
+  auto upsample15 = network->addResize(*conv14->getOutput(0));
+  assert(upsample15);
+  upsample15->setResizeMode(ResizeMode::kNEAREST);
+  upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());
+
+  ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
+  auto cat16 = network->addConcatenation(inputTensors16, 2);
+
+  auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");
+
+  // Detect
+  IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
+  auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
+  ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
+  auto cat19 = network->addConcatenation(inputTensors19, 2);
+  auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
+  IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
+  auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
+  ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
+  auto cat22 = network->addConcatenation(inputTensors22, 2);
+  auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
+  IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
+
+  auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2}, input_h, input_w, n_classes);
+  yolo->getOutput(0)->setName(kOutputTensorName);
+  network->markOutput(*yolo->getOutput(0));
+
+  // Engine config
+  builder->setMaxBatchSize(maxBatchSize);
+  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
+#if defined(USE_FP16)
+  config->setFlag(BuilderFlag::kFP16);
+#elif defined(USE_INT8)
+  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
+  assert(builder->platformHasFastInt8());
+  config->setFlag(BuilderFlag::kINT8);
+  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, input_w, input_h, "./coco_calib/", "int8calib.table", kInputTensorName);
+  config->setInt8Calibrator(calibrator);
+#endif
+
+  std::cout << "Building engine, please wait for a while..." << std::endl;
+  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
+  std::cout << "Build engine successfully!" << std::endl;
+
+  // Don't need the network any more
+  network->destroy();
+
+  // Release host memory
+  for (auto& mem : weightMap) {
+    free((void*)(mem.second.values));
+  }
+
+  return engine;
+}
+
+ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes) {
+  INetworkDefinition* network = builder->createNetworkV2(0U);
+
+  // Create input tensor of shape {3, input_h, input_w}
+  ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, input_h, input_w });
+  assert(data);
+
+  std::map<std::string, Weights> weightMap = loadWeights(wts_name);
+
+  // Backbone
+  auto conv0 = convBlock(network, weightMap, *data,  get_width(64, gw), 6, 2, 1,  "model.0");
+  auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
+  auto c3_2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
+  auto conv3 = convBlock(network, weightMap, *c3_2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
+  auto c3_4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
+  auto conv5 = convBlock(network, weightMap, *c3_4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
+  auto c3_6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
+  auto conv7 = convBlock(network, weightMap, *c3_6->getOutput(0), get_width(768, gw), 3, 2, 1, "model.7");
+  auto c3_8 = C3(network, weightMap, *conv7->getOutput(0), get_width(768, gw), get_width(768, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
+  auto conv9 = convBlock(network, weightMap, *c3_8->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.9");
+  auto c3_10 = C3(network, weightMap, *conv9->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.10");
+  auto sppf11 = SPPF(network, weightMap, *c3_10->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.11");
+
+  // Head
+  auto conv12 = convBlock(network, weightMap, *sppf11->getOutput(0), get_width(768, gw), 1, 1, 1, "model.12");
+  auto upsample13 = network->addResize(*conv12->getOutput(0));
+  assert(upsample13);
+  upsample13->setResizeMode(ResizeMode::kNEAREST);
+  upsample13->setOutputDimensions(c3_8->getOutput(0)->getDimensions());
+  ITensor* inputTensors14[] = { upsample13->getOutput(0), c3_8->getOutput(0) };
+  auto cat14 = network->addConcatenation(inputTensors14, 2);
+  auto c3_15 = C3(network, weightMap, *cat14->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.15");
+
+  auto conv16 = convBlock(network, weightMap, *c3_15->getOutput(0), get_width(512, gw), 1, 1, 1, "model.16");
+  auto upsample17 = network->addResize(*conv16->getOutput(0));
+  assert(upsample17);
+  upsample17->setResizeMode(ResizeMode::kNEAREST);
+  upsample17->setOutputDimensions(c3_6->getOutput(0)->getDimensions());
+  ITensor* inputTensors18[] = { upsample17->getOutput(0), c3_6->getOutput(0) };
+  auto cat18 = network->addConcatenation(inputTensors18, 2);
+  auto c3_19 = C3(network, weightMap, *cat18->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.19");
+
+  auto conv20 = convBlock(network, weightMap, *c3_19->getOutput(0), get_width(256, gw), 1, 1, 1, "model.20");
+  auto upsample21 = network->addResize(*conv20->getOutput(0));
+  assert(upsample21);
+  upsample21->setResizeMode(ResizeMode::kNEAREST);
+  upsample21->setOutputDimensions(c3_4->getOutput(0)->getDimensions());
+  ITensor* inputTensors21[] = { upsample21->getOutput(0), c3_4->getOutput(0) };
+  auto cat22 = network->addConcatenation(inputTensors21, 2);
+  auto c3_23 = C3(network, weightMap, *cat22->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
+
+  auto conv24 = convBlock(network, weightMap, *c3_23->getOutput(0), get_width(256, gw), 3, 2, 1, "model.24");
+  ITensor* inputTensors25[] = { conv24->getOutput(0), conv20->getOutput(0) };
+  auto cat25 = network->addConcatenation(inputTensors25, 2);
+  auto c3_26 = C3(network, weightMap, *cat25->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.26");
+
+  auto conv27 = convBlock(network, weightMap, *c3_26->getOutput(0), get_width(512, gw), 3, 2, 1, "model.27");
+  ITensor* inputTensors28[] = { conv27->getOutput(0), conv16->getOutput(0) };
+  auto cat28 = network->addConcatenation(inputTensors28, 2);
+  auto c3_29 = C3(network, weightMap, *cat28->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.29");
+
+  auto conv30 = convBlock(network, weightMap, *c3_29->getOutput(0), get_width(768, gw), 3, 2, 1, "model.30");
+  ITensor* inputTensors31[] = { conv30->getOutput(0), conv12->getOutput(0) };
+  auto cat31 = network->addConcatenation(inputTensors31, 2);
+  auto c3_32 = C3(network, weightMap, *cat31->getOutput(0), get_width(2048, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.32");
+
+  // Detect
+  IConvolutionLayer* det0 = network->addConvolutionNd(*c3_23->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.0.weight"], weightMap["model.33.m.0.bias"]);
+  IConvolutionLayer* det1 = network->addConvolutionNd(*c3_26->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.1.weight"], weightMap["model.33.m.1.bias"]);
+  IConvolutionLayer* det2 = network->addConvolutionNd(*c3_29->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.2.weight"], weightMap["model.33.m.2.bias"]);
+  IConvolutionLayer* det3 = network->addConvolutionNd(*c3_32->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.3.weight"], weightMap["model.33.m.3.bias"]);
+
+  auto yolo = addYoLoLayer(network, weightMap, "model.33", std::vector<IConvolutionLayer*>{det0, det1, det2, det3}, input_h, input_w, n_classes);
+  yolo->getOutput(0)->setName(kOutputTensorName);
+  network->markOutput(*yolo->getOutput(0));
+
+  // Engine config
+  builder->setMaxBatchSize(maxBatchSize);
+  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
+#if defined(USE_FP16)
+  config->setFlag(BuilderFlag::kFP16);
+#elif defined(USE_INT8)
+  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
+  assert(builder->platformHasFastInt8());
+  config->setFlag(BuilderFlag::kINT8);
+  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, input_w, input_h, "./coco_calib/", "int8calib.table", kInputTensorName);
+  config->setInt8Calibrator(calibrator);
+#endif
+
+  std::cout << "Building engine, please wait for a while..." << std::endl;
+  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
+  std::cout << "Build engine successfully!" << std::endl;
+
+  // Don't need the network any more
+  network->destroy();
+
+  // Release host memory
+  for (auto& mem : weightMap) {
+    free((void*)(mem.second.values));
+  }
+
+  return engine;
+}
+
+ICudaEngine* build_cls_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
+  INetworkDefinition* network = builder->createNetworkV2(0U);
+
+  // Create input tensor
+  ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kClsInputH, kClsInputW });
+  assert(data);
+  std::map<std::string, Weights> weightMap = loadWeights(wts_name);
+
+  // Backbone
+  auto conv0 = convBlock(network, weightMap, *data,  get_width(64, gw), 6, 2, 1,  "model.0");
+  assert(conv0);
+  auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
+  auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
+  auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
+  auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
+  auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
+  auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
+  auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
+  auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
+
+  // Head
+  auto conv_class = convBlock(network, weightMap, *bottleneck_csp8->getOutput(0), 1280, 1, 1, 1, "model.9.conv");
+  IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7});
+  assert(pool2);
+  IFullyConnectedLayer* yolo = network->addFullyConnected(*pool2->getOutput(0), kClsNumClass, weightMap["model.9.linear.weight"], weightMap["model.9.linear.bias"]);
+  assert(yolo);
+
+  yolo->getOutput(0)->setName(kOutputTensorName);
+  network->markOutput(*yolo->getOutput(0));
+
+  // Engine config
+  builder->setMaxBatchSize(maxBatchSize);
+  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
+
+#if defined(USE_FP16)
+  config->setFlag(BuilderFlag::kFP16);
+#elif defined(USE_INT8)
+  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
+  assert(builder->platformHasFastInt8());
+  config->setFlag(BuilderFlag::kINT8);
+  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputW, "./coco_calib/", "int8calib.table", kInputTensorName);
+  config->setInt8Calibrator(calibrator);
+#endif
+
+  std::cout << "Building engine, please wait for a while..." << std::endl;
+  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
+  std::cout << "Build engine successfully!" << std::endl;
+
+  // Don't need the network any more
+  network->destroy();
+
+  // Release host memory
+  for (auto& mem : weightMap) {
+    free((void*)(mem.second.values));
+  }
+
+  return engine;
+}
+
+ICudaEngine* build_seg_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes) {
+  INetworkDefinition* network = builder->createNetworkV2(0U);
+  ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, input_h, input_w });
+  assert(data);
+  std::map<std::string, Weights> weightMap = loadWeights(wts_name);
+
+  // Backbone
+  auto conv0 = convBlock(network, weightMap, *data,  get_width(64, gw), 6, 2, 1,  "model.0");
+  assert(conv0);
+  auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
+  auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
+  auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
+  auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
+  auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
+  auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
+  auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
+  auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
+  auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9");
+
+  // Head
+  auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");
+
+  auto upsample11 = network->addResize(*conv10->getOutput(0));
+  assert(upsample11);
+  upsample11->setResizeMode(ResizeMode::kNEAREST);
+  upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());
+
+  ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
+  auto cat12 = network->addConcatenation(inputTensors12, 2);
+  auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
+  auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");
+
+  auto upsample15 = network->addResize(*conv14->getOutput(0));
+  assert(upsample15);
+  upsample15->setResizeMode(ResizeMode::kNEAREST);
+  upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());
+
+  ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
+  auto cat16 = network->addConcatenation(inputTensors16, 2);
+
+  auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");
+
+  // Segmentation
+  IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (32 + n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
+  auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
+  ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
+  auto cat19 = network->addConcatenation(inputTensors19, 2);
+  auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
+  IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (32 + n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
+  auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
+  ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
+  auto cat22 = network->addConcatenation(inputTensors22, 2);
+  auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
+  IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (32 + n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
+
+  auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2}, input_h, input_w, n_classes, true);
+  yolo->getOutput(0)->setName(kOutputTensorName);
+  network->markOutput(*yolo->getOutput(0));
+
+  auto proto = Proto(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 32, "model.24.proto");
+  proto->getOutput(0)->setName("proto");
+  network->markOutput(*proto->getOutput(0));
+
+  // Engine config
+  builder->setMaxBatchSize(maxBatchSize);
+  config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
+#if defined(USE_FP16)
+  config->setFlag(BuilderFlag::kFP16);
+#elif defined(USE_INT8)
+  std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
+  assert(builder->platformHasFastInt8());
+  config->setFlag(BuilderFlag::kINT8);
+  Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, input_w, input_h, "./coco_calib/", "int8calib.table", kInputTensorName);
+  config->setInt8Calibrator(calibrator);
+#endif
+
+  std::cout << "Building engine, please wait for a while..." << std::endl;
+  ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
+  std::cout << "Build engine successfully!" << std::endl;
+
+  // Don't need the network any more
+  network->destroy();
+
+  // Release host memory
+  for (auto& mem : weightMap) {
+    free((void*)(mem.second.values));
+  }
+
+  return engine;
+}
+
--- a/algorithm/common_det/cuda/yolov7/model.h
+++ b/algorithm/common_det/cuda/yolov7/model.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <NvInfer.h>
+#include <string>
+
+nvinfer1::ICudaEngine* build_det_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
+                                        nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
+                                        float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes);
+
+nvinfer1::ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
+                                           nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
+                                           float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes);
+
+nvinfer1::ICudaEngine* build_cls_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name);
+
+nvinfer1::ICudaEngine* build_seg_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes);
--- a/algorithm/common_det/cuda/yolov7/postprocess.cpp
+++ b/algorithm/common_det/cuda/yolov7/postprocess.cpp
@@ -0,0 +1,198 @@
+#include "postprocess.h"
+#include "utils.h"
+
+
+cv::Rect get_rect(cv::Mat& img, float bbox[4], int input_h, int input_w)
+{
+  float l, r, t, b;
+  float r_w = input_w / (img.cols * 1.0);
+  float r_h = input_h / (img.rows * 1.0);
+  if (r_h > r_w) {
+    l = bbox[0] - bbox[2] / 2.f;
+    r = bbox[0] + bbox[2] / 2.f;
+    t = bbox[1] - bbox[3] / 2.f - (input_h - r_w * img.rows) / 2;
+    b = bbox[1] + bbox[3] / 2.f - (input_h - r_w * img.rows) / 2;
+    l = l / r_w;
+    r = r / r_w;
+    t = t / r_w;
+    b = b / r_w;
+  } else {
+    l = bbox[0] - bbox[2] / 2.f - (input_w - r_h * img.cols) / 2;
+    r = bbox[0] + bbox[2] / 2.f - (input_w - r_h * img.cols) / 2;
+    t = bbox[1] - bbox[3] / 2.f;
+    b = bbox[1] + bbox[3] / 2.f;
+    l = l / r_h;
+    r = r / r_h;
+    t = t / r_h;
+    b = b / r_h;
+  }
+  return cv::Rect(round(l), round(t), round(r - l), round(b - t));
+}
+
+static float iou(float lbox[4], float rbox[4]) {
+  float interBox[] = {
+    (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left
+    (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right
+    (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top
+    (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom
+  };
+
+  if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
+    return 0.0f;
+
+  float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]);
+  return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
+}
+
+static bool cmp(const Detection& a, const Detection& b) {
+  return a.conf > b.conf;
+}
+
+void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
+  int det_size = sizeof(Detection) / sizeof(float);
+  std::map<float, std::vector<Detection>> m;
+  for (int i = 0; i < output[0] && i < kMaxNumOutputBbox; i++) {
+    if (output[1 + det_size * i + 4] <= conf_thresh) continue;
+    Detection det;
+    memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
+    if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Detection>());
+    m[det.class_id].push_back(det);
+  }
+  for (auto it = m.begin(); it != m.end(); it++) {
+    auto& dets = it->second;
+    std::sort(dets.begin(), dets.end(), cmp);
+    for (size_t m = 0; m < dets.size(); ++m) {
+      auto& item = dets[m];
+      res.push_back(item);
+      for (size_t n = m + 1; n < dets.size(); ++n) {
+        if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
+          dets.erase(dets.begin() + n);
+          --n;
+        }
+      }
+    }
+  }
+}
+
+void batch_nms(std::vector<std::vector<Detection>>& res_batch, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh) {
+  res_batch.resize(batch_size);
+  for (int i = 0; i < batch_size; i++) {
+    nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh);
+  }
+}
+
+void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch, int input_h, int input_w) {
+  for (size_t i = 0; i < img_batch.size(); i++) {
+    auto& res = res_batch[i];
+    cv::Mat img = img_batch[i];
+    for (size_t j = 0; j < res.size(); j++) {
+      cv::Rect r = get_rect(img, res[j].bbox, input_h, input_w);
+      cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
+      cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
+    }
+  }
+}
+
+static cv::Rect get_downscale_rect(float bbox[4], float scale) {
+  float left = bbox[0] - bbox[2] / 2;
+  float top = bbox[1] - bbox[3] / 2;
+  float right = bbox[0] + bbox[2] / 2;
+  float bottom = bbox[1] + bbox[3] / 2;
+  left /= scale;
+  top /= scale;
+  right /= scale;
+  bottom /= scale;
+  return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
+}
+
+std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets, int input_h, int input_w) {
+  std::vector<cv::Mat> masks;
+  for (size_t i = 0; i < dets.size(); i++) {
+    cv::Mat mask_mat = cv::Mat::zeros(input_h / 4, input_w / 4, CV_32FC1);
+    auto r = get_downscale_rect(dets[i].bbox, 4);
+    if (r.x < 0) r.x = 0;
+    if (r.y < 0) r.y = 0;
+    if (r.x + r.width > mask_mat.cols) r.width = mask_mat.cols - r.x;
+    if (r.y + r.height > mask_mat.rows) r.height = mask_mat.rows - r.y;
+    // printf(" %d, %d, %d, %d, (%d, %d)\n", r.x, r.y, r.width, r.height, mask_mat.cols, mask_mat.rows);
+
+    for (int x = r.x; x < r.x + r.width; x++) {
+      for (int y = r.y; y < r.y + r.height; y++) {
+        float e = 0.0f;
+        for (int j = 0; j < 32; j++) {
+          e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
+        }
+        e = 1.0f / (1.0f + expf(-e));
+        mask_mat.at<float>(y, x) = e;
+      }
+    }
+
+    cv::resize(mask_mat, mask_mat, cv::Size(input_w, input_h));
+    masks.push_back(mask_mat);
+  }
+  return masks;
+}
+
+cv::Mat scale_mask(cv::Mat mask, cv::Mat img, int input_h, int input_w) {
+  int x, y, w, h;
+  float r_w = input_w / (img.cols * 1.0);
+  float r_h = input_h / (img.rows * 1.0);
+  if (r_h > r_w) {
+    w = input_w;
+    h = r_w * img.rows;
+    x = 0;
+    y = (input_h - h) / 2;
+  } else {
+    w = r_h * img.cols;
+    h = input_h;
+    x = (input_w - w) / 2;
+    y = 0;
+  }
+  cv::Rect r(x, y, w, h);
+  cv::Mat res;
+  cv::resize(mask(r), res, img.size());
+  return res;
+}
+
+void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map, int input_h, int input_w) {
+  static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A,
+                                         0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF,
+                                         0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF,
+                                         0xFF95C8, 0xFF37C7};
+  for (size_t i = 0; i < dets.size(); i++) {
+    cv::Mat img_mask = scale_mask(masks[i], img, input_h, input_w);
+    auto color = colors[(int)dets[i].class_id % colors.size()];
+    auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
+
+    cv::Rect r = get_rect(img, dets[i].bbox, input_h, input_w);
+    for (int x = r.x; x < r.x + r.width; x++) {
+      for (int y = r.y; y < r.y + r.height; y++) {
+        float val = img_mask.at<float>(y, x);
+        if (val <= 0.5) continue;
+        img.at<cv::Vec3b>(y, x)[0] = img.at<cv::Vec3b>(y, x)[0] / 2 + bgr[0] / 2;
+        img.at<cv::Vec3b>(y, x)[1] = img.at<cv::Vec3b>(y, x)[1] / 2 + bgr[1] / 2;
+        img.at<cv::Vec3b>(y, x)[2] = img.at<cv::Vec3b>(y, x)[2] / 2 + bgr[2] / 2;
+      }
+    }
+
+    cv::rectangle(img, r, bgr, 2);
+    
+    // Get the size of the text
+    cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
+    // Set the top left corner of the rectangle
+    cv::Point topLeft(r.x, r.y - textSize.height);
+
+    // Set the bottom right corner of the rectangle
+    cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
+
+    // Set the thickness of the rectangle lines
+    int lineThickness = 2;
+
+    // Draw the rectangle on the image
+    cv::rectangle(img, topLeft, bottomRight, bgr, -1);
+
+    cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
+
+  }
+}
+
--- a/algorithm/common_det/cuda/yolov7/postprocess.h
+++ b/algorithm/common_det/cuda/yolov7/postprocess.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "types.h"
+#include <opencv2/opencv.hpp>
+
+cv::Rect get_rect(cv::Mat& img, float bbox[4], int input_h, int input_w);
+
+void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);
+
+void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);
+
+void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch, int input_h, int input_w);
+
+std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets, int input_h, int input_w);
+
+void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map, int input_h, int input_w);
--- a/algorithm/common_det/cuda/yolov7/preprocess.cu
+++ b/algorithm/common_det/cuda/yolov7/preprocess.cu
@@ -0,0 +1,153 @@
+#include "preprocess.h"
+#include "cuda_utils.h"
+
+static uint8_t* img_buffer_host = nullptr;
+static uint8_t* img_buffer_device = nullptr;
+
+struct AffineMatrix {
+  float value[6];
+};
+
+__global__ void warpaffine_kernel(
+    uint8_t* src, int src_line_size, int src_width,
+    int src_height, float* dst, int dst_width,
+    int dst_height, uint8_t const_value_st,
+    AffineMatrix d2s, int edge) {
+  int position = blockDim.x * blockIdx.x + threadIdx.x;
+  if (position >= edge) return;
+
+  float m_x1 = d2s.value[0];
+  float m_y1 = d2s.value[1];
+  float m_z1 = d2s.value[2];
+  float m_x2 = d2s.value[3];
+  float m_y2 = d2s.value[4];
+  float m_z2 = d2s.value[5];
+
+  int dx = position % dst_width;
+  int dy = position / dst_width;
+  float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
+  float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
+  float c0, c1, c2;
+
+  if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
+    // out of range
+    c0 = const_value_st;
+    c1 = const_value_st;
+    c2 = const_value_st;
+  } else {
+    int y_low = floorf(src_y);
+    int x_low = floorf(src_x);
+    int y_high = y_low + 1;
+    int x_high = x_low + 1;
+
+    uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
+    float ly = src_y - y_low;
+    float lx = src_x - x_low;
+    float hy = 1 - ly;
+    float hx = 1 - lx;
+    float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+    uint8_t* v1 = const_value;
+    uint8_t* v2 = const_value;
+    uint8_t* v3 = const_value;
+    uint8_t* v4 = const_value;
+
+    if (y_low >= 0) {
+      if (x_low >= 0)
+        v1 = src + y_low * src_line_size + x_low * 3;
+
+      if (x_high < src_width)
+        v2 = src + y_low * src_line_size + x_high * 3;
+    }
+
+    if (y_high < src_height) {
+      if (x_low >= 0)
+        v3 = src + y_high * src_line_size + x_low * 3;
+
+      if (x_high < src_width)
+        v4 = src + y_high * src_line_size + x_high * 3;
+    }
+
+    c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
+    c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
+    c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
+  }
+
+  // bgr to rgb 
+  float t = c2;
+  c2 = c0;
+  c0 = t;
+
+  // normalization
+  c0 = c0 / 255.0f;
+  c1 = c1 / 255.0f;
+  c2 = c2 / 255.0f;
+
+  // rgbrgbrgb to rrrgggbbb
+  int area = dst_width * dst_height;
+  float* pdst_c0 = dst + dy * dst_width + dx;
+  float* pdst_c1 = pdst_c0 + area;
+  float* pdst_c2 = pdst_c1 + area;
+  *pdst_c0 = c0;
+  *pdst_c1 = c1;
+  *pdst_c2 = c2;
+}
+
+void cuda_preprocess(
+    uint8_t* src, int src_width, int src_height,
+    float* dst, int dst_width, int dst_height,
+    cudaStream_t stream) {
+
+  int img_size = src_width * src_height * 3;
+  // copy data to pinned memory
+  memcpy(img_buffer_host, src, img_size);
+  // copy data to device memory
+  CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));
+
+  AffineMatrix s2d, d2s;
+  float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);
+
+  s2d.value[0] = scale;
+  s2d.value[1] = 0;
+  s2d.value[2] = -scale * src_width  * 0.5  + dst_width * 0.5;
+  s2d.value[3] = 0;
+  s2d.value[4] = scale;
+  s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
+
+  cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
+  cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
+  cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
+
+  memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
+
+  int jobs = dst_height * dst_width;
+  int threads = 256;
+  int blocks = ceil(jobs / (float)threads);
+
+  warpaffine_kernel<<<blocks, threads, 0, stream>>>(
+      img_buffer_device, src_width * 3, src_width,
+      src_height, dst, dst_width,
+      dst_height, 128, d2s, jobs);
+}
+
+void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
+                           float* dst, int dst_width, int dst_height,
+                           cudaStream_t stream) {
+  int dst_size = dst_width * dst_height * 3;
+  for (size_t i = 0; i < img_batch.size(); i++) {
+    cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, dst_height, stream);
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+}
+
+void cuda_preprocess_init(int max_image_size) {
+  // prepare input data in pinned memory
+  CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
+  // prepare input data in device memory
+  CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
+}
+
+void cuda_preprocess_destroy() {
+  CUDA_CHECK(cudaFree(img_buffer_device));
+  CUDA_CHECK(cudaFreeHost(img_buffer_host));
+}
+
--- a/algorithm/common_det/cuda/yolov7/preprocess.h
+++ b/algorithm/common_det/cuda/yolov7/preprocess.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cstdint>
+#include <opencv2/opencv.hpp>
+
+void cuda_preprocess_init(int max_image_size);
+void cuda_preprocess_destroy();
+void cuda_preprocess(uint8_t* src, int src_width, int src_height,
+                     float* dst, int dst_width, int dst_height,
+                     cudaStream_t stream);
+void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
+                           float* dst, int dst_width, int dst_height,
+                           cudaStream_t stream);
+
--- a/algorithm/common_det/cuda/yolov7/types.h
+++ b/algorithm/common_det/cuda/yolov7/types.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "config.h"
+
+struct YoloKernel {
+  int width;
+  int height;
+  float anchors[kNumAnchor * 2];
+};
+
+struct alignas(float) Detection {
+  float bbox[4];  // center_x center_y w h
+  float conf;  // bbox_conf * cls_conf
+  float class_id;
+  float mask[32];
+};
+
--- a/algorithm/common_det/cuda/yolov7/utils.h
+++ b/algorithm/common_det/cuda/yolov7/utils.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include <dirent.h>
+#include <fstream>
+#include <unordered_map>
+#include <string>
+#include <sstream>
+#include <vector>
+#include <cstring>
+
+static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
+    DIR *p_dir = opendir(p_dir_name);
+    if (p_dir == nullptr) {
+        return -1;
+    }
+
+    struct dirent* p_file = nullptr;
+    while ((p_file = readdir(p_dir)) != nullptr) {
+        if (strcmp(p_file->d_name, ".") != 0 &&
+            strcmp(p_file->d_name, "..") != 0) {
+            //std::string cur_file_name(p_dir_name);
+            //cur_file_name += "/";
+            //cur_file_name += p_file->d_name;
+            std::string cur_file_name(p_file->d_name);
+            file_names.push_back(cur_file_name);
+        }
+    }
+
+    closedir(p_dir);
+    return 0;
+}
+
+// Function to trim leading and trailing whitespace from a string
+static inline std::string trim_leading_whitespace(const std::string& str) {
+    size_t first = str.find_first_not_of(' ');
+    if (std::string::npos == first) {
+        return str;
+    }
+    size_t last = str.find_last_not_of(' ');
+    return str.substr(first, (last - first + 1));
+}
+
+// Src: https://stackoverflow.com/questions/16605967
+static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
+    std::ostringstream out;
+    out.precision(n);
+    out << std::fixed << a_value;
+    return out.str();
+}
+
+static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
+
+    std::ifstream file(labels_filename);
+    // Read each line of the file
+    std::string line;
+    int index = 0;
+    while (std::getline(file, line)) {
+        // Strip the line of any leading or trailing whitespace
+        line = trim_leading_whitespace(line);
+
+        // Add the stripped line to the labels_map, using the loop index as the key
+        labels_map[index] = line;
+        index++;
+    }
+    // Close the file
+    file.close();
+
+    return 0;
+}
+
--- a/algorithm/common_det/cuda/yolov7/yololayer.cu
+++ b/algorithm/common_det/cuda/yolov7/yololayer.cu
@@ -0,0 +1,280 @@
+#include "yololayer.h"
+#include "cuda_utils.h"
+
+#include <cassert>
+#include <vector>
+#include <iostream>
+
+namespace Tn {
+template<typename T> 
+void write(char*& buffer, const T& val) {
+  *reinterpret_cast<T*>(buffer) = val;
+  buffer += sizeof(T);
+}
+
+template<typename T> 
+void read(const char*& buffer, T& val) {
+  val = *reinterpret_cast<const T*>(buffer);
+  buffer += sizeof(T);
+}
+}
+
+namespace nvinfer1 {
+YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const std::vector<YoloKernel>& vYoloKernel) {
+  mClassCount = classCount;
+  mYoloV5NetWidth = netWidth;
+  mYoloV5NetHeight = netHeight;
+  mMaxOutObject = maxOut;
+  is_segmentation_ = is_segmentation;
+  mYoloKernel = vYoloKernel;
+  mKernelCount = vYoloKernel.size();
+
+  CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
+  size_t AnchorLen = sizeof(float)* kNumAnchor * 2;
+  for (int ii = 0; ii < mKernelCount; ii++) {
+    CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
+    const auto& yolo = mYoloKernel[ii];
+    CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+  }
+}
+
+YoloLayerPlugin::~YoloLayerPlugin() {
+  for (int ii = 0; ii < mKernelCount; ii++) {
+    CUDA_CHECK(cudaFree(mAnchor[ii]));
+  }
+  CUDA_CHECK(cudaFreeHost(mAnchor));
+}
+
+// create the plugin at runtime from a byte stream
+YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
+  using namespace Tn;
+  const char *d = reinterpret_cast<const char *>(data), *a = d;
+  read(d, mClassCount);
+  read(d, mThreadCount);
+  read(d, mKernelCount);
+  read(d, mYoloV5NetWidth);
+  read(d, mYoloV5NetHeight);
+  read(d, mMaxOutObject);
+  read(d, is_segmentation_);
+  mYoloKernel.resize(mKernelCount);
+  auto kernelSize = mKernelCount * sizeof(YoloKernel);
+  memcpy(mYoloKernel.data(), d, kernelSize);
+  d += kernelSize;
+  CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
+  size_t AnchorLen = sizeof(float)* kNumAnchor * 2;
+  for (int ii = 0; ii < mKernelCount; ii++) {
+    CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
+    const auto& yolo = mYoloKernel[ii];
+    CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+  }
+  assert(d == a + length);
+}
+
+void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
+  using namespace Tn;
+  char* d = static_cast<char*>(buffer), *a = d;
+  write(d, mClassCount);
+  write(d, mThreadCount);
+  write(d, mKernelCount);
+  write(d, mYoloV5NetWidth);
+  write(d, mYoloV5NetHeight);
+  write(d, mMaxOutObject);
+  write(d, is_segmentation_);
+  auto kernelSize = mKernelCount * sizeof(YoloKernel);
+  memcpy(d, mYoloKernel.data(), kernelSize);
+  d += kernelSize;
+
+  assert(d == a + getSerializationSize());
+}
+
+size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
+  size_t s = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount);
+  s += sizeof(YoloKernel) * mYoloKernel.size();
+  s += sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight);
+  s += sizeof(mMaxOutObject) + sizeof(is_segmentation_);
+  return s;
+}
+
+int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
+  return 0;
+}
+
+Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
+  //output the result to channel
+  int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float);
+  return Dims3(totalsize + 1, 1, 1);
+}
+
+// Set plugin namespace
+void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
+  mPluginNamespace = pluginNamespace;
+}
+
+const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
+  return mPluginNamespace;
+}
+
+// Return the DataType of the plugin output at the requested index
+DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
+  return DataType::kFLOAT;
+}
+
+// Return true if output tensor is broadcast across a batch.
+bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
+  return false;
+}
+
+// Return true if plugin can use input that is broadcast across batch without replication.
+bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
+  return false;
+}
+
+void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT {}
+
+// Attach the plugin object to an execution context and grant the plugin the access to some context resource.
+void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}
+
+// Detach the plugin object from its execution context.
+void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}
+
+const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
+  return "YoloLayer_TRT";
+}
+
+const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
+  return "1";
+}
+
+void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
+  delete this;
+}
+
+// Clone the plugin
+IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
+  YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, is_segmentation_, mYoloKernel);
+  p->setPluginNamespace(mPluginNamespace);
+  return p;
+}
+
+__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); };
+
+__global__ void CalDetection(const float *input, float *output, int noElements,
+    const int netwidth, const int netheight, int maxoutobject, int yoloWidth,
+    int yoloHeight, const float anchors[kNumAnchor * 2], int classes, int outputElem, bool is_segmentation) {
+
+  int idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (idx >= noElements) return;
+
+  int total_grid = yoloWidth * yoloHeight;
+  int bnIdx = idx / total_grid;
+  idx = idx - total_grid * bnIdx;
+  int info_len_i = 5 + classes;
+  if (is_segmentation) info_len_i += 32;
+  const float* curInput = input + bnIdx * (info_len_i * total_grid * kNumAnchor);
+
+  for (int k = 0; k < kNumAnchor; ++k) {
+    float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
+    if (box_prob < kIgnoreThresh) continue;
+    int class_id = 0;
+    float max_cls_prob = 0.0;
+    for (int i = 5; i < 5 + classes; ++i) {
+      float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
+      if (p > max_cls_prob) {
+        max_cls_prob = p;
+        class_id = i - 5;
+      }
+    }
+    float *res_count = output + bnIdx * outputElem;
+    int count = (int)atomicAdd(res_count, 1);
+    if (count >= maxoutobject) return;
+    char *data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
+    Detection *det = (Detection*)(data);
+
+    int row = idx / yoloWidth;
+    int col = idx % yoloWidth;
+
+    det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
+    det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;
+
+    det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
+    det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
+    det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
+    det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
+    det->conf = box_prob * max_cls_prob;
+    det->class_id = class_id;
+
+    for (int i = 0; is_segmentation && i < 32; i++) {
+      det->mask[i] = curInput[idx + k * info_len_i * total_grid + (i + 5 + classes) * total_grid];
+    }
+  }
+}
+
+void YoloLayerPlugin::forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize) {
+  int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
+  for (int idx = 0; idx < batchSize; ++idx) {
+    CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
+  }
+  int numElem = 0;
+  for (unsigned int i = 0; i < mYoloKernel.size(); ++i) {
+    const auto& yolo = mYoloKernel[i];
+    numElem = yolo.width * yolo.height * batchSize;
+    if (numElem < mThreadCount) mThreadCount = numElem;
+
+    CalDetection << < (numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> >
+      (inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, outputElem, is_segmentation_);
+  }
+}
+
+
+int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
+  forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize);
+  return 0;
+}
+
+PluginFieldCollection YoloPluginCreator::mFC{};
+std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
+
+YoloPluginCreator::YoloPluginCreator() {
+  mPluginAttributes.clear();
+  mFC.nbFields = mPluginAttributes.size();
+  mFC.fields = mPluginAttributes.data();
+}
+
+const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
+  return "YoloLayer_TRT";
+}
+
+const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
+  return "1";
+}
+
+const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
+  return &mFC;
+}
+
+IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
+  assert(fc->nbFields == 2);
+  assert(strcmp(fc->fields[0].name, "netinfo") == 0);
+  assert(strcmp(fc->fields[1].name, "kernels") == 0);
+  int *p_netinfo = (int*)(fc->fields[0].data);
+  int class_count = p_netinfo[0];
+  int input_w = p_netinfo[1];
+  int input_h = p_netinfo[2];
+  int max_output_object_count = p_netinfo[3];
+  bool is_segmentation = (bool)p_netinfo[4];
+  std::vector<YoloKernel> kernels(fc->fields[1].length);
+  memcpy(&kernels[0], fc->fields[1].data, kernels.size() * sizeof(YoloKernel));
+  YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation, kernels);
+  obj->setPluginNamespace(mNamespace.c_str());
+  return obj;
+}
+
+IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
+  // This object will be deleted when the network is destroyed, which will
+  // call YoloLayerPlugin::destroy()
+  YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
+  obj->setPluginNamespace(mNamespace.c_str());
+  return obj;
+}
+}
+
--- a/algorithm/common_det/cuda/yolov7/yololayer.h
+++ b/algorithm/common_det/cuda/yolov7/yololayer.h
@@ -0,0 +1,106 @@
+#pragma once
+
+#include "types.h"
+#include "macros.h"
+
+#include <vector>
+#include <string>
+
+namespace nvinfer1 {
+class API YoloLayerPlugin : public IPluginV2IOExt {
+public:
+  YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const std::vector<YoloKernel>& vYoloKernel);
+  YoloLayerPlugin(const void* data, size_t length);
+  ~YoloLayerPlugin();
+
+  int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
+
+  Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
+
+  int initialize() TRT_NOEXCEPT override;
+
+  virtual void terminate() TRT_NOEXCEPT override {};
+
+  virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
+
+  virtual int enqueue(int batchSize, const void* const* inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
+
+  virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
+
+  virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
+
+  bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
+    return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
+  }
+
+  const char* getPluginType() const TRT_NOEXCEPT override;
+
+  const char* getPluginVersion() const TRT_NOEXCEPT override;
+
+  void destroy() TRT_NOEXCEPT override;
+
+  IPluginV2IOExt* clone() const TRT_NOEXCEPT override;
+
+  void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;
+
+  const char* getPluginNamespace() const TRT_NOEXCEPT override;
+
+  DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;
+
+  bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;
+
+  bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
+
+  void attachToContext(
+      cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
+
+  void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;
+
+  void detachFromContext() TRT_NOEXCEPT override;
+
+ private:
+  void forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize = 1);
+  int mThreadCount = 256;
+  const char* mPluginNamespace;
+  int mKernelCount;
+  int mClassCount;
+  int mYoloV5NetWidth;
+  int mYoloV5NetHeight;
+  int mMaxOutObject;
+  bool is_segmentation_;
+  std::vector<YoloKernel> mYoloKernel;
+  void** mAnchor;
+};
+
+class API YoloPluginCreator : public IPluginCreator {
+ public:
+  YoloPluginCreator();
+
+  ~YoloPluginCreator() override = default;
+
+  const char* getPluginName() const TRT_NOEXCEPT override;
+
+  const char* getPluginVersion() const TRT_NOEXCEPT override;
+
+  const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
+
+  IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;
+
+  IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
+
+  void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override {
+    mNamespace = libNamespace;
+  }
+
+  const char* getPluginNamespace() const TRT_NOEXCEPT override {
+    return mNamespace.c_str();
+  }
+
+ private:
+  std::string mNamespace;
+  static PluginFieldCollection mFC;
+  static std::vector<PluginField> mPluginAttributes;
+};
+REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
+};
+
--- a/algorithm/common_det/sv_common_det.cpp
+++ b/algorithm/common_det/sv_common_det.cpp
@@ -0,0 +1,62 @@
+#include "sv_common_det.h"
+#include <cmath>
+#include <fstream>
+
+#ifdef WITH_CUDA
+#include <NvInfer.h>
+#include <cuda_runtime_api.h>
+#include "common_det_cuda_impl.h"
+#endif
+
+
+namespace sv {
+
+
+CommonObjectDetector::CommonObjectDetector()
+{
+  this->_cuda_impl = new CommonObjectDetectorCUDAImpl;
+}
+CommonObjectDetector::~CommonObjectDetector()
+{
+}
+
+bool CommonObjectDetector::setupImpl()
+{
+#ifdef WITH_CUDA
+  return this->_cuda_impl->cudaSetup(this);
+#endif
+  return false;
+}
+
+void CommonObjectDetector::detectImpl(
+  cv::Mat img_,
+  std::vector<float>& boxes_x_,
+  std::vector<float>& boxes_y_,
+  std::vector<float>& boxes_w_,
+  std::vector<float>& boxes_h_,
+  std::vector<int>& boxes_label_,
+  std::vector<float>& boxes_score_,
+  std::vector<cv::Mat>& boxes_seg_
+)
+{
+#ifdef WITH_CUDA
+  this->_cuda_impl->cudaDetect(
+    this,
+    img_,
+    boxes_x_,
+    boxes_y_,
+    boxes_w_,
+    boxes_h_,
+    boxes_label_,
+    boxes_score_,
+    boxes_seg_
+  );
+#endif
+}
+
+
+
+
+
+}
+
--- a/algorithm/ellipse_det/ellipse_detector.cpp
+++ b/algorithm/ellipse_det/ellipse_detector.cpp
--- a/algorithm/ellipse_det/ellipse_detector.h
+++ b/algorithm/ellipse_det/ellipse_detector.h
--- a/algorithm/landing_det/cuda/landing_det_cuda_impl.cpp
+++ b/algorithm/landing_det/cuda/landing_det_cuda_impl.cpp
@@ -0,0 +1,160 @@
+#include "landing_det_cuda_impl.h"
+#include <cmath>
+#include <fstream>
+
+#define SV_MODEL_DIR "/SpireCV/models/"
+#define SV_ROOT_DIR "/SpireCV/"
+
+
+#ifdef WITH_CUDA
+#include "yolov7/logging.h"
+#define TRTCHECK(status)                                 \
+  do                                                     \
+  {                                                      \
+    auto ret = (status);                                 \
+    if (ret != 0)                                        \
+    {                                                    \
+      std::cerr << "Cuda failure: " << ret << std::endl; \
+      abort();                                           \
+    }                                                    \
+  } while (0)
+
+#define DEVICE 0  // GPU id
+#define BATCH_SIZE 1
+#define MAX_IMAGE_INPUT_SIZE_THRESH 3000 * 3000 // ensure it exceed the maximum size in the input images !
+#endif
+
+
+namespace sv {
+
+using namespace cv;
+
+
+#ifdef WITH_CUDA
+using namespace nvinfer1;
+static Logger g_nvlogger;
+#endif
+
+
+LandingMarkerDetectorCUDAImpl::LandingMarkerDetectorCUDAImpl()
+{
+}
+
+
+LandingMarkerDetectorCUDAImpl::~LandingMarkerDetectorCUDAImpl()
+{
+}
+
+
+bool LandingMarkerDetectorCUDAImpl::cudaSetup()
+{
+#ifdef WITH_CUDA
+  std::string trt_model_fn = get_home() + SV_MODEL_DIR + "LandingMarker.engine";
+  if (!is_file_exist(trt_model_fn))
+  {
+    throw std::runtime_error("SpireCV (104) Error loading the LandingMarker TensorRT model (File Not Exist)");
+  }
+  char *trt_model_stream{nullptr};
+  size_t trt_model_size{0};
+  try
+  {
+    std::ifstream file(trt_model_fn, std::ios::binary);
+    file.seekg(0, file.end);
+    trt_model_size = file.tellg();
+    file.seekg(0, file.beg);
+    trt_model_stream = new char[trt_model_size];
+    assert(trt_model_stream);
+    file.read(trt_model_stream, trt_model_size);
+    file.close();
+  }
+  catch (const std::runtime_error &e)
+  {
+    throw std::runtime_error("SpireCV (104) Error loading the TensorRT model!");
+  }
+  
+  // TensorRT
+  IRuntime *runtime = nvinfer1::createInferRuntime(g_nvlogger);
+  assert(runtime != nullptr);
+  ICudaEngine *p_cu_engine = runtime->deserializeCudaEngine(trt_model_stream, trt_model_size);
+  assert(p_cu_engine != nullptr);
+  this->_trt_context = p_cu_engine->createExecutionContext();
+  assert(this->_trt_context != nullptr);
+
+  delete[] trt_model_stream;
+  const ICudaEngine &cu_engine = this->_trt_context->getEngine();
+  assert(cu_engine.getNbBindings() == 2);
+  
+  this->_input_index = cu_engine.getBindingIndex("input");
+  this->_output_index = cu_engine.getBindingIndex("output");
+  TRTCHECK(cudaMalloc(&_p_buffers[this->_input_index], 1 * 3 * 32 * 32 * sizeof(float)));
+  TRTCHECK(cudaMalloc(&_p_buffers[this->_output_index], 1 * 11 * sizeof(float)));
+  TRTCHECK(cudaStreamCreate(&_cu_stream));
+  
+  auto input_dims = nvinfer1::Dims4{1, 3, 32, 32};
+  this->_trt_context->setBindingDimensions(this->_input_index, input_dims);
+  
+  this->_p_data = new float[1 * 3 * 32 * 32];
+  this->_p_prob = new float[1 * 11];
+  // Input
+  TRTCHECK(cudaMemcpyAsync(_p_buffers[this->_input_index], this->_p_data, 1 * 3 * 32 * 32 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream));
+  // this->_trt_context->enqueue(1, _p_buffers, this->_cu_stream, nullptr);
+  this->_trt_context->enqueueV2(_p_buffers, this->_cu_stream, nullptr);
+  // Output
+  TRTCHECK(cudaMemcpyAsync(this->_p_prob, _p_buffers[this->_output_index], 1 * 11 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream));
+  cudaStreamSynchronize(this->_cu_stream);
+  return true;
+#endif
+  return false;
+}
+
+
+void LandingMarkerDetectorCUDAImpl::cudaRoiCNN(
+  std::vector<cv::Mat>& input_rois_,
+  std::vector<int>& output_labels_
+)
+{
+#ifdef WITH_CUDA
+  output_labels_.clear();
+  for (int i=0; i<input_rois_.size(); i++)
+  {
+    cv::Mat e_roi = input_rois_[i];
+    for (int row = 0; row < 32; ++row)
+    {
+      uchar *uc_pixel = e_roi.data + row * e_roi.step; // compute row id
+      for (int col = 0; col < 32; ++col)
+      {
+        // mean=[136.20, 141.50, 145.41], std=[44.77, 44.20, 44.30]
+        this->_p_data[col + row * 32] = ((float)uc_pixel[0] - 136.20f) / 44.77f;
+        this->_p_data[col + row * 32 + 32 * 32] = ((float)uc_pixel[1] - 141.50f) / 44.20f;
+        this->_p_data[col + row * 32 + 32 * 32 * 2] = ((float)uc_pixel[2] - 145.41f) / 44.30f;
+        uc_pixel += 3;
+      }
+    }
+
+    // Input
+    TRTCHECK(cudaMemcpyAsync(_p_buffers[this->_input_index], this->_p_data, 1 * 3 * 32 * 32 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream));
+    // this->_trt_context->enqueue(1, _p_buffers, this->_cu_stream, nullptr);
+    this->_trt_context->enqueueV2(_p_buffers, this->_cu_stream, nullptr);
+    // Output
+    TRTCHECK(cudaMemcpyAsync(this->_p_prob, _p_buffers[this->_output_index], 1 * 11 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream));
+    cudaStreamSynchronize(this->_cu_stream);
+
+    // Find max index
+    double max = 0;
+    int label = 0;
+    for (int i = 0; i < 11; ++i)
+    {
+      if (max < this->_p_prob[i])
+      {
+        max = this->_p_prob[i];
+        label = i;
+      }
+    }
+    output_labels_.push_back(label);
+  }
+#endif
+}
+
+
+}
+
--- a/algorithm/landing_det/cuda/landing_det_cuda_impl.h
+++ b/algorithm/landing_det/cuda/landing_det_cuda_impl.h
@@ -0,0 +1,48 @@
+#ifndef __SV_LANDING_DET_CUDA__
+#define __SV_LANDING_DET_CUDA__
+
+#include "sv_core.h"
+#include <opencv2/opencv.hpp>
+#include <opencv2/aruco.hpp>
+#include <opencv2/tracking.hpp>
+#include <string>
+#include <chrono>
+
+
+
+#ifdef WITH_CUDA
+#include <NvInfer.h>
+#include <cuda_runtime_api.h>
+#endif
+
+
+
+namespace sv {
+
+
+class LandingMarkerDetectorCUDAImpl
+{
+public:
+  LandingMarkerDetectorCUDAImpl();
+  ~LandingMarkerDetectorCUDAImpl();
+
+  bool cudaSetup();
+  void cudaRoiCNN(
+    std::vector<cv::Mat>& input_rois_,
+    std::vector<int>& output_labels_
+  );
+
+#ifdef WITH_CUDA
+  float *_p_data;
+  float *_p_prob;
+  nvinfer1::IExecutionContext *_trt_context;
+  int _input_index;
+  int _output_index;
+  void *_p_buffers[2];
+  cudaStream_t _cu_stream;
+#endif
+};
+
+
+}
+#endif
--- a/algorithm/landing_det/sv_landing_det.cpp
+++ b/algorithm/landing_det/sv_landing_det.cpp
@@ -0,0 +1,48 @@
+#include "sv_landing_det.h"
+#include <cmath>
+#include <fstream>
+#ifdef WITH_CUDA
+#include <NvInfer.h>
+#include <cuda_runtime_api.h>
+#include "landing_det_cuda_impl.h"
+#endif
+
+
+namespace sv {
+
+
+LandingMarkerDetector::LandingMarkerDetector()
+{
+  this->_cuda_impl = new LandingMarkerDetectorCUDAImpl;
+}
+LandingMarkerDetector::~LandingMarkerDetector()
+{
+}
+
+bool LandingMarkerDetector::setupImpl()
+{
+#ifdef WITH_CUDA
+  return this->_cuda_impl->cudaSetup();
+#endif
+  return false;
+}
+
+void LandingMarkerDetector::roiCNN(
+  std::vector<cv::Mat>& input_rois_,
+  std::vector<int>& output_labels_
+)
+{
+#ifdef WITH_CUDA
+  this->_cuda_impl->cudaRoiCNN(
+    input_rois_,
+    output_labels_
+  );
+#endif
+}
+
+
+
+
+
+}
+
--- a/algorithm/sv_algorithm_base.cpp
+++ b/algorithm/sv_algorithm_base.cpp
--- a/algorithm/tracking/ocv470/tracking_ocv470_impl.cpp
+++ b/algorithm/tracking/ocv470/tracking_ocv470_impl.cpp
@@ -0,0 +1,135 @@
+#include "tracking_ocv470_impl.h"
+#include <cmath>
+#include <fstream>
+
+#define SV_MODEL_DIR "/SpireCV/models/"
+#define SV_ROOT_DIR "/SpireCV/"
+
+
+namespace sv {
+
+using namespace cv;
+
+
+SingleObjectTrackerOCV470Impl::SingleObjectTrackerOCV470Impl()
+{
+}
+
+SingleObjectTrackerOCV470Impl::~SingleObjectTrackerOCV470Impl()
+{
+}
+
+
+bool SingleObjectTrackerOCV470Impl::ocv470Setup(SingleObjectTrackerBase* base_)
+{
+  this->_algorithm = base_->getAlgorithm();
+  this->_backend = base_->getBackend();
+  this->_target = base_->getTarget();
+
+#ifdef WITH_OCV470
+  std::string net = get_home() + SV_MODEL_DIR + "dasiamrpn_model.onnx";
+  std::string kernel_cls1 = get_home() + SV_MODEL_DIR + "dasiamrpn_kernel_cls1.onnx";
+  std::string kernel_r1 = get_home() + SV_MODEL_DIR + "dasiamrpn_kernel_r1.onnx";
+
+  std::string backbone = get_home() + SV_MODEL_DIR + "nanotrack_backbone_sim.onnx";
+  std::string neckhead = get_home() + SV_MODEL_DIR + "nanotrack_head_sim.onnx";
+
+  try
+  {
+    TrackerNano::Params nano_params;
+    nano_params.backbone = samples::findFile(backbone);
+    nano_params.neckhead = samples::findFile(neckhead);
+    nano_params.backend = this->_backend;
+    nano_params.target = this->_target;
+
+    _nano = TrackerNano::create(nano_params);
+  }
+  catch (const cv::Exception& ee)
+  {
+    std::cerr << "Exception: " << ee.what() << std::endl;
+    std::cout << "Can't load the network by using the following files:" << std::endl;
+    std::cout << "nanoBackbone : " << backbone << std::endl;
+    std::cout << "nanoNeckhead : " << neckhead << std::endl;
+  }
+
+  try
+  {
+    TrackerDaSiamRPN::Params params;
+    params.model = samples::findFile(net);
+    params.kernel_cls1 = samples::findFile(kernel_cls1);
+    params.kernel_r1 = samples::findFile(kernel_r1);
+    params.backend = this->_backend;
+    params.target = this->_target;
+    _siam_rpn = TrackerDaSiamRPN::create(params);
+  }
+  catch (const cv::Exception& ee)
+  {
+    std::cerr << "Exception: " << ee.what() << std::endl;
+    std::cout << "Can't load the network by using the following files:" << std::endl;
+    std::cout << "siamRPN : " << net << std::endl;
+    std::cout << "siamKernelCL1 : " << kernel_cls1 << std::endl;
+    std::cout << "siamKernelR1 : " << kernel_r1 << std::endl;
+  }
+  return true;
+#endif
+  return false;
+}
+
+
+void SingleObjectTrackerOCV470Impl::ocv470Init(cv::Mat img_, const cv::Rect& bounding_box_)
+{
+#ifdef WITH_OCV470
+  if (this->_algorithm == "kcf")
+  {
+    TrackerKCF::Params params;
+    _kcf = TrackerKCF::create(params);
+    _kcf->init(img_, bounding_box_);
+  }
+  else if (this->_algorithm == "csrt")
+  {
+    TrackerCSRT::Params params;
+    _csrt = TrackerCSRT::create(params);
+    _csrt->init(img_, bounding_box_);
+  }
+  else if (this->_algorithm == "siamrpn")
+  {
+    _siam_rpn->init(img_, bounding_box_);
+  }
+  else if (this->_algorithm == "nano")
+  {
+    _nano->init(img_, bounding_box_);
+  }
+#endif
+}
+
+
+bool SingleObjectTrackerOCV470Impl::ocv470Track(cv::Mat img_, cv::Rect& output_bbox_)
+{
+#ifdef WITH_OCV470
+  bool ok = false;
+  if (this->_algorithm == "kcf")
+  {
+    ok = _kcf->update(img_, output_bbox_);
+  }
+  else if (this->_algorithm == "csrt")
+  {
+    ok = _csrt->update(img_, output_bbox_);
+  }
+  else if (this->_algorithm == "siamrpn")
+  {
+    ok = _siam_rpn->update(img_, output_bbox_);
+  }
+  else if (this->_algorithm == "nano")
+  {
+    ok = _nano->update(img_, output_bbox_);
+  }
+  return ok;
+#endif
+  return false;
+}
+
+
+
+
+}
+
--- a/algorithm/tracking/ocv470/tracking_ocv470_impl.h
+++ b/algorithm/tracking/ocv470/tracking_ocv470_impl.h
@@ -0,0 +1,40 @@
+#ifndef __SV_TRACKING_OCV470__
+#define __SV_TRACKING_OCV470__
+
+#include "sv_core.h"
+#include <opencv2/opencv.hpp>
+#include <opencv2/aruco.hpp>
+#include <opencv2/tracking.hpp>
+#include <string>
+#include <chrono>
+
+
+
+namespace sv {
+
+
+class SingleObjectTrackerOCV470Impl
+{
+public:
+  SingleObjectTrackerOCV470Impl();
+  ~SingleObjectTrackerOCV470Impl();
+
+  bool ocv470Setup(SingleObjectTrackerBase* base_);
+  void ocv470Init(cv::Mat img_, const cv::Rect& bounding_box_);
+  bool ocv470Track(cv::Mat img_, cv::Rect& output_bbox_);
+
+  std::string _algorithm;
+  int _backend;
+  int _target;
+
+#ifdef WITH_OCV470
+  cv::Ptr<cv::TrackerDaSiamRPN> _siam_rpn;
+  cv::Ptr<cv::TrackerKCF> _kcf;
+  cv::Ptr<cv::TrackerCSRT> _csrt;
+  cv::Ptr<cv::TrackerNano> _nano;
+#endif
+};
+
+
+}
+#endif
--- a/algorithm/tracking/sv_tracking.cpp
+++ b/algorithm/tracking/sv_tracking.cpp
@@ -0,0 +1,41 @@
+#include "sv_tracking.h"
+#include <cmath>
+#include <fstream>
+#include "tracking_ocv470_impl.h"
+
+
+namespace sv {
+
+
+SingleObjectTracker::SingleObjectTracker()
+{
+  this->_ocv470_impl = new SingleObjectTrackerOCV470Impl;
+}
+SingleObjectTracker::~SingleObjectTracker()
+{
+}
+
+bool SingleObjectTracker::setupImpl()
+{
+#ifdef WITH_OCV470
+  return this->_ocv470_impl->ocv470Setup(this);
+#endif
+  return false;
+}
+void SingleObjectTracker::initImpl(cv::Mat img_, const cv::Rect& bounding_box_)
+{
+#ifdef WITH_OCV470
+  this->_ocv470_impl->ocv470Init(img_, bounding_box_);
+#endif
+}
+bool SingleObjectTracker::trackImpl(cv::Mat img_, cv::Rect& output_bbox_)
+{
+#ifdef WITH_OCV470
+  return this->_ocv470_impl->ocv470Track(img_, output_bbox_);
+#endif
+  return false;
+}
+
+
+}
+