diff --git a/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/Kconfig b/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/Kconfig
index fa5819f4f..2d25a5710 100644
--- a/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/Kconfig
+++ b/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/Kconfig
@@ -3,5 +3,9 @@ menuconfig USING_K210_YOLOV2_DETECT
     depends on USING_KPU_PROCESSING
     default n
 
+config CAMERA_DEV_DRIVER
+    string "Set camera dev path"
+    default "/dev/ov2640"
+
diff --git a/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/Makefile b/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/Makefile
new file mode 100644
index 000000000..767322091
--- /dev/null
+++ b/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/Makefile
@@ -0,0 +1,4 @@
+SRC_FILES := k210_yolov2_detect.c
+
+include $(KERNEL_ROOT)/compiler.mk
+
diff --git a/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/k210_yolov2_detect.h b/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/k210_yolov2_detect.h
index 47427e734..935b23067 100644
--- a/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/k210_yolov2_detect.h
+++ b/APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure/k210_yolov2_detect.h
@@ -1,7 +1,10 @@
 #ifndef _K210_DETECT_H_
 #define _K210_DETECT_H_
 
+#include
+#include
 #include
+#include "sleep.h"
 
 void k210_detect(char *json_file_path);
diff --git a/APP_Framework/Framework/knowing/kpu/yolov2/Makefile b/APP_Framework/Framework/knowing/kpu/yolov2/Makefile
new file mode 100644
index 000000000..6ebd0b800
--- /dev/null
+++ b/APP_Framework/Framework/knowing/kpu/yolov2/Makefile
@@ -0,0 +1,4 @@
+SRC_FILES := region_layer.c
+
+include $(KERNEL_ROOT)/compiler.mk
+
diff --git a/APP_Framework/Framework/knowing/kpu/yolov2/region_layer.c b/APP_Framework/Framework/knowing/kpu/yolov2/region_layer.c
index 255bf82a0..32a6a08dc 100644
--- a/APP_Framework/Framework/knowing/kpu/yolov2/region_layer.c
+++ b/APP_Framework/Framework/knowing/kpu/yolov2/region_layer.c
@@ -224,7 +224,7 @@ static void get_region_boxes(region_layer_t *rl, float *predictions, float **pro
     correct_region_boxes(rl, boxes);
 }
 
-static int nms_comparator(void *pa, void *pb)
+static int nms_comparator(const void *pa,const void *pb)
 {
     sortable_box_t a = *(sortable_box_t *)pa;
     sortable_box_t b = *(sortable_box_t *)pb;
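Note on the nms_comparator change above: qsort() requires a comparator of type int (*)(const void *, const void *), so the const-qualified signature lets region_layer.c pass the function to qsort() without a cast. A self-contained sketch of the pattern (this simplified sortable_box_t and the sort_boxes helper are illustrative, not the real definitions):

    #include <stdlib.h>

    /* Illustrative stand-in for sortable_box_t; the real struct lives in region_layer.c. */
    typedef struct {
        int index;
        float confidence;
    } sortable_box_t;

    /* qsort() requires int (*)(const void *, const void *); sort descending by confidence. */
    static int nms_comparator(const void *pa, const void *pb)
    {
        const sortable_box_t *a = pa;
        const sortable_box_t *b = pb;

        if (a->confidence < b->confidence)
            return 1;   /* a sorts after b */
        if (a->confidence > b->confidence)
            return -1;
        return 0;
    }

    static void sort_boxes(sortable_box_t *boxes, size_t n)
    {
        qsort(boxes, n, sizeof(boxes[0]), nms_comparator);
    }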
diff --git a/APP_Framework/Framework/knowing/kpu/yolov2_json/Makefile b/APP_Framework/Framework/knowing/kpu/yolov2_json/Makefile
new file mode 100644
index 000000000..50b092a60
--- /dev/null
+++ b/APP_Framework/Framework/knowing/kpu/yolov2_json/Makefile
@@ -0,0 +1,4 @@
+SRC_FILES := json_parser.c
+
+include $(KERNEL_ROOT)/compiler.mk
+
diff --git a/APP_Framework/Framework/knowing/kpu/yolov2_json/json_parser.c b/APP_Framework/Framework/knowing/kpu/yolov2_json/json_parser.c
index 9a3167f64..ada573872 100644
--- a/APP_Framework/Framework/knowing/kpu/yolov2_json/json_parser.c
+++ b/APP_Framework/Framework/knowing/kpu/yolov2_json/json_parser.c
@@ -1,6 +1,9 @@
 #include "json_parser.h"
 
-#include
+// #include
+#include
+#include
+#include
 
 #include "cJSON.h"
@@ -31,9 +34,9 @@ yolov2_params_t param_parse(char *json_file_path)
     } else {
         printf("Reading config from: %s\n", json_file_path);
     }
+    read(fin, buffer, sizeof(buffer));
     close(fin);
 
-    // read json string
     json_obj = cJSON_Parse(buffer);
 
     // free(buffer);
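One caveat in the json_parser.c hunk: the return value of read() is ignored and buffer is handed straight to cJSON_Parse(), which expects a NUL-terminated string. A sketch of a safer version of the same read-then-parse pattern (the buffer size and the load_json helper name are illustrative, not from this patch):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include "cJSON.h"

    static cJSON *load_json(const char *path)
    {
        char buffer[4096] = {0};
        int fin = open(path, O_RDONLY);
        if (fin < 0) {
            printf("open %s failed\n", path);
            return NULL;
        }

        /* Keep one byte spare so the buffer stays NUL-terminated,
         * and check how much read() actually returned. */
        ssize_t n = read(fin, buffer, sizeof(buffer) - 1);
        close(fin);
        if (n <= 0)
            return NULL;
        buffer[n] = '\0';

        return cJSON_Parse(buffer);  /* caller frees with cJSON_Delete() */
    }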
diff --git a/APP_Framework/Framework/transform_layer/xizi/user_api/posix_support/include/pthread.h b/APP_Framework/Framework/transform_layer/xizi/user_api/posix_support/include/pthread.h
index efe995dc7..acebf067f 100644
--- a/APP_Framework/Framework/transform_layer/xizi/user_api/posix_support/include/pthread.h
+++ b/APP_Framework/Framework/transform_layer/xizi/user_api/posix_support/include/pthread.h
@@ -73,6 +73,12 @@ typedef int pid_t;
 int pthread_atfork(void (*prepare)(void), void (*parent)(void), void (*child)(void));
 int pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg);
+int pthread_attr_init(pthread_attr_t *attr);
+int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stack_size);
+int pthread_attr_setschedparam(pthread_attr_t *attr,struct sched_param const *param);
+int pthread_attr_setstack(pthread_attr_t *attr,
+                          void *stack_base,
+                          size_t stack_size);
 void pthread_exit(void *value_ptr);
 int pthread_detach(pthread_t thread);
 int pthread_join(pthread_t thread, void **retval);
diff --git a/APP_Framework/Framework/transform_layer/xizi/user_api/posix_support/pthread.c b/APP_Framework/Framework/transform_layer/xizi/user_api/posix_support/pthread.c
index 3b8b170fd..ce33b259a 100644
--- a/APP_Framework/Framework/transform_layer/xizi/user_api/posix_support/pthread.c
+++ b/APP_Framework/Framework/transform_layer/xizi/user_api/posix_support/pthread.c
@@ -22,6 +22,10 @@
 #include
 #include "include/pthread.h"
 
+#define DEFAULT_STACK_SIZE 2048
+#define DEFAULT_PRIORITY (KTASK_PRIORITY_MAX/2 + KTASK_PRIORITY_MAX/4)
+
+
 int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                    void *(*start_routine)(void *), void *arg)
 {
@@ -55,6 +59,27 @@
 }
 
+int pthread_attr_init(pthread_attr_t *attr)
+{
+    return 0;
+}
+
+int pthread_attr_setschedparam(pthread_attr_t *attr,
+                               struct sched_param const *param)
+{
+    NULL_PARAM_CHECK(attr != NULL);
+    NULL_PARAM_CHECK(param != NULL);
+
+    attr->schedparam.sched_priority = param->sched_priority;
+
+    return 0;
+}
+
+int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stack_size)
+{
+    return 0;
+}
+
 void pthread_exit(void *value_ptr){
     //todo add exit value
     UserTaskQuit();
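The new pthread_attr_* functions follow the usual POSIX call sequence, shown below. Note that in this patch pthread_attr_init() and pthread_attr_setstacksize() are stubs returning 0, so only the priority set via pthread_attr_setschedparam() actually reaches the created task. A minimal usage sketch (the priority value and worker function are illustrative):

    #include <pthread.h>

    static void *worker(void *arg)
    {
        /* ... thread body ... */
        return NULL;
    }

    int spawn_worker(void)
    {
        pthread_t tid;
        pthread_attr_t attr;
        struct sched_param sp;

        sp.sched_priority = 20;                  /* illustrative value */

        pthread_attr_init(&attr);                /* stub: returns 0 */
        pthread_attr_setstacksize(&attr, 4096);  /* stub: returns 0 */
        pthread_attr_setschedparam(&attr, &sp);  /* copied into attr->schedparam */

        return pthread_create(&tid, &attr, worker, NULL);
    }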
diff --git a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/board.c b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/board.c
index ad4ec5406..1a76f26c6 100644
--- a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/board.c
+++ b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/board.c
@@ -70,6 +70,7 @@ extern int HwLcdInit(void);
 extern int HwSpiInit(void);
 extern int HwSoftSPIInit(void);
 extern int HwWiznetInit(void);
+extern int HwDvpInit(void);
 
 #include
 #ifdef MOUNT_USB
diff --git a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/Kconfig b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/Kconfig
index f6322f544..31d42d085 100755
--- a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/Kconfig
+++ b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/Kconfig
@@ -123,9 +123,17 @@ menuconfig BSP_USING_WIZCHIP
 
 menuconfig BSP_USING_CAMERA
     bool "Using camera device"
-    default y
+    default n
     select RESOURCES_CAMERA
     if BSP_USING_CAMERA
        source "$BSP_DIR/third_party_driver/dvp/Kconfig"
     endif
 
+menuconfig BSP_USING_KPU
+    bool "Using kpu device"
+    default n
+    select RESOURCES_KPU
+    if BSP_USING_KPU
+       source "$BSP_DIR/third_party_driver/kpu/Kconfig"
+    endif
+
diff --git a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/Makefile b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/Makefile
index 7c5f9ad84..51804af84 100644
--- a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/Makefile
+++ b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/Makefile
@@ -59,9 +59,13 @@ endif
 ifeq ($(CONFIG_BSP_USING_WIZCHIP),y)
 	SRC_DIR += ethernet
 endif
+
 ifeq ($(CONFIG_BSP_USING_CAMERA),y)
 	SRC_DIR += dvp
 endif
+ifeq ($(CONFIG_BSP_USING_KPU),y)
+	SRC_DIR += kpu
+endif
 
 include $(KERNEL_ROOT)/compiler.mk
diff --git a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/include/kpu.h b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/include/kpu.h
new file mode 100644
index 000000000..1bf683c90
--- /dev/null
+++ b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/include/kpu.h
@@ -0,0 +1,930 @@
+/* Copyright 2018 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _KPU_H
+#define _KPU_H
+
+#include
+#include
+#include "dmac.h"
+
+#define kpu_matmul_begin kpu_conv2d_output
+
+typedef int (*plic_irq_callback_t)(void *ctx);
+
+typedef struct
+{
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t int_en:1;
+            uint64_t ram_flag:1;
+            uint64_t full_add:1;
+            uint64_t depth_wise_layer:1;
+            uint64_t reserved:60;
+        } data;
+    } interrupt_enabe;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t image_src_addr:15;
+            uint64_t reserved0:17;
+            uint64_t image_dst_addr:15;
+            uint64_t reserved1:17;
+        } data;
+    } image_addr;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t i_ch_num:10;
+            uint64_t reserved0:22;
+            uint64_t o_ch_num:10;
+            uint64_t reserved1:6;
+            uint64_t o_ch_num_coef:10;
+            uint64_t reserved2:6;
+        } data;
+    } image_channel_num;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t i_row_wid:10;
+            uint64_t i_col_high:9;
+            uint64_t reserved0:13;
+            uint64_t o_row_wid:10;
+            uint64_t o_col_high:9;
+            uint64_t reserved1:13;
+        } data;
+    } image_size;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t kernel_type:3;
+            uint64_t pad_type:1;
+            uint64_t pool_type:4;
+            uint64_t first_stride:1;
+            uint64_t bypass_conv:1;
+            uint64_t load_para:1;
+            uint64_t reserved0:5;
+            uint64_t dma_burst_size:8;
+            uint64_t pad_value:8;
+            uint64_t bwsx_base_addr:32;
+        } data;
+    } kernel_pool_type_cfg;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t load_coor:1;
+            uint64_t load_time:6;
+            uint64_t reserved0:8;
+            uint64_t para_size:17;
+            uint64_t para_start_addr:32;
+        } data;
+    } kernel_load_cfg;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t coef_column_offset:4;
+            uint64_t coef_row_offset:12;
+            uint64_t reserved0:48;
+        } data;
+    } kernel_offset;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t channel_switch_addr:15;
+            uint64_t reserved:1;
+            uint64_t row_switch_addr:4;
+            uint64_t coef_size:8;
+            uint64_t coef_group:3;
+            uint64_t load_act:1;
+            uint64_t active_addr:32;
+        } data;
+    } kernel_calc_type_cfg;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t wb_channel_switch_addr:15;
+            uint64_t reserved0:1;
+            uint64_t wb_row_switch_addr:4;
+            uint64_t wb_group:3;
+            uint64_t reserved1:41;
+        } data;
+    } write_back_cfg;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t shr_w:4;
+            uint64_t shr_x:4;
+            uint64_t arg_w:24;
+            uint64_t arg_x:24;
+            uint64_t reserved0:8;
+        } data;
+    } conv_value;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t arg_add:40;
+            uint64_t reserved:24;
+        } data;
+    } conv_value2;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t send_data_out:1;
+            uint64_t reserved:15;
+            uint64_t channel_byte_num:16;
+            uint64_t dma_total_byte:32;
+        } data;
+    } dma_parameter;
+} kpu_layer_argument_t;
+
+typedef struct
+{
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t shift_number:8;
+            uint64_t y_mul:16;
+            uint64_t x_start:36;
+        } data;
+    } activate_para[16];
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint8_t result_bias[8];
+        } data;
+    } activate_para_bias0;
+
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint8_t result_bias[8];
+        } data;
+    } activate_para_bias1;
+} kpu_activate_table_t;
+
+typedef struct
+{
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint64_t norm_mul:24;
+            uint64_t norm_add:32;
+            uint64_t norm_shift:4;
+        } data;
+    } batchnorm;
+} kpu_batchnorm_argument_t;
+
+
+typedef struct
+{
+    union
+    {
+        uint64_t reg;
+        struct
+        {
+            uint16_t weight[9];
+        } data;
+    } weights;
+} kpu_weights_kernel_16_3x3_t;
+
+typedef struct
+{
+    uint64_t calc_done_int:1;
+    uint64_t layer_cfg_almost_empty_int:1;
+    uint64_t layer_cfg_almost_full_int:1;
+    uint64_t reserved:61;
+} kpu_config_interrupt_t;
+
+typedef struct
+{
+    uint64_t fifo_full_threshold:4;
+    uint64_t fifo_empty_threshold:4;
+    uint64_t reserved:56;
+} kpu_config_fifo_threshold_t;
+
+typedef struct
+{
+    uint64_t dma_fifo_flush_n:1;
+    uint64_t gs_fifo_flush_n:1;
+    uint64_t cfg_fifo_flush_n:1;
+    uint64_t cmd_fifo_flush_n:1;
+    uint64_t resp_fifo_flush_n:1;
+    uint64_t reserved:59;
+} kpu_config_fifo_ctrl_t;
+
+typedef struct
+{
+    uint64_t eight_bit_mode:1;
+    uint64_t reserved:63;
+} kpu_config_eight_bit_mode_t;
+
+
+typedef struct
+{
+    volatile uint64_t layer_argument_fifo;
+
+    volatile union
+    {
+        uint64_t reg;
+        kpu_config_interrupt_t data;
+    } interrupt_status;
+
+    volatile union
+    {
+        uint64_t reg;
+        kpu_config_interrupt_t data;
+    } interrupt_raw;
+
+    volatile union {
+        uint64_t reg;
+        kpu_config_interrupt_t data;
+    } interrupt_mask;
+
+    volatile union
+    {
+        uint64_t reg;
+        kpu_config_interrupt_t data;
+    } interrupt_clear;
+
+    volatile union
+    {
+        uint64_t reg;
+        kpu_config_fifo_threshold_t data;
+    } fifo_threshold;
+
+    volatile uint64_t fifo_data_out;
+
+    volatile union
+    {
+        uint64_t reg;
+        kpu_config_fifo_ctrl_t data;
+    } fifo_ctrl;
+
+    volatile union
+    {
+        uint64_t reg;
+        kpu_config_eight_bit_mode_t data;
+    } eight_bit_mode;
+} kpu_config_t;
+
+typedef struct
+{
+    kpu_layer_argument_t *layers;
+    kpu_layer_argument_t *remain_layers;
+    plic_irq_callback_t callback;
+    void *ctx;
+    uint64_t *src;
+    uint64_t *dst;
+    uint32_t src_length;
+    uint32_t dst_length;
+    uint32_t layers_length;
+    uint32_t remain_layers_length;
+    dmac_channel_number_t dma_ch;
+    uint32_t eight_bit_mode;
+    float output_scale;
+    float output_bias;
+    float input_scale;
+    float input_bias;
+} kpu_task_t;
+
+typedef struct
+{
+    uint32_t version;
+    uint32_t flags;
+    uint32_t arch;
+    uint32_t layers_length;
+    uint32_t max_start_address;
+    uint32_t main_mem_usage;
+    uint32_t output_count;
+} kpu_kmodel_header_t;
+
+typedef struct
+{
+    uint32_t version;
+    uint32_t flags;
+    uint32_t layers_length;
+    uint32_t max_start_address;
+    uint32_t layers_argument_start;
+} kpu_model_header_t;
+
+typedef struct
+{
+    uint32_t address;
+    uint32_t size;
+} kpu_model_output_t;
+
+typedef enum
+{
+    KL_INVALID = 0,
+    KL_ADD,
+    KL_QUANTIZED_ADD,
+    KL_GLOBAL_MAX_POOL2D,
+    KL_QUANTIZED_GLOBAL_MAX_POOL2D,
+    KL_GLOBAL_AVERAGE_POOL2D,
+    KL_QUANTIZED_GLOBAL_AVERAGE_POOL2D,
+    KL_MAX_POOL2D,
+    KL_QUANTIZED_MAX_POOL2D,
+    KL_AVERAGE_POOL2D,
+    KL_QUANTIZED_AVERAGE_POOL2D,
+    KL_QUANTIZE,
+    KL_DEQUANTIZE,
+    KL_REQUANTIZE,
+    KL_L2_NORMALIZATION,
+    KL_SOFTMAX,
+    KL_CONCAT,
+    KL_QUANTIZED_CONCAT,
+    KL_FULLY_CONNECTED,
+    KL_QUANTIZED_FULLY_CONNECTED,
+    KL_TENSORFLOW_FLATTEN,
+    KL_QUANTIZED_TENSORFLOW_FLATTEN,
+    KL_RESIZE_NEAREST_NEIGHBOR,
+    KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR,
+    KL_CHANNELWISE_DEQUANTIZE,
+    KL_K210_CONV = 10240,
+    KL_K210_ADD_PADDING,
+    KL_K210_REMOVE_PADDING,
+    KL_K210_UPLOAD
+} kpu_model_layer_type_t;
+
+typedef struct
+{
+    uint32_t type;
+    uint32_t body_size;
+} kpu_model_layer_header_t;
+
+typedef enum
+{
+    KLF_NONE = 0,
+    KLF_MAIN_MEM_OUT = 1
+} kpu_model_layer_flags_t;
+
+typedef enum
+{
+    KLP_SAME = 0,
+    KLP_VALID = 1
+} kpu_model_padding_t;
+
+typedef enum
+{
+    KLA_LINEAR = 0,
+    KLA_RELU = 1,
+    KLA_RELU6 = 2
+} kpu_model_activation_t;
+
+typedef struct
+{
+    float scale;
+    float bias;
+} kpu_model_quant_param_t;
+
+typedef struct
+{
+    uint32_t width;
+    uint32_t height;
+    uint32_t channels;
+} kpu_model_shape_t;
+
+typedef struct
+{
+    uint32_t start;
+    uint32_t size;
+} kpu_model_memory_range_t;
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_out_address;
+    uint32_t layer_offset;
+    uint32_t weights_offset;
+    uint32_t bn_offset;
+    uint32_t act_offset;
+} kpu_model_conv_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_a_address;
+    uint32_t main_mem_in_b_address;
+    uint32_t main_mem_out_address;
+    uint32_t count;
+} kpu_model_add_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_a_address;
+    uint32_t main_mem_in_b_address;
+    uint32_t main_mem_out_address;
+    uint32_t count;
+    int32_t in_a_offset;
+    int32_t in_a_mul;
+    int32_t in_a_shift;
+    int32_t in_b_offset;
+    int32_t in_b_mul;
+    int32_t in_b_shift;
+    int32_t out_offset;
+    int32_t out_mul;
+    int32_t out_shift;
+} kpu_model_quant_add_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    uint32_t kernel_size;
+    uint32_t channels;
+} kpu_model_gap2d_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    kpu_model_shape_t in_shape;
+    kpu_model_shape_t out_shape;
+    uint32_t kernel_width;
+    uint32_t kernel_height;
+    uint32_t stride_width;
+    uint32_t stride_height;
+    uint32_t padding_width;
+    uint32_t padding_height;
+} kpu_model_quant_max_pool2d_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    kpu_model_shape_t in_shape;
+    kpu_model_shape_t out_shape;
+    uint32_t kernel_width;
+    uint32_t kernel_height;
+    uint32_t stride_width;
+    uint32_t stride_height;
+    uint32_t padding_width;
+    uint32_t padding_height;
+    kpu_model_activation_t act;
+} kpu_model_ave_pool2d_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t mem_out_address;
+    uint32_t count;
+    kpu_model_quant_param_t quant_param;
+} kpu_model_quantize_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    uint32_t count;
+    kpu_model_quant_param_t quant_param;
+} kpu_model_dequantize_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    uint32_t count;
+    uint8_t table[256];
+} kpu_model_requantize_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t kpu_mem_out_address;
+    uint32_t channels;
+} kpu_model_add_padding_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    uint32_t channels;
+} kpu_model_remove_padding_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t kpu_mem_out_address;
+    uint32_t width;
+    uint32_t height;
+    uint32_t channels;
+} kpu_model_upload_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    uint32_t channels;
+} kpu_model_l2_norm_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    uint32_t channels;
+} kpu_model_softmax_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_out_address;
+    uint32_t input_count;
+    kpu_model_memory_range_t inputs_mem[0];
+} kpu_model_concat_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    uint32_t in_channels;
+    uint32_t out_channels;
+    kpu_model_activation_t act;
+    float weights[0];
+} kpu_model_fully_connected_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    kpu_model_shape_t shape;
+} kpu_model_tf_flatten_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    kpu_model_shape_t in_shape;
+    uint32_t out_width;
+    uint32_t out_height;
+    uint32_t align_corners;
+} kpu_model_resize_nearest_neighbor_layer_argument_t;
+
+typedef struct
+{
+    uint32_t flags;
+    uint32_t main_mem_in_address;
+    uint32_t main_mem_out_address;
+    uint32_t channels;
+    uint32_t channel_size;
+    kpu_model_quant_param_t quant_params[0];
+} kpu_model_channelwise_dequant_argument_t;
+
+typedef void(*kpu_done_callback_t)(void* userdata);
+
+typedef struct
+{
+    const uint8_t *model_buffer;
+    uint8_t *main_buffer;
+    uint32_t output_count;
+    const kpu_model_output_t *outputs;
+    const kpu_model_layer_header_t *layer_headers;
+    const uint8_t *body_start;
+    uint32_t layers_length;
+    volatile uint32_t current_layer;
+    const uint8_t * volatile current_body;
+    dmac_channel_number_t dma_ch;
+    kpu_done_callback_t done_callback;
+    void *userdata;
+} kpu_model_context_t;
+
+typedef struct
+{
+    uint32_t weigths_offset;
+    uint32_t bn_offset;
+    uint32_t act_offset;
+    float input_scale;
+    float input_bias;
+    float output_scale;
+    float output_bias;
+} kpu_model_layer_metadata_t;
+
+typedef struct _quantize_param
+{
+    float scale;
+    float bias;
+} quantize_param_t;
+
+extern volatile kpu_config_t *const kpu;
+/**
+ * @brief Model compiler init kpu handler
+ *
+ * @param[in] task Kpu handler
+ *
+ * @return Kpu handler
+ */
+extern kpu_task_t *kpu_task_init(kpu_task_t* task);
+
+/**
+ * @brief Kpu run for AI
+ *
+ * @param[in] task Kpu handler
+ * @param[in] dma_ch DMA for kpu
+ * @param[in] src The picture data
+ * @param[in] dest The result of kpu
+ * @param[in] callback The callback of kpu
+ *
+ * @return result
+ *     - 0 Success
+ *     - Other Fail. Kpu is busy.
+ */
+int kpu_run(kpu_task_t* task, dmac_channel_number_t dma_ch, const void *src, void* dest, plic_irq_callback_t callback);
+
+/**
+ * @brief Get kpu result buf
+ *
+ * @param[in] task Kpu handler
+ *
+ * @return Kpu result buf
+ */
+uint8_t *kpu_get_output_buf(kpu_task_t* task);
+
+/**
+ * @brief Release kpu output buf
+ *
+ * @param[in] output_buf Kpu output buf
+ *
+ */
+void kpu_release_output_buf(uint8_t *output_buf);
+
+/**
+ * @brief Kpu run for AI
+ *
+ * @param[in] task Kpu handler
+ *
+ * @return result
+ *     - 0 Success
+ *     - Other Fail. Kpu is busy.
+ */
+int kpu_start(kpu_task_t *task);
+
+/**
+ * @brief Initialize kpu handler
+ *
+ * @param[in] task Kpu handler
+ *
+ * @return result
+ *     - 0 Success
+ *     - Other Fail.
+ */
+int kpu_single_task_init(kpu_task_t *task);
+
+/**
+ * @brief Uninitialize kpu handler
+ *
+ * @param[in] task Kpu handler
+ *
+ * @return result
+ *     - 0 Success
+ *     - Other Fail.
+ */
+int kpu_single_task_deinit(kpu_task_t *task);
+
+/**
+ * @brief Load kmodel and init kpu task
+ *
+ * @param[in] task Kpu handler
+ * @param[in] buffer Kmodel
+ * @param[in] meta Test data
+ *
+ * @return result
+ *     - 0 Success
+ *     - Other Fail.
+ */
+int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta);
+
+/**
+ * @brief Kpu initialize
+ *
+ * @param[in] eight_bit_mode 0:16bit mode 1:8bit mode
+ * @param[in] callback Callback of kpu
+ * @param[in] userdata Data of callback
+ *
+ */
+void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata);
+
+/**
+ * @brief Kpu input data by dma
+ *
+ * @param[in] layer Kpu task layer
+ * @param[in] src Image data
+ * @param[in] dma_ch Dmac channel
+ * @param[in] callback Dmac complete callback
+ * @param[in] userdata Data of callback
+ *
+ */
+void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata);
+
+/**
+ * @brief Kpu input data by cpu
+ *
+ * @param[in] layer Kpu task layer
+ * @param[in] src Image data
+ * @param[in] width Image width
+ * @param[in] height Image height
+ * @param[in] channels Color channel, RGB is 3
+ *
+ */
+void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels);
+
+/**
+ * @brief Kpu run only one layer
+ *
+ * @param[in] layer Kpu task layer
+ *
+ */
+void kpu_conv2d(kpu_layer_argument_t *layer);
+
+/**
+ * @brief Kpu run only one layer then get the result by dma
+ *
+ * @param[in] layer Kpu task layer
+ * @param[in] dma_ch Dmac channel
+ * @param[in] dest Result
+ * @param[in] callback Dmac complete callback
+ * @param[in] userdata Data of callback
+ *
+ */
+void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata);
+
+/**
+ * @brief Kpu pooling
+ *
+ * @param[in] src Source
+ * @param[in] src_param Source param
+ * @param[in] kernel_size Kernel size, 7*7 is 49
+ * @param[in] channels Channels
+ * @param[in] dest Dest
+ * @param[in] dest_param Dest param
+ *
+ */
+void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param);
+
+/**
+ * @brief Kpu pooling
+ *
+ * @param[in] src Source
+ * @param[in] src_param Source param
+ * @param[in] kernel_size Kernel size, 7*7 is 49
+ * @param[in] channels Channels
+ * @param[in] dest Dest
+ *
+ */
+void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest);
+
+/**
+ * @brief Kpu fully connected by cpu
+ *
+ * @param[in] src Source
+ * @param[in] weights Weight
+ * @param[in] biases Biases
+ * @param[in] dest Dest
+ * @param[in] input_channels Input channels
+ * @param[in] output_channels Output channels
+ *
+ */
+void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels);
+
+/**
+ * @brief Kpu matrix multiplication
+ *
+ * @param[in] src Source
+ * @param[in] channels Channels
+ * @param[in] dest Dest
+ * @param[in] dest_param Dest param
+ *
+ */
+void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param);
+
+/**
+ * @brief Kpu dequantize
+ *
+ * @param[in] src Source
+ * @param[in] src_param Source param
+ * @param[in] count Dequantize count
+ * @param[in] dest Dest
+ *
+ */
+void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest);
+
+/**
+ * @brief Kpu load kmodel
+ *
+ * @param[in] ctx Kmodel object
+ * @param[in] buffer Kmodel buffer
+ *
+ * @return result
+ *     - 0 Success
+ *     - Other Fail.
+ */
+int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer);
+
+/**
+ * @brief Kpu free kmodel buffer
+ *
+ * @param[in] ctx kmodel object
+ *
+ */
+void kpu_model_free(kpu_model_context_t *ctx);
+
+/**
+ * @brief Get kmodel output
+ *
+ * @param[in] ctx Kmodel object
+ * @param[in] index Output index
+ * @param[in] data Output data
+ * @param[in] size Output data size
+ *
+ * @return result
+ *     - 0 Success
+ *     - Other Fail.
+ */
+int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size);
+
+/**
+ * @brief Kpu run kmodel
+ *
+ * @param[in] ctx Kmodel object
+ * @param[in] src Source data
+ * @param[in] dma_ch Dma channel
+ * @param[in] done_callback Kpu complete callback
+ * @param[in] userdata Data of callback
+ *
+ * @return result
+ *     - 0 Success
+ *     - Other Fail.
+ */
+int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata);
+
+#endif
diff --git a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/Kconfig b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/Kconfig
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/Kconfig
@@ -0,0 +1 @@
+
diff --git a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/Makefile b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/Makefile
new file mode 100644
index 000000000..cedabd673
--- /dev/null
+++ b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/Makefile
@@ -0,0 +1,4 @@
+SRC_FILES := kpu.c
+
+
+include $(KERNEL_ROOT)/compiler.mk
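Before the kpu.c implementation below, a minimal sketch of how the kmodel half of the header above is meant to be driven (kpu_load_kmodel, kpu_run_kmodel, kpu_get_output, kpu_model_free). The model/input buffers, the busy-wait, and the DMA channel choice are illustrative assumptions, not part of this patch:

    #include "kpu.h"

    static volatile int g_ai_done;

    /* Completion callback invoked on the KPU/DMA done path. */
    static void ai_done(void *userdata)
    {
        g_ai_done = 1;
    }

    /* Run one inference over a kmodel blob already resident in memory. */
    int kmodel_run_once(const uint8_t *model_data, const uint8_t *input)
    {
        kpu_model_context_t ctx;
        uint8_t *output;
        size_t output_size;

        if (kpu_load_kmodel(&ctx, model_data) != 0)
            return -1;

        g_ai_done = 0;
        /* DMAC_CHANNEL5 is an assumption; any free DMA channel works. */
        if (kpu_run_kmodel(&ctx, input, DMAC_CHANNEL5, ai_done, NULL) != 0)
            return -1;

        while (!g_ai_done)
            ;  /* in real code, block on a semaphore signalled by ai_done() */

        kpu_get_output(&ctx, 0, &output, &output_size);
        /* ... consume output[0 .. output_size) ... */
        kpu_model_free(&ctx);
        return 0;
    }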
diff --git a/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/kpu.c b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/kpu.c
new file mode 100644
index 000000000..4926c8d6a
--- /dev/null
+++ b/Ubiquitous/XiZi_IIoT/board/edu-riscv64/third_party_driver/kpu/kpu.c
@@ -0,0 +1,1634 @@
+#include "kpu.h"
+#include
+#include
+#include
+#include
+#include
+#include "printf.h"
+#include "dmac.h"
+#include
+#include "bsp.h"
+#include
+#include
+
+#define LAYER_BURST_SIZE 12
+
+#define KPU_DEBUG 0
+#define USE_CACHED_AI_RAM 0
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+#define ALIGN_UP(x, align) ((x + (align - 1)) & (~(align - 1)))
+
+static int ai_step(void *userdata);
+static int kpu_kmodel_done(kpu_model_context_t *ctx);
+
+volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
+static volatile uint32_t kpu_status;
+
+typedef struct kpu_context
+{
+    kpu_task_t kpu_task;
+    uint32_t kpu_status;
+} kpu_context_t;
+
+volatile kpu_context_t g_kpu_context;
+
+static int kpu_run_all_done(void* _task)
+{
+    atomic_swap(&g_kpu_context.kpu_status, 0);
+    kpu_task_t* task = (kpu_task_t*)_task;
+    task->callback(task);
+    return 0;
+}
+
+int kpu_continue(void* _task)
+{
+    kpu_task_t* task = (kpu_task_t*)_task;
+    int layer_burst_size = 1;
+
+    kpu->interrupt_clear.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int=1,
+        .layer_cfg_almost_empty_int=1,
+        .layer_cfg_almost_full_int=1
+    };
+
+    if(task->remain_layers_length == 0)
+    {
+        return 0;
+    }
+    if(task->remain_layers_length <= layer_burst_size)
+    {
+        for(uint32_t i=0; i<task->remain_layers_length; i++)
+        {
+            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
+        }
+        task->remain_layers_length = 0;
+    }
+    else
+    {
+        for(uint32_t i=0; i<layer_burst_size; i++)
+        {
+            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
+        }
+        task->remain_layers += layer_burst_size;
+        task->remain_layers_length -= layer_burst_size;
+    }
+    return 0;
+}
+
+static int kpu_run_dma_output(uint32_t dma_ch, void* dst, uint32_t length, plic_irq_callback_t cb, void* _task)
+{
+    sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
+    dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);
+    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length+7)/8);
+    return 0;
+}
+
+static int kpu_run_dma_input_done_push_layers(void* _task)
+{
+    kpu_task_t* task = (kpu_task_t*)_task;
+    kpu->interrupt_clear.reg = 7;
+    dmac->channel[task->dma_ch].intclear = 0xFFFFFFFF;
+    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t)
+    {
+        .fifo_full_threshold = 10, .fifo_empty_threshold=1
+    };
+    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t)
+    {
+        .eight_bit_mode=task->eight_bit_mode
+    };
+
+    kpu_layer_argument_t* last_layer = &task->layers[task->layers_length-1];
+
+    kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte+1, kpu_run_all_done, task);
+
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int=0,
+        .layer_cfg_almost_empty_int=0,
+        .layer_cfg_almost_full_int=1
+    };
+    kpu_continue(task);
+    return 0;
+}
+
+static void kpu_run_dma_input(uint32_t dma_ch, const void* src, plic_irq_callback_t cb, void* _task)
+{
+    kpu_task_t* task = _task;
+    kpu_layer_argument_t* first_layer = &task->layers[0];
+    uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num+1);
+    void *v_src = ((uintptr_t)src > 0x80000000 && (uintptr_t)src < 0x80600000) ? (void *)(src - 0x40000000) : (void *)src;
+    dmac_irq_register(dma_ch, cb, _task, 1);
+    dmac_set_single_mode(dma_ch, (void *)v_src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
+        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
+}
+
+int kpu_run(kpu_task_t* v_task, dmac_channel_number_t dma_ch, const void *src, void* dest, plic_irq_callback_t callback)
+{
+    if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
+        return -1;
+
+    memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
+    kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;
+
+    kpu_layer_argument_t* last_layer = &task->layers[task->layers_length-1];
+
+    uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte+1;
+
+    last_layer->dma_parameter.data.send_data_out = 1;
+    last_layer->interrupt_enabe.data.int_en = 1;
+
+    task->dma_ch = dma_ch;
+    task->dst = dest;
+    task->dst_length = output_size;
+    task->callback = callback;
+    task->remain_layers_length = task->layers_length;
+    task->remain_layers = task->layers;
+
+    plic_irq_enable(IRQN_AI_INTERRUPT);
+    plic_set_priority(IRQN_AI_INTERRUPT, 1);
+    plic_irq_register(IRQN_AI_INTERRUPT, kpu_continue, task);
+
+    kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);
+
+    return 0;
+}
+
+uint8_t *kpu_get_output_buf(kpu_task_t* task)
+{
+    kpu_layer_argument_t* last_layer = &task->layers[task->layers_length-1];
+    size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte+1) + 7) / 8 * 8;
+    return malloc(output_size);
+}
+
+void kpu_release_output_buf(uint8_t *output_buf)
+{
+    if(output_buf != NULL)
+        free(output_buf);
+}
+
+static int kpu_done(void *ctx)
+{
+    atomic_swap(&kpu_status, 0);
+    kpu_task_t *task = (kpu_task_t *)ctx;
+    task->callback(task->ctx);
+    return 0;
+}
+
+static int kpu_config_input(void *ctx)
+{
+    kpu_task_t *task = (kpu_task_t *)ctx;
+    kpu->interrupt_clear.reg = 7;
+    if (task->remain_layers_length <= LAYER_BURST_SIZE)
+    {
+        for (uint32_t i = 0; i < task->remain_layers_length; i++)
+        {
+            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
+        }
+        task->remain_layers_length = 0;
+        kpu->interrupt_mask.reg = 7;
+    }
+    else
+    {
+        for (uint32_t i = 0; i < LAYER_BURST_SIZE; i++)
+        {
+            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
+            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
+        }
+        task->remain_layers += LAYER_BURST_SIZE;
+        task->remain_layers_length -= LAYER_BURST_SIZE;
+    }
+    return 0;
+}
+
+static void kpu_data_output(kpu_task_t *task)
+{
+    sysctl_dma_select(task->dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
+    dmac_irq_register(task->dma_ch, kpu_done, task, 1);
+    dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);
+}
+
+static int kpu_data_ready(void *ctx)
+{
+    kpu_task_t *task = (kpu_task_t *)ctx;
+
+    dmac->channel[task->dma_ch].intclear = 0xFFFFFFFF;
+    kpu_data_output(task);
+
+    kpu->eight_bit_mode.reg = task->eight_bit_mode;
+    kpu->interrupt_mask.reg = 7;
+    kpu->interrupt_clear.reg = 7;
+    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t)
+    {
+        .fifo_full_threshold = 12, .fifo_empty_threshold = 1
+    };
+    plic_irq_enable(IRQN_AI_INTERRUPT);
+    plic_set_priority(IRQN_AI_INTERRUPT, 2);
+    plic_irq_register(IRQN_AI_INTERRUPT, kpu_config_input, task);
+    kpu_config_input(task);
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 0,
+        .layer_cfg_almost_full_int = 1
+    };
+    return 0;
+}
+static void kpu_data_input(kpu_task_t *task)
+{
+    if (task->src == NULL)
+    {
+        kpu_data_ready(task);
+        return;
+    }
+    void *v_src = ((uintptr_t)task->src > 0x80000000 && (uintptr_t)task->src < 0x80600000) ? (void *)((void *)task->src - 0x40000000) : (void *)task->src;
+    dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);
+    kpu_layer_argument_t *layer = &task->layers[0];
+    dmac_set_single_mode(task->dma_ch, v_src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
+        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);
+}
+
+int kpu_single_task_init(kpu_task_t *task)
+{
+    sysctl_clock_enable(SYSCTL_CLOCK_AI);
+    kpu_layer_argument_t *first_layer = &task->layers[0];
+    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
+
+    last_layer->dma_parameter.data.send_data_out = 1;
+    last_layer->interrupt_enabe.data.int_en = 1;
+    task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;
+    task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;
+    task->dst = (uint64_t *)malloc(task->dst_length * 8);
+    if (task->dst == NULL)
+        return 1;
+    memset(task->dst, 0, task->dst_length * 8);
+    return 0;
+}
+
+int kpu_single_task_deinit(kpu_task_t *task)
+{
+    free(task->dst);
+    return 0;
+}
+
+int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)
+{
+    uintptr_t base_addr = (uintptr_t)buffer;
+    kpu_model_header_t *header = (kpu_model_header_t *)buffer;
+    kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));
+    kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);
+
+    if (header->version != 1)
+        return -1;
+    uint32_t layers_length = header->layers_length;
+    task->layers_length = layers_length;
+    task->eight_bit_mode = header->flags & 1;
+    task->layers = layers;
+    task->output_scale = layer_meta[layers_length - 1].output_scale;
+    task->output_bias = layer_meta[layers_length - 1].output_bias;
+    size_t i;
+    for (i = 0; i < layers_length; i++)
+    {
+        layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);
+        layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);
+        layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);
+    }
+
+    if (meta)
+        *meta = layer_meta;
+    return 0;
+}
+
+int kpu_start(kpu_task_t *task)
+{
+    if (atomic_cas(&kpu_status, 0, 1))
+        return -1;
+
+    task->remain_layers_length = task->layers_length;
+    task->remain_layers = task->layers;
+    kpu_data_input(task);
+    return 0;
+}
+
+static void kpu_send_layer(const kpu_layer_argument_t *layer)
+{
+    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
+    kpu->layer_argument_fifo = layer->image_addr.reg;
+    kpu->layer_argument_fifo = layer->image_channel_num.reg;
+    kpu->layer_argument_fifo = layer->image_size.reg;
+    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
+    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
+    kpu->layer_argument_fifo = layer->kernel_offset.reg;
+    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
+    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
+    kpu->layer_argument_fifo = layer->conv_value.reg;
+    kpu->layer_argument_fifo = layer->conv_value2.reg;
+    kpu->layer_argument_fifo = layer->dma_parameter.reg;
+}
+void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
+{
+    kpu->interrupt_clear.reg = 7;
+    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t)
+    {
+        .fifo_full_threshold = 10, .fifo_empty_threshold = 1
+    };
+    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t)
+    {
+        .eight_bit_mode = eight_bit_mode
+    };
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 0,
+        .layer_cfg_almost_full_int = 1
+    };
+
+    plic_irq_enable(IRQN_AI_INTERRUPT);
+    plic_set_priority(IRQN_AI_INTERRUPT, 1);
+    plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
+}
+
+#if 0
+void kpu_input_dma(kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
+{
+    uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
+    dmac_set_irq(dma_ch, callback, userdata, 1);
+    dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
+        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
+}
+#endif
+
+void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
+{
+    uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
+    void *v_src = ((uintptr_t)src > 0x80000000 && (uintptr_t)src < 0x80600000) ? (void *)(src - 0x40000000) : (void *)src;
+    dmac_set_irq(dma_ch, callback, userdata, 1);
+    dmac_set_single_mode(dma_ch, (void *)v_src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
+        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
+}
+
+static void kpu_conv2d_core(kpu_layer_argument_t *layer)
+{
+    kpu_send_layer(layer);
+}
+
+void kpu_conv2d(kpu_layer_argument_t *layer)
+{
+    kpu->interrupt_clear.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1
+    };
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 0,
+        .layer_cfg_almost_full_int = 1
+    };
+    kpu_conv2d_core(layer);
+}
+
+void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
+{
+    kpu->interrupt_clear.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1
+    };
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1
+    };
+    layer->dma_parameter.data.send_data_out = 1;
+    sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
+    dmac_set_irq(dma_ch, callback, userdata, 1);
+    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8);
+    kpu_conv2d_core(layer);
+}
+
+void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata)
+{
+    uint32_t channels = layer->image_channel_num.data.o_ch_num + 1;
+    layer->interrupt_enabe.data.full_add = 1;
+
+    kpu->interrupt_clear.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1
+    };
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1
+    };
+    layer->dma_parameter.data.send_data_out = 1;
+    sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
+    dmac_set_irq(dma_ch, callback, userdata, 1);
+    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels);
+    kpu_conv2d_core(layer);
+}
+
+void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param)
+{
+    quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
+
+    size_t i;
+    for (i = 0; i < count; i++)
+    {
+        int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
+        if (value < 0) value = 0;
+        if (value > 0xFF) value = 0xFF;
+        *dest++ = value;
+    }
+}
+
+void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
+{
+    quantize_param_t q1 = *src_param, q2 = *dest_param;
+    size_t oc, y, x;
+
+    if (((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
+    {
+        uint32_t row_padding = 16;
+        uint32_t row_group = 4;
+        uint32_t row_length = 1;
+        uint32_t height = 4;
+
+        for (oc = 0; oc < channels; oc++)
+        {
+            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+            for (y = 0; y < 1; y++)
+            {
+                uint8_t *y_origin = channel_origin + y * row_length * 64;
+                for (x = 0; x < 1; x++)
+                {
+                    int64_t sum = 0;
+                    size_t i;
+                    for (i = 0; i < kernel_size; i++)
+                        sum += *src++;
+
+                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
+                    if (value < 0) value = 0;
+                    if (value > 0xFF) value = 0xFF;
+                    y_origin[x] = value;
+                }
+            }
+        }
+    }
+    else
+    {
+        for (oc = 0; oc < channels; oc++)
+        {
+            int64_t sum = 0;
+            size_t i;
+            for (i = 0; i < kernel_size; i++)
+                sum += *src++;
+
+            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
+            if (value < 0) value = 0;
+            if (value > 0xFF) value = 0xFF;
+            dest[oc] = value;
+        }
+    }
+}
+
+void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
+{
+    quantize_param_t q = *src_param;
+    size_t oc;
+
+    for (oc = 0; oc < channels; oc++)
+    {
+        int64_t sum = 0;
+        size_t i;
+        for (i = 0; i < kernel_size; i++)
+            sum += *src++;
+
+        float value = (sum * q.scale + q.bias) / kernel_size;
+        dest[oc] = value;
+    }
+}
+
+void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param)
+{
+    quantize_param_t q1 = *dest_param;
+    size_t i = 0;
+    for (i = 0; i < channels; i++)
+        *dest++ = src[i * 16] * q1.scale + q1.bias;
+}
+
+void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
+{
+    int ic, oc;
+    for (oc = 0; oc < output_channels; oc++)
+    {
+        const float *c_weights = weights + oc * input_channels;
+
+        float sum = 0.0f;
+        for (ic = 0; ic < input_channels; ic++)
+            sum += src[ic] * c_weights[ic];
+        dest[oc] = sum + biases[oc];
+    }
+}
+
+void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest)
+{
+    quantize_param_t q1 = *src_param;
+    size_t i = 0;
+    for (i = 0; i < count; i++)
+        *dest++ = src[i] * q1.scale + q1.bias;
+}
+
+
+void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels)
+{
+    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64);
+    size_t oc, y, x;
+
+    uint32_t row_padding;
+    uint32_t row_group;
+    uint32_t row_length;
+
+    if (width <= 16)
+    {
+        row_padding = 16;
+        row_group = 4;
+        row_length = 1;
+    }
+    else if (width <= 32)
+    {
+        row_padding = 32;
+        row_group = 2;
+        row_length = 1;
+    }
+    else
+    {
+        row_padding = 64;
+        row_group = 1;
+        row_length = (width + 63) / 64;
+    }
+
+    for (oc = 0; oc < channels; oc++)
+    {
+        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+        for (y = 0; y < height; y++)
+        {
+            uint8_t *y_origin = channel_origin + y * row_length * 64;
+            for (x = 0; x < width; x++)
+                y_origin[x] = *src++;
+        }
+    }
+}
+#if USE_CACHED_AI_RAM
+static void kpu_flush_cache(uint32_t addr, size_t lines)
+{
+    size_t line;
+    for (line = 0; line < lines; line++)
+    {
+        const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
+        uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
+        size_t i;
+        for (i = 0; i < 8; i++)
+            dest[i] = src[i];
+    }
+}
+#endif
+static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
+{
+    if (shift > 0)
+    {
+        value >>= shift - 1;
+        if (value & 0x1)
+        {
+            if (value < 0)
+                value = (value >> 1) - 1;
+            else
+                value = (value >> 1) + 1;
+        }
+        else
+        {
+            value >>= 1;
+        }
+    }
+
+    return value;
+}
+static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
+{
+    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
+    size_t oc, y, x;
+    uint32_t row_padding;
+    uint32_t row_group;
+    uint32_t row_length;
+    if (width <= 16)
+    {
+        row_padding = 16;
+        row_group = 4;
+        row_length = 1;
+    }
+    else if (width <= 32)
+    {
+        row_padding = 32;
+        row_group = 2;
+        row_length = 1;
+    }
+    else
+    {
+        row_padding = 64;
+        row_group = 1;
+        row_length = (width + 63) / 64;
+    }
+
+    if ((uintptr_t)src % 8 == 0 && width % 8 == 0)
+    {
+#define UPLOAD_BEGIN() \
+    for (oc = 0; oc < channels; oc++) \
+    { \
+        uint8_t* channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
+        for (y = 0; y < height; y++) \
+        { \
+            uint64_t* y_origin = (uint64_t*)(channel_origin + y * row_length * 64); \
+
+#define UPLOAD_END() \
+        } \
+    }
+
+        width /= 8;
+        const uint64_t *u64_src = (const uint64_t *)src;
+        if (width == 1)
+        {
+            UPLOAD_BEGIN()
+            y_origin[0] = *u64_src++;
+            UPLOAD_END()
+        }
+        else if (width == 2)
+        {
+            UPLOAD_BEGIN()
+            {
+                y_origin[0] = *u64_src++;
+                y_origin[1] = *u64_src++;
+            }
+            UPLOAD_END()
+        }
+        else if (width == 4)
+        {
+            UPLOAD_BEGIN()
+            {
+                y_origin[0] = *u64_src++;
+                y_origin[1] = *u64_src++;
+                y_origin[2] = *u64_src++;
+                y_origin[3] = *u64_src++;
+            }
+            UPLOAD_END()
+        }
+        else
+        {
+            UPLOAD_BEGIN()
+            for (x = 0; x < width; x++)
+                y_origin[x] = *u64_src++;
+            UPLOAD_END()
+        }
+    }
+    else
+    {
+        for (oc = 0; oc < channels; oc++)
+        {
+            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+            for (y = 0; y < height; y++)
+            {
+                uint8_t *y_origin = channel_origin + y * row_length * 64;
+                for (x = 0; x < width; x++)
+                    y_origin[x] = *src++;
+            }
+        }
+    }
+}
+static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
+{
+    size_t width = layer->image_size.data.i_row_wid + 1;
+    size_t height = layer->image_size.data.i_col_high + 1;
+    size_t channels = layer->image_channel_num.data.i_ch_num + 1;
+
+    kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
+}
+
+static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
+    const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    size_t i, count = arg->count;
+
+    for (i = 0; i < count; i++)
+        dest[i] = src_a[i] + src_b[i];
+}
+
+static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const uint8_t *src_a = (const uint8_t*)(ctx->main_buffer + arg->main_mem_in_a_address);
+    const uint8_t *src_b = (const uint8_t*)(ctx->main_buffer + arg->main_mem_in_b_address);
+    size_t count = ALIGN_UP(arg->count, 8) / 8;
+    int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
+    int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
+    int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;
+
+    uint8_t* dest = (uint8_t*)(ctx->main_buffer + arg->main_mem_out_address);
+    size_t i;
+
+    if (sh_a == sh_b)
+    {
+#define QADD_UNROLL_1(x) \
+    int64_t a##x = *src_a++; \
+    int64_t b##x = *src_b++;
+
+#define QADD_UNROLL_2(x) \
+    a##x += off_a; \
+    b##x += off_b;
+
+#define QADD_UNROLL_3(x) \
+    a##x *= mul_a; \
+    b##x *= mul_b;
+
+#define QADD_UNROLL_4(x) \
+    int64_t v##x = a##x + b##x;
+
+#define QADD_UNROLL_5(x) \
+    v##x >>= sh_a;
+
+#define QADD_UNROLL_6(x) \
+    v##x *= mul_o;
+
+#define QADD_UNROLL_7(x) \
+    v##x = kpu_carry_shift(v##x, sh_o);
+
+#define QADD_UNROLL_8(x) \
+    v##x += off_o;
+
+#define QADD_UNROLL_9(x) \
+    v##x = min(0xFF, max(0, v##x));
+
+#define QADD_UNROLL_10(x) \
+    *dest++ = v##x;
+
+#define QADD_UNROLL_S(x) \
+    QADD_UNROLL_##x(0) \
+    QADD_UNROLL_##x(1) \
+    QADD_UNROLL_##x(2) \
+    QADD_UNROLL_##x(3) \
+    QADD_UNROLL_##x(4) \
+    QADD_UNROLL_##x(5) \
+    QADD_UNROLL_##x(6) \
+    QADD_UNROLL_##x(7)
+
+        for (i = 0; i < count; i++)
+        {
+            QADD_UNROLL_S(1);
+            QADD_UNROLL_S(2);
+            QADD_UNROLL_S(3);
+            QADD_UNROLL_S(4);
+            QADD_UNROLL_S(5);
+            QADD_UNROLL_S(6);
+            QADD_UNROLL_S(7);
+            QADD_UNROLL_S(8);
+            QADD_UNROLL_S(9);
+            QADD_UNROLL_S(10);
+        }
+    }
+    else
+    {
+#undef QADD_UNROLL_1
+#define QADD_UNROLL_1(x) \
+    int64_t a##x = *src_a++; \
+    int64_t b##x = *src_b++;
+
+#undef QADD_UNROLL_2
+#define QADD_UNROLL_2(x) \
+    a##x += off_a; \
+    b##x += off_b;
+
+#undef QADD_UNROLL_3
+#define QADD_UNROLL_3(x) \
+    a##x *= mul_a; \
+    b##x *= mul_b;
+
+#undef QADD_UNROLL_4
+#define QADD_UNROLL_4(x) \
+    a##x >>= sh_a; \
+    b##x >>= sh_b;
+
+#undef QADD_UNROLL_5
+#define QADD_UNROLL_5(x) \
+    int64_t v##x = a##x + b##x;
+
+#undef QADD_UNROLL_6
+#define QADD_UNROLL_6(x) \
+    v##x *= mul_o;
+
+#undef QADD_UNROLL_7
+#define QADD_UNROLL_7(x) \
+    v##x >>= sh_o;
+
+#undef QADD_UNROLL_8
+#define QADD_UNROLL_8(x) \
+    v##x += off_o;
+
+#undef QADD_UNROLL_9
+#define QADD_UNROLL_9(x) \
+    v##x = min(0xFF, max(0, v##x));
+
+#undef QADD_UNROLL_10
+#define QADD_UNROLL_10(x) \
+    *dest++ = v##x;
+
+#undef QADD_UNROLL_S
+#define QADD_UNROLL_S(x) \
+    QADD_UNROLL_##x(0) \
+    QADD_UNROLL_##x(1) \
+    QADD_UNROLL_##x(2) \
+    QADD_UNROLL_##x(3) \
+    QADD_UNROLL_##x(4) \
+    QADD_UNROLL_##x(5) \
+    QADD_UNROLL_##x(6) \
+    QADD_UNROLL_##x(7)
+
+        for (i = 0; i < count; i++)
+        {
+            QADD_UNROLL_S(1);
+            QADD_UNROLL_S(2);
+            QADD_UNROLL_S(3);
+            QADD_UNROLL_S(4);
+            QADD_UNROLL_S(5);
+            QADD_UNROLL_S(6);
+            QADD_UNROLL_S(7);
+            QADD_UNROLL_S(8);
+            QADD_UNROLL_S(9);
+            QADD_UNROLL_S(10);
+        }
+    }
+}
+
+static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
+
+    for (oc = 0; oc < channels; oc++)
+    {
+        float sum = 0.f;
+        size_t i;
+        for (i = 0; i < kernel_size; i++)
+            sum += *src++;
+
+        dest[oc] = sum / kernel_size;
+    }
+}
+
+static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
+    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
+    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
+    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
+    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
+
+    uint32_t out_y, out_x, oc;
+
+    for (oc = 0; oc < out_shape.channels; oc++)
+    {
+        const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
+        for (out_y = 0; out_y < out_shape.height; out_y++)
+        {
+            for (out_x = 0; out_x < out_shape.width; out_x++)
+            {
+                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
+                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
+                int32_t kernel_x_start = max(0, -in_x_origin);
+                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
+                int32_t kernel_y_start = max(0, -in_y_origin);
+                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
+                uint8_t value = 0;
+
+                int32_t kernel_y, kernel_x;
+                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
+                {
+                    for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
+                    {
+                        int32_t in_x = in_x_origin + kernel_x;
+                        int32_t in_y = in_y_origin + kernel_y;
+                        value = max(value, channel_src[in_y * in_shape.width + in_x]);
+                    }
+                }
+
+                *dest++ = value;
+            }
+        }
+    }
+}
+
+static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
+    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
+    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
+    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
+
+    uint32_t out_y, out_x, oc;
+
+    for (oc = 0; oc < out_shape.channels; oc++)
+    {
+        const float *channel_src = src + in_shape.width * in_shape.height * oc;
+        for (out_y = 0; out_y < out_shape.height; out_y++)
+        {
+            for (out_x = 0; out_x < out_shape.width; out_x++)
+            {
+                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
+                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
+                int32_t kernel_x_start = max(0, -in_x_origin);
+                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
+                int32_t kernel_y_start = max(0, -in_y_origin);
+                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
+                float value = 0;
+                float kernel_count = 0;
+
+                int32_t kernel_y, kernel_x;
+                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
+static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
+    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
+    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
+    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
+
+    uint32_t out_y, out_x, oc;
+
+    for (oc = 0; oc < out_shape.channels; oc++)
+    {
+        const float *channel_src = src + in_shape.width * in_shape.height * oc;
+        for (out_y = 0; out_y < out_shape.height; out_y++)
+        {
+            for (out_x = 0; out_x < out_shape.width; out_x++)
+            {
+                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
+                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
+                int32_t kernel_x_start = max(0, -in_x_origin);
+                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
+                int32_t kernel_y_start = max(0, -in_y_origin);
+                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
+                float value = 0;
+                float kernel_count = 0;
+
+                int32_t kernel_y, kernel_x;
+                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
+                {
+                    for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
+                    {
+                        int32_t in_x = in_x_origin + kernel_x;
+                        int32_t in_y = in_y_origin + kernel_y;
+                        value += channel_src[in_y * in_shape.width + in_x];
+                        kernel_count++;
+                    }
+                }
+
+                *dest++ = value / kernel_count;
+            }
+        }
+    }
+}
+
+/* Affine quantization to u8: q = clamp((f - bias) / scale, 0, 255). */
+static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    size_t count = arg->count;
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    const kpu_model_quant_param_t q = arg->quant_param;
+    float scale = 1.f / q.scale;
+
+    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
+    size_t i;
+    for (i = 0; i < count; i++)
+    {
+        int value = (*src++ - q.bias) * scale;
+        if (value < 0) value = 0;
+        if (value > 0xFF) value = 0xFF;
+        *dest++ = (uint8_t)value;
+    }
+}
+
+/* Inverse affine mapping back to float: f = q * scale + bias. */
+static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    size_t oc, count = arg->count;
+    const kpu_model_quant_param_t q = arg->quant_param;
+
+    for (oc = 0; oc < count; oc++)
+        dest[oc] = *src++ * q.scale + q.bias;
+}
+
+static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    size_t oc, i, channels = arg->channels, count = arg->channel_size;
+
+    for (oc = 0; oc < channels; oc++)
+    {
+        const kpu_model_quant_param_t q = arg->quant_params[oc];
+
+        for (i = 0; i < count; i++)
+            *dest++ = *src++ * q.scale + q.bias;
+    }
+}
+
+/* Requantization through a 256-entry lookup table; the element count is
+ * rounded up to a multiple of 8 so the 8-way unrolled loop covers every
+ * element (oc indexes elements, not iterations). */
+static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
+    size_t oc, count = ALIGN_UP(arg->count, 8);
+    const uint8_t *table = arg->table;
+
+    for (oc = 0; oc < count;)
+    {
+        dest[oc++] = table[*src++];
+        dest[oc++] = table[*src++];
+        dest[oc++] = table[*src++];
+        dest[oc++] = table[*src++];
+        dest[oc++] = table[*src++];
+        dest[oc++] = table[*src++];
+        dest[oc++] = table[*src++];
+        dest[oc++] = table[*src++];
+    }
+}
+
+static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    size_t oc, channels = arg->channels;
+
+    float sum = 0.f;
+    const float epsilon = 1e-10f;
+    for (oc = 0; oc < channels; oc++)
+        sum += src[oc] * src[oc];
+    if (sum < epsilon)
+        sum = epsilon;
+    sum = 1.f / sqrtf(sum);
+    for (oc = 0; oc < channels; oc++)
+        dest[oc] = src[oc] * sum;
+}
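+
+/* Numerically stable softmax: the running maximum is subtracted before
+ * exponentiation so expf() cannot overflow; it is seeded with -FLT_MAX
+ * because FLT_MIN is the smallest positive float, not the most negative. */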
+static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    size_t oc, channels = arg->channels;
+
+    float max = -FLT_MAX;
+    for (oc = 0; oc < channels; oc++)
+        max = fmaxf(max, src[oc]);
+
+    float sum = 0.f;
+    for (oc = 0; oc < channels; oc++)
+    {
+        float value = expf(src[oc] - max);
+        sum += value;
+        dest[oc] = value;
+    }
+
+    for (oc = 0; oc < channels; oc++)
+        dest[oc] /= sum;
+}
+
+static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
+    uint32_t count = arg->input_count, i;
+
+    for (i = 0; i < count; i++)
+    {
+        kpu_model_memory_range_t input = arg->inputs_mem[i];
+        const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
+        memcpy(dest, src, input.size);
+        dest += input.size;
+    }
+}
+
+static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
+    const float *weights = arg->weights, *bias = arg->weights + in_channels * out_channels;
+
+    if (in_channels % 8 == 0)
+    {
+#define FC_UNROLL_1(x) \
+    float i##x = *c_src++; \
+    float w##x = *c_weights++;
+
+#define FC_UNROLL_2(x) \
+    sum += i##x * w##x;
+
+#define FC_UNROLL_S(x) \
+    FC_UNROLL_##x(0) \
+    FC_UNROLL_##x(1) \
+    FC_UNROLL_##x(2) \
+    FC_UNROLL_##x(3) \
+    FC_UNROLL_##x(4) \
+    FC_UNROLL_##x(5) \
+    FC_UNROLL_##x(6) \
+    FC_UNROLL_##x(7)
+
+        for (oc = 0; oc < out_channels; oc++)
+        {
+            const float *c_src = src;
+            const float *c_weights = weights + oc * in_channels;
+
+            float sum = 0.0f;
+            for (ic = 0; ic < in_channels / 8; ic++)
+            {
+                FC_UNROLL_S(1);
+                FC_UNROLL_S(2);
+            }
+
+            dest[oc] = sum + bias[oc];
+        }
+    }
+    else
+    {
+        for (oc = 0; oc < out_channels; oc++)
+        {
+            const float *c_weights = weights + oc * in_channels;
+
+            float sum = 0.0f;
+            for (ic = 0; ic < in_channels; ic++)
+                sum += src[ic] * c_weights[ic];
+            dest[oc] = sum + bias[oc];
+        }
+    }
+}
+
+static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    kpu_model_shape_t in_shape = arg->shape;
+    uint32_t oc, oy, ox;
+
+    for (oy = 0; oy < in_shape.height; oy++)
+        for (ox = 0; ox < in_shape.width; ox++)
+            for (oc = 0; oc < in_shape.channels; oc++)
+                *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
+}
+
+static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+    kpu_model_shape_t in_shape = arg->in_shape;
+    uint32_t out_width = arg->out_width, out_height = arg->out_height;
+    uint32_t oc, oy, ox;
+
+    float height_scale = (float)in_shape.height / out_height;
+    float width_scale = (float)in_shape.width / out_width;
+
+    for (oc = 0; oc < in_shape.channels; oc++)
+    {
+        const float *channel_src = src + in_shape.width * in_shape.height * oc;
+        for (oy = 0; oy < out_height; oy++)
+        {
+            const float *y_origin = channel_src + (uint32_t)(oy * height_scale) * in_shape.width;
+            for (ox = 0; ox < out_width; ox++)
+                *dest++ = y_origin[(uint32_t)(ox * width_scale)];
+        }
+    }
+}
+
+static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    volatile kpu_layer_argument_t layer = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
+    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset);
+    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset);
+    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset);
+
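+    /* Two completion paths: with KLF_MAIN_MEM_OUT the result is drained from
+     * the KPU FIFO to main memory by DMA and the DMA-finish IRQ chains the
+     * next layer; otherwise the KPU calc-done interrupt drives the chain. */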
+    if (arg->flags & KLF_MAIN_MEM_OUT)
+    {
+        dmac_channel_number_t dma_ch = ctx->dma_ch;
+        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
+        kpu->interrupt_clear.data = (kpu_config_interrupt_t)
+        {
+            .calc_done_int = 1,
+            .layer_cfg_almost_empty_int = 1,
+            .layer_cfg_almost_full_int = 1
+        };
+        kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+        {
+            .calc_done_int = 1,
+            .layer_cfg_almost_empty_int = 1,
+            .layer_cfg_almost_full_int = 1
+        };
+        layer.dma_parameter.data.send_data_out = 1;
+        sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
+        if (ctx->current_layer != ctx->layers_length)
+            dmac_set_irq(dma_ch, ai_step, ctx, 1);
+        else
+            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
+        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
+    }
+    else
+    {
+        kpu->interrupt_clear.data = (kpu_config_interrupt_t)
+        {
+            .calc_done_int = 1,
+            .layer_cfg_almost_empty_int = 1,
+            .layer_cfg_almost_full_int = 1
+        };
+
+        kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+        {
+            .calc_done_int = 0,
+            .layer_cfg_almost_empty_int = 1,
+            .layer_cfg_almost_full_int = 1
+        };
+        layer.interrupt_enabe.data.int_en = 1;
+    }
+
+    kpu_send_layer((const kpu_layer_argument_t *)&layer);
+}
+
+static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+#if USE_CACHED_AI_RAM
+    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
+#else
+    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
+#endif
+
+    uint32_t row_padding = 16;
+    uint32_t row_group = 4;
+    uint32_t row_length = 1;
+    uint32_t height = 4;
+    uint32_t oc, x, y, channels = arg->channels;
+
+    for (oc = 0; oc < channels; oc++)
+    {
+        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+        for (y = 0; y < 1; y++)
+        {
+            uint8_t *y_origin = channel_origin + y * row_length * 64;
+            for (x = 0; x < 1; x++)
+                y_origin[x] = *src++;
+        }
+    }
+
+#if USE_CACHED_AI_RAM
+    uint32_t lines = row_length * height * channels / row_group;
+    kpu_flush_cache(arg->kpu_mem_out_address, lines);
+#endif
+}
+
+static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
+    uint32_t oc, channels = arg->channels;
+
+    for (oc = 0; oc < channels; oc++)
+        *dest++ = src[oc * 16];
+}
+
+static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    size_t width = arg->width;
+    size_t height = arg->height;
+    size_t channels = arg->channels;
+
+    kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
+}
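+
+/* kmodel v3 loader: checks the header (version 3, K210 arch), records the
+ * output and layer tables that follow it, and allocates the scratch buffer
+ * that software layers read from and write to. */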
+int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
+{
+    uintptr_t base_addr = (uintptr_t)buffer;
+    const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
+    printf("\nheader->version:%d,header->arch:%d\n", header->version, header->arch);
+    if (header->version == 3 && header->arch == 0)
+    {
+        ctx->model_buffer = buffer;
+        ctx->output_count = header->output_count;
+        ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
+        ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
+        ctx->layers_length = header->layers_length;
+        ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
+        ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
+        if (!ctx->main_buffer)
+            return -1;
+    }
+    else
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
+{
+    if (index >= ctx->output_count)
+        return -1;
+
+    const kpu_model_output_t *output = ctx->outputs + index;
+    *data = ctx->main_buffer + output->address;
+    *size = output->size;
+    return 0;
+}
+
+void kpu_model_free(kpu_model_context_t *ctx)
+{
+    free(ctx->main_buffer);
+    ctx->main_buffer = NULL;
+}
+
+#if KPU_DEBUG
+static uint64_t last_time;
+static uint64_t total_time;
+static uint64_t kpu_time;
+static uint32_t last_layer_type;
+
+static const char *str_layer_type(uint32_t type)
+{
+    switch (type)
+    {
+    case KL_ADD:
+        return "Add";
+    case KL_QUANTIZED_ADD:
+        return "QuantAdd";
+    case KL_GLOBAL_AVERAGE_POOL2D:
+        return "GAP";
+    case KL_QUANTIZED_MAX_POOL2D:
+        return "QuantMaxPool2d";
+    case KL_AVERAGE_POOL2D:
+        return "AveragePool2d";
+    case KL_QUANTIZE:
+        return "Quantize";
+    case KL_DEQUANTIZE:
+        return "Dequantize";
+    case KL_REQUANTIZE:
+        return "Requantize";
+    case KL_L2_NORMALIZATION:
+        return "L2Norm";
+    case KL_SOFTMAX:
+        return "Softmax";
+    case KL_CONCAT:
+        return "Concat";
+    case KL_QUANTIZED_CONCAT:
+        return "QuantConcat";
+    case KL_FULLY_CONNECTED:
+        return "FullyConnected";
+    case KL_TENSORFLOW_FLATTEN:
+        return "TFFlatten";
+    case KL_RESIZE_NEAREST_NEIGHBOR:
+        return "ResizeNearestNeighbor";
+    case KL_CHANNELWISE_DEQUANTIZE:
+        return "ChannelwiseDequantize";
+    case KL_K210_CONV:
+        return "K210Conv";
+    case KL_K210_ADD_PADDING:
+        return "K210AddPad";
+    case KL_K210_REMOVE_PADDING:
+        return "K210RemovePad";
+    case KL_K210_UPLOAD:
+        return "K210Upload";
+    default:
+        return "Unknown";
+    }
+}
+#endif
+
+static int kpu_kmodel_done(kpu_model_context_t *ctx)
+{
+    kpu->interrupt_clear.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1
+    };
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1
+    };
+#if KPU_DEBUG
+    uint32_t cnt_layer_id = ctx->current_layer - 1;
+    uint64_t time = sysctl_get_time_us();
+    if (last_time != 0)
+    {
+        uint64_t layer_time = time - last_time;
+        printf("layer %d [%s]: %f ms\n", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);
+        total_time += layer_time;
+        if (last_layer_type == KL_K210_CONV)
+            kpu_time += layer_time;
+    }
+
+    printf("KPU: %f ms\n", kpu_time / 1000.0);
+    printf("CPU: %f ms\n", (total_time - kpu_time) / 1000.0);
+    printf("Model: %f ms\n", total_time / 1000.0);
+#endif
+    ctx->done_callback(ctx->userdata);
+    return 0;
+}
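+
+/* Dispatches one layer per call: software layers run inline and fall through
+ * to the next layer, while a K210 convolution is handed to the KPU and the
+ * function returns, to be re-entered from the KPU or DMA interrupt. */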
+static int ai_step(void *userdata)
+{
+    kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;
+
+    uint32_t cnt_layer_id = ctx->current_layer++;
+    const uint8_t *layer_body = ctx->current_body;
+    const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
+    ctx->current_body += cnt_layer_header->body_size;
+
+#if KPU_DEBUG
+    uint64_t time = sysctl_get_time_us();
+    if (last_time != 0)
+    {
+        uint64_t layer_time = time - last_time;
+        printf("layer %d [%s]: %f ms\n", cnt_layer_id - 1, str_layer_type(last_layer_type), layer_time / 1000.0);
+        total_time += layer_time;
+        if (last_layer_type == KL_K210_CONV)
+            kpu_time += layer_time;
+    }
+
+    last_layer_type = cnt_layer_header->type;
+    last_time = sysctl_get_time_us();
+#endif
+
+    switch (cnt_layer_header->type)
+    {
+    case KL_ADD:
+        kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_QUANTIZED_ADD:
+        kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_GLOBAL_AVERAGE_POOL2D:
+        kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_QUANTIZED_MAX_POOL2D:
+        kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_AVERAGE_POOL2D:
+        kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_QUANTIZE:
+        kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_DEQUANTIZE:
+        kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_REQUANTIZE:
+        kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_L2_NORMALIZATION:
+        kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_SOFTMAX:
+        kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_CONCAT:
+    case KL_QUANTIZED_CONCAT:
+        kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_FULLY_CONNECTED:
+        kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_TENSORFLOW_FLATTEN:
+        kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_RESIZE_NEAREST_NEIGHBOR:
+        kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_CHANNELWISE_DEQUANTIZE:
+        kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
+        break;
+    case KL_K210_CONV:
+        kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
+        return 0;
+    case KL_K210_ADD_PADDING:
+        kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_K210_REMOVE_PADDING:
+        kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
+        break;
+    case KL_K210_UPLOAD:
+        kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
+        break;
+    default:
+        assert(!"Layer is not supported.");
+    }
+
+    if (cnt_layer_id != (ctx->layers_length - 1))
+        ai_step(userdata);
+    else
+        kpu_kmodel_done(ctx);
+    return 0;
+}
+
+static void ai_step_not_isr(void *userdata)
+{
+    sysctl_disable_irq();
+    ai_step(userdata);
+    sysctl_enable_irq();
+}
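+
+/* Starts a run: resets per-run state, programs the KPU FIFO thresholds and
+ * interrupt masks, then feeds the first convolution layer either by CPU copy
+ * (when the row width needs padding) or directly via DMA. */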
+int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
+{
+    ctx->dma_ch = dma_ch;
+    ctx->done_callback = done_callback;
+    ctx->userdata = userdata;
+    ctx->current_layer = 0;
+    ctx->current_body = ctx->body_start;
+#if KPU_DEBUG
+    last_time = 0;
+    total_time = 0;
+    kpu_time = 0;
+#endif
+
+    kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
+    kpu->interrupt_clear.reg = 7;
+    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t)
+    {
+        .fifo_full_threshold = 10, .fifo_empty_threshold = 1
+    };
+    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t)
+    {
+        .eight_bit_mode = header->flags & 1
+    };
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
+    {
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 0,
+        .layer_cfg_almost_full_int = 1
+    };
+
+    plic_irq_enable(IRQN_AI_INTERRUPT);
+    plic_set_priority(IRQN_AI_INTERRUPT, 1);
+    plic_irq_register(IRQN_AI_INTERRUPT, ai_step, ctx);
+
+    const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
+    if (first_layer_header->type != KL_K210_CONV)
+        return -1;
+    const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
+    kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);
+
+    if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
+    {
+        kpu_kmodel_input_with_padding(&layer_arg, src);
+        ai_step_not_isr(ctx);
+    }
+    else
+    {
+        kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
+    }
+
+    return 0;
+}
+
diff --git a/Ubiquitous/XiZi_IIoT/path_kernel.mk b/Ubiquitous/XiZi_IIoT/path_kernel.mk
index 3731ae559..b9544200b 100755
--- a/Ubiquitous/XiZi_IIoT/path_kernel.mk
+++ b/Ubiquitous/XiZi_IIoT/path_kernel.mk
@@ -461,6 +461,9 @@ KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/knowing/tensorflow
 KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/knowing/tensorflow-lite/tensorflow-lite-for-mcu/source/third_party/gemmlowp #
 KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/knowing/tensorflow-lite/tensorflow-lite-for-mcu/source/third_party/flatbuffers/include #
 KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/knowing/tensorflow-lite/tensorflow-lite-for-mcu/source/third_party/ruy #
+KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/knowing/kpu/k210_yolov2_detect_procedure #
+KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/knowing/kpu/yolov2 #
+KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/knowing/kpu/yolov2_json #
 endif
 
 ifeq ($(CONFIG_LIB_LV),y)
@@ -479,6 +482,10 @@ KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/control/plc_protoc
 KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/control/plc_protocol/melsec #
 KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/control/plc_protocol/opcua #
 KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/Framework/control/plc_protocol/s7 #
+endif
+
+
+ifeq ($(CONFIG_LIB_USING_CJSON), y)
 KERNELPATHS += -I$(KERNEL_ROOT)/../../APP_Framework/lib/cJSON
 endif
diff --git a/Ubiquitous/XiZi_IIoT/resources/spi/sd_card_spi/sd_spi.c b/Ubiquitous/XiZi_IIoT/resources/spi/sd_card_spi/sd_spi.c
index 6eba43464..e66e82166 100644
--- a/Ubiquitous/XiZi_IIoT/resources/spi/sd_card_spi/sd_spi.c
+++ b/Ubiquitous/XiZi_IIoT/resources/spi/sd_card_spi/sd_spi.c
@@ -850,7 +850,6 @@ static uint32 SdReadMultiBlock(SpiSdDeviceType spi_sd_dev, uint32 id, const uint
 
     do
     {
-        BusDevWriteData(&spi_sd_dev->spi_dev->haldev, &write_param);
         BusDevReadData(&spi_sd_dev->spi_dev->haldev, &read_param);
 
         SD_TIMEOUT(start_time, 10 * SPI_SD_TIMEOUT_NUM);