diff --git a/APP_Framework/Framework/knowing/nnom/README.md b/APP_Framework/Framework/knowing/nnom/README.md new file mode 100644 index 000000000..34082b201 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/README.md @@ -0,0 +1,14 @@ +# Neural Network on Microcontroller (NNoM) + +NNoM is a high-level neural network inference library designed specifically for microcontrollers, released under the Apache License 2.0. + +The current version is 0.4.3. More information is available at [NNoM](https://github.com/majianjia/nnom). + +## CMSIS-NN Backend + +[CMSIS-NN/DSP](https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN) is an inference acceleration library for Arm Cortex-M CPUs and can be used as the backend of NNoM for higher performance. + +## Notes + +- The CHW format is incompatible with CMSIS-NN, but it is required when using hardware accelerators such as the KPU in the K210 chip. +- A static memory buffer must be set with `nnom_set_static_buf()` before creating a model; see the usage sketch below. \ No newline at end of file diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_activation.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_activation.h new file mode 100644 index 000000000..7cda07ce3 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_activation.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_ACTIVATION_H__ +#define __NNOM_ACTIVATION_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + + +// activation layer +typedef struct _nnom_activation_layer_t +{ + nnom_layer_t super; + nnom_activation_t *act; +} nnom_activation_layer_t; + + +// activation with fixed q format (tanh and sigmoid) +typedef struct _nnom_activation_fixed_q_t +{ + nnom_activation_t super; + uint8_t dec_bit; +} nnom_activation_fixed_q_t; + +// leaky relu +typedef struct _nnom_activation_leaky_relu_t +{ + nnom_activation_t super; + q7_t alpha; // alpha is represented in q0.7 format (-128 = -1) +} nnom_activation_leaky_relu_t; + +// advanced ReLU (full ReLU) +typedef struct _nnom_activation_adv_relu_t +{ + nnom_activation_t super; + q7_t negative_slope; // negative_slope is represented in q0.7 format (-128 = -1) + float max; // cap of the max value + float threshold; // threshold +} nnom_activation_adv_relu_t; + +// method +nnom_status_t activation_run(nnom_layer_t* layer); +nnom_status_t activation_free(nnom_layer_t *layer); + +// activation delete +void act_delete(nnom_activation_t* act); + +// a direct API on a tensor +nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor); + + +// Layer API +nnom_layer_t *Activation(nnom_activation_t *act); +nnom_layer_t *ReLU(void); +nnom_layer_t *LeakyReLU(float alpha); +nnom_layer_t *AdvReLU(float alpha, float max, float threshold); +nnom_layer_t *Sigmoid(int32_t dec_bit); +nnom_layer_t *TanH(int32_t dec_bit); + +// Activation API.
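As a rough illustration of the static-buffer note in the README above: when `NNOM_USING_STATIC_MEMORY` is enabled, the buffer has to be registered before any model object is created. The sketch below only uses `nnom_set_static_buf()` as declared in `nnom.h` in this patch; the arena size and the `nn_init()` wrapper are illustrative assumptions, not values taken from this patch.

```c
#include "nnom.h"

/* application-owned arena; the required size depends on the model and is a placeholder here */
static uint8_t nnom_arena[16 * 1024];

void nn_init(void)
{
    /* must run before new_model()/model_compile() when NNOM_USING_STATIC_MEMORY is enabled */
    nnom_set_static_buf(nnom_arena, sizeof(nnom_arena));
}
```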
+nnom_activation_t* act_relu(void); +nnom_activation_t* act_leaky_relu(float alpha); +nnom_activation_t* act_adv_relu(float negative_slope, float max, float threshold); +nnom_activation_t* act_tanh(int32_t dec_bit); +nnom_activation_t* act_sigmoid(int32_t dec_bit); +nnom_activation_t* act_hard_tanh(int32_t dec_bit); +nnom_activation_t* act_hard_sigmoid(int32_t dec_bit); + +// utils +int32_t act_get_dec_bit(nnom_activation_type_t type, int32_t dec_bit); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_ACTIVATION_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_avgpool.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_avgpool.h new file mode 100644 index 000000000..6f8354630 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_avgpool.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_AVGPOOL_H__ +#define __NNOM_AVGPOOL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_maxpool.h" + +// Avg Pooling +typedef nnom_maxpool_layer_t nnom_avgpool_layer_t; + +// method +nnom_status_t avgpooling_build(nnom_layer_t *layer); +nnom_status_t avgpool_run(nnom_layer_t *layer); + +// API +nnom_layer_t *avgpool_s(const nnom_pool_config_t * config); +nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_AVGPOOL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_baselayer.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_baselayer.h new file mode 100644 index 000000000..940bce578 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_baselayer.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_BASELAYER_H__ +#define __NNOM_BASELAYER_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_input.h" + +// method +nnom_status_t default_build(nnom_layer_t *layer); +nnom_status_t default_run(nnom_layer_t *layer); + +// API +nnom_layer_t *baselayer_s(const nnom_layer_config_t * config); +nnom_layer_t *BaseLayer(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_BASELAYER_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_concat.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_concat.h new file mode 100644 index 000000000..d47b26365 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_concat.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_CONCAT_H__ +#define __NNOM_CONCAT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// concatenate layer +typedef struct _nnom_concat_layer +{ + 
nnom_layer_t super; + int8_t axis; +} nnom_concat_layer_t; + +typedef struct _nnom_concat_config_t +{ + nnom_layer_config_t super; + int8_t axis; +} nnom_concat_config_t; + +// method +nnom_status_t concat_build(nnom_layer_t *layer); +nnom_status_t concat_run(nnom_layer_t *layer); + +// API +nnom_layer_t *concat_s(const nnom_concat_config_t *config); +nnom_layer_t *Concat(int8_t axis); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_CONCAT_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d.h new file mode 100644 index 000000000..2b6efb198 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_CONV2D_H__ +#define __NNOM_CONV2D_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// child layers parameters +typedef struct _nnom_conv2d_layer_t +{ + nnom_layer_t super; + nnom_3d_shape_t kernel; + nnom_3d_shape_t stride; + nnom_3d_shape_t pad; + nnom_3d_shape_t dilation; + nnom_padding_t padding_type; + uint32_t filter_mult; // filter size (for conv) or multilplier (for depthwise) + + nnom_tensor_t *weight; + nnom_tensor_t *bias; + + // test + nnom_qformat_param_t * output_rshift; + nnom_qformat_param_t * bias_lshift; +} nnom_conv2d_layer_t; + +// a machine interface for configuration +typedef struct _nnom_conv2d_config_t +{ + nnom_layer_config_t super; + nnom_qtype_t qtype; //quantisation type(per channel or per layer) + nnom_tensor_t *weight; + nnom_tensor_t *bias; + nnom_qformat_param_t *output_shift; + nnom_qformat_param_t *bias_shift; + uint32_t filter_size; + int8_t kernel_size[2]; + int8_t stride_size[2]; + int8_t padding_size[2]; + int8_t dilation_size[2]; + nnom_padding_t padding_type; +} nnom_conv2d_config_t; + +// method +nnom_status_t conv2d_run(nnom_layer_t *layer); +nnom_status_t conv2d_build(nnom_layer_t *layer); +nnom_status_t conv2d_free(nnom_layer_t *layer); + +// utils +uint32_t conv_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation); + +// API +nnom_layer_t *conv2d_s(const nnom_conv2d_config_t *config); +nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_CONV2D_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d_trans.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d_trans.h new file mode 100644 index 000000000..26249f3d9 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d_trans.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-30 Jianjia Ma The first version + */ + +#ifndef __NNOM_DECONV2D_H__ +#define __NNOM_DECONV2D_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" +#include "layers/nnom_conv2d.h" + +// child layers 
parameters +typedef nnom_conv2d_layer_t nnom_conv2d_trans_layer_t; + +typedef nnom_conv2d_config_t nnom_conv2d_trans_config_t; + +// method +nnom_status_t conv2d_trans_run(nnom_layer_t *layer); +nnom_status_t conv2d_trans_build(nnom_layer_t *layer); + +// utils +uint32_t conv_trans_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation); + +// API +nnom_layer_t *conv2d_trans_s(const nnom_conv2d_config_t *config); +nnom_layer_t *Conv2DTrans(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_DECONV2D_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_cropping.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_cropping.h new file mode 100644 index 000000000..252357481 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_cropping.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_CROPPING_H__ +#define __NNOM_CROPPING_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_zero_padding.h" + +// Cropping, same as zeropadding +typedef nnom_zero_padding_layer_t nnom_cropping_layer_t; + +typedef nnom_zero_padding_config_t nnom_cropping_config_t; + +// method +nnom_status_t cropping_build(nnom_layer_t *layer); +nnom_status_t cropping_run(nnom_layer_t *layer); + +// API +nnom_layer_t * cropping_s(const nnom_cropping_config_t *config); +nnom_layer_t *Cropping(nnom_border_t pad); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_CROPPING_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dense.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dense.h new file mode 100644 index 000000000..a0504a317 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dense.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_DENSE_H__ +#define __NNOM_DENSE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +typedef struct _nnom_dense_layer_t +{ + nnom_layer_t super; + size_t output_unit; + nnom_tensor_t *weight; + nnom_tensor_t *bias; + nnom_qformat_param_t *output_rshift; + nnom_qformat_param_t *bias_lshift; +} nnom_dense_layer_t; + +// a machine interface for configuration +typedef struct _nnom_dense_config_t +{ + nnom_layer_config_t super; + nnom_qtype_t qtype; //quantisation type(per channel or per layer) + nnom_tensor_t *weight; + nnom_tensor_t *bias; + nnom_qformat_param_t *output_shift; + nnom_qformat_param_t *bias_shift; +} nnom_dense_config_t; + +// method +nnom_status_t dense_free(nnom_layer_t *layer); +nnom_status_t dense_build(nnom_layer_t *layer); +nnom_status_t dense_run(nnom_layer_t *layer); + +// API +nnom_layer_t *dense_s(const nnom_dense_config_t *config); +nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b); + 
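For context, a minimal sketch of how `Dense()` and the other layer constructors in this patch are typically combined through the sequential model API declared in `nnom.h`. The weight/bias data, shift values, and buffer sizes below are illustrative placeholders, not values from this patch.

```c
#include "nnom.h"

/* quantised parameters would normally come from a generated weights file; zeroed dummies here */
static const int8_t w1_data[10 * 16], b1_data[16];
static const nnom_weight_t w1 = { .p_value = w1_data, .shift = 3 };
static const nnom_bias_t   b1 = { .p_value = b1_data, .shift = 3 };

static int8_t input_buf[10], output_buf[16];

void build_and_run(void)
{
    nnom_model_t *model = new_model(NULL);                  /* create a model instance */
    model->add(model, Input(shape(1, 1, 10), input_buf));   /* raw q7 input buffer */
    model->add(model, Dense(16, &w1, &b1));                 /* fully connected layer */
    model->add(model, ReLU());
    model->add(model, Output(shape(1, 1, 16), output_buf));
    sequencial_compile(model);                              /* compile as a sequential model */
    model_run(model);                                       /* run one prediction */
    model_delete(model);
}
```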
+#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_DENSE_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dw_conv2d.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dw_conv2d.h new file mode 100644 index 000000000..5a9b58b25 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dw_conv2d.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_DW_CONV2D_H__ +#define __NNOM_DW_CONV2D_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_conv2d.h" + +// method +nnom_status_t dw_conv2d_build(nnom_layer_t *layer); +nnom_status_t dw_conv2d_run(nnom_layer_t *layer); + +//API +nnom_layer_t *dw_conv2d_s(const nnom_conv2d_config_t *config); +nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_DW_CONV2D_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_flatten.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_flatten.h new file mode 100644 index 000000000..c77160fca --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_flatten.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_FLATTEN_H__ +#define __NNOM_FLATTEN_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// no special parameters but we need it. 
+typedef struct _nnom_flatten_config_t{ + nnom_layer_config_t super; +} nnom_flatten_config_t; + +// method +nnom_status_t flatten_build(nnom_layer_t *layer); +nnom_status_t flatten_run(nnom_layer_t *layer); + +// API +nnom_layer_t *flatten_s(const nnom_flatten_config_t *config); +nnom_layer_t *Flatten(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_FLATTEN_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_global_pool.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_global_pool.h new file mode 100644 index 000000000..febccb0e8 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_global_pool.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_GLOBAL_POOL_H__ +#define __NNOM_GLOBAL_POOL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_maxpool.h" + +typedef struct _nnom_global_pool_config_t +{ + nnom_layer_config_t super; + int16_t output_shift; +}nnom_global_pool_config_t; + +// method +nnom_status_t global_pool_build(nnom_layer_t *layer); + +// API +nnom_layer_t * global_maxpool_s(const nnom_global_pool_config_t *config); +nnom_layer_t * global_avgpool_s(const nnom_global_pool_config_t *config); +nnom_layer_t * global_sumpool_s(const nnom_global_pool_config_t *config); + +nnom_layer_t *GlobalMaxPool(void); +nnom_layer_t *GlobalAvgPool(void); +nnom_layer_t *GlobalSumPool(void); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_GLOBAL_POOL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_gru_cell.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_gru_cell.h new file mode 100644 index 000000000..8ba459624 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_gru_cell.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-27 Jianjia Ma The first version + */ + +#ifndef __NNOM_GRU_CELL_H__ +#define __NNOM_GRU_CELL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "nnom_rnn.h" +#include "nnom_activation.h" + +typedef struct _nnom_gru_cell_config_t +{ + nnom_layer_config_t super; + nnom_tensor_t *weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t *bias; + nnom_qformat_param_t q_dec_z, q_dec_h; // z, r, h + uint16_t units; +} nnom_gru_cell_config_t; + + +typedef struct _nnom_gru_cell_t +{ + nnom_rnn_cell_t super; + + nnom_tensor_t* weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t* bias; + + // decide later. 
+ // z, r, h + nnom_qformat_param_t q_dec_z, q_dec_h; + nnom_qformat_param_t oshift_iw, oshift_hw, bias_shift; + +} nnom_gru_cell_t; + +// gru +nnom_rnn_cell_t *gru_cell_s(const nnom_gru_cell_config_t* config); + +nnom_status_t gru_cell_free(nnom_rnn_cell_t* cell); +nnom_status_t gru_cell_build(nnom_rnn_cell_t* cell); +nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_GRU_CELL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_input.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_input.h new file mode 100644 index 000000000..42322a61f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_input.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_INPUT_H__ +#define __NNOM_INPUT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// IO layer +typedef struct _nnom_io_layer +{ + nnom_layer_t super; + nnom_3d_shape_t shape; + nnom_qformat_param_t dec_bit; + void *buf; //input or output +} nnom_io_layer_t; + +typedef struct _nnom_io_config_t +{ + nnom_layer_config_t super; + nnom_tensor_t *tensor; +}nnom_io_config_t; + +// method +nnom_status_t input_build(nnom_layer_t *layer); +nnom_status_t input_run(nnom_layer_t *layer); + +// API +nnom_layer_t *input_s(const nnom_io_config_t* config); +nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_INPUT_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lambda.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lambda.h new file mode 100644 index 000000000..80c5e6915 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lambda.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_LAMBDA_H__ +#define __NNOM_LAMBDA_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_input.h" + +// lambda layer +typedef struct _nnom_lambda_layer_t +{ + nnom_layer_t super; + void *parameters; // parameters for lambda +} nnom_lambda_layer_t; + +// lambda layer +typedef struct _nnom_lambda_config_t +{ + nnom_layer_config_t super; + nnom_status_t (*run_func_name)(nnom_layer_t *layer); // run method. required + nnom_status_t (*build_func_name)(nnom_layer_t *layer);// compute output buffer shape. 
can be left null, will call default_build() + nnom_status_t (*free_func_name)(nnom_layer_t *layer); // a callback to free private resources (comp buf not included) can be left null + void *parameters; // parameters for lambda +} nnom_lambda_config_t; + + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_LAMBDA_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lstm_cell.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lstm_cell.h new file mode 100644 index 000000000..f0563fc91 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lstm_cell.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-24 Jianjia Ma The first version + */ + +#ifndef __NNOM_LSTM_CELL_H__ +#define __NNOM_LSTM_CELL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "nnom_rnn.h" +#include "nnom_activation.h" + +// a machine interface for configuration +typedef struct _nnom_lstm_cell_config_t +{ + nnom_layer_config_t super; + nnom_tensor_t *weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t *bias; + nnom_qformat_param_t q_dec_z, q_dec_h, q_dec_c; // z = iw + hw, c = cell state; h=output and memory + uint16_t units; +} nnom_lstm_cell_config_t; + + +typedef struct _nnom_lstm_cell_t +{ + nnom_rnn_cell_t super; + + nnom_tensor_t* weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t* bias; + + // experimental, + // iw: input x weight + // hw: hidden state x recurrent weight + // h: hidden state (memor) + // c: cell state + nnom_qformat_param_t q_dec_z, q_dec_h, q_dec_c; + nnom_qformat_param_t oshift_iw, oshift_hw, oshift_zc, bias_shift; + +} nnom_lstm_cell_t; + +// LSTM +nnom_rnn_cell_t *lstm_cell_s(const nnom_lstm_cell_config_t* config); + +nnom_status_t lstm_cell_free(nnom_rnn_cell_t* cell); +nnom_status_t lstm_cell_q7_q15_build(nnom_rnn_cell_t* cell); +nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_LSTM_CELL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_matrix.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_matrix.h new file mode 100644 index 000000000..11b775bbe --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_matrix.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_MATRIX_H__ +#define __NNOM_MATRIX_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// the maximum input layer hooked to this layer +#define MAX_INPUT_LAYER 8 + +// matrix layer +typedef struct _nnom_matrix_layer_t +{ + nnom_layer_t super; + int16_t oshift; // output right shift +} nnom_matrix_layer_t; + +typedef struct _nnom_matrix_config_t +{ + nnom_layer_config_t super; + int16_t output_shift; // output right shift +} nnom_matrix_config_t; + +// methods +nnom_layer_t* _same_shape_matrix_layer(void); +nnom_status_t add_run(nnom_layer_t *layer); +nnom_status_t sub_run(nnom_layer_t *layer); +nnom_status_t mult_run(nnom_layer_t *layer); + +// API +nnom_layer_t *add_s(const nnom_matrix_config_t * config); +nnom_layer_t *sub_s(const nnom_matrix_config_t * config); +nnom_layer_t *mult_s(const nnom_matrix_config_t * 
config); +nnom_layer_t *Add(int16_t oshift); +nnom_layer_t *Sub(int16_t oshift); +nnom_layer_t *Mult(int16_t oshift); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_MATRIX_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_maxpool.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_maxpool.h new file mode 100644 index 000000000..690a02d2f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_maxpool.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_MAXPOOL_H__ +#define __NNOM_MAXPOOL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// Max Pooling +typedef struct _nnom_maxpool_layer_t +{ + nnom_layer_t super; + nnom_3d_shape_t kernel; + nnom_3d_shape_t stride; + nnom_3d_shape_t pad; + nnom_padding_t padding_type; + int16_t output_shift; // reserve +} nnom_maxpool_layer_t; + +// a machine interface for configuration +typedef struct _nnom_pool_config_t +{ + nnom_layer_config_t super; + nnom_padding_t padding_type; + int16_t output_shift; + int8_t kernel_size[2]; + int8_t stride_size[2]; + int8_t num_dim; +} nnom_pool_config_t; + +// method +nnom_status_t maxpool_build(nnom_layer_t *layer); +nnom_status_t maxpool_run(nnom_layer_t *layer); + +// API +nnom_layer_t *maxpool_s(const nnom_pool_config_t * config); +nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_MATRIX_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_output.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_output.h new file mode 100644 index 000000000..8e62e22f2 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_output.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_OUTPUT_H__ +#define __NNOM_OUTPUT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_input.h" + +// method +nnom_status_t output_build(nnom_layer_t *layer); +nnom_status_t output_run(nnom_layer_t *layer); + +// API +nnom_layer_t *output_s(const nnom_io_config_t* config); +nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_OUTPUT_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_reshape.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_reshape.h new file mode 100644 index 000000000..fc68c45d1 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_reshape.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-12-07 Jianjia Ma The first version + */ + +#ifndef __NNOM_RESHAPE_H__ +#define __NNOM_RESHAPE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + 
+typedef struct _nnom_reshape_layer_t +{ + nnom_layer_t super; + nnom_shape_data_t* dim; + uint8_t num_dim; + +} nnom_reshape_layer_t; + +typedef struct nnom_reshape_config_t +{ + nnom_layer_config_t super; + nnom_shape_data_t* dim; + uint8_t num_dim; +} nnom_reshape_config_t; + +// method +nnom_status_t reshape_run(nnom_layer_t *layer); +nnom_status_t reshape_build(nnom_layer_t *layer); +nnom_status_t reshape_free(nnom_layer_t *layer); + +// API +nnom_layer_t *reshape_s(const nnom_reshape_config_t *config); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_CONV2D_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_rnn.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_rnn.h new file mode 100644 index 000000000..6a9d6efb6 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_rnn.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_RNN_H__ +#define __NNOM_RNN_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// a machine interface for configuration +typedef struct _nnom_rnn_config_t +{ + nnom_layer_config_t super; + bool return_sequence; + bool stateful; + bool go_backwards; +} nnom_rnn_config_t; + +// RNN cell base type +typedef struct _nnom_rnn_cell_t +{ + nnom_status_t (*run)(struct _nnom_rnn_cell_t* cell); // cell runner + nnom_status_t (*build)(struct _nnom_rnn_cell_t* cell); // cell builder, calculate buffer size, output data size + nnom_status_t (*free)(struct _nnom_rnn_cell_t* cell); // + nnom_layer_t *layer; // pointer to its layer holder + nnom_layer_config_t *config; // config for the cell event it is a layer type + nnom_rnn_cell_type_t type; + + void *in_data; // input data + void *out_data; // output data + void *in_state; // input state data (or hidden state) + void *out_state; // output state data + + size_t comp_buf_size; // the size of temporary buffer. + size_t state_size; // the size of hidden state + uint16_t units; // the output units + uint16_t feature_size; // the input feature size (vector size) + + size_t macc; // stat of MAC count. +} nnom_rnn_cell_t; + +typedef struct _nnom_rnn_layer_t +{ + nnom_layer_t super; + nnom_rnn_cell_t *cell; + void *state_buf; // memory allocated to store state, size = 2 x size of state required by cell. 
+ + uint16_t timestamp_size;// size of timestamp + bool return_sequence; // whether to return the output for each unit (sequence) + bool stateful; // whether the states are kept after one inteference + bool go_backwards; // whether go backwards timestamping +} nnom_rnn_layer_t; + + +// rnn layer +nnom_layer_t *rnn_s(nnom_rnn_cell_t *cell, const nnom_rnn_config_t* config); + +nnom_status_t rnn_run(nnom_layer_t* layer); +nnom_status_t rnn_build(nnom_layer_t* layer); +nnom_status_t rnn_free(nnom_layer_t* layer); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_RNN_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_simple_cell.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_simple_cell.h new file mode 100644 index 000000000..87977ed8f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_simple_cell.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-20 Jianjia Ma The first version + */ + +#ifndef __NNOM_SIMPLE_CELL_H__ +#define __NNOM_SIMPLE_CELL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "nnom_rnn.h" +#include "nnom_activation.h" + + +// This Simple Cell replicate the Keras's SimpleCell as blow +/* + def call(self, inputs, states, training=None): + prev_output = states[0] if nest.is_sequence(states) else states + + h = K.dot(inputs, self.kernel) + h = K.bias_add(h, self.bias) + + output = h + K.dot(prev_output, self.recurrent_kernel) + output = self.activation(output) + + new_state = [output] if nest.is_sequence(states) else output + return output, new_state +*/ + +// a machine interface for configuration +typedef struct _nnom_simple_cell_config_t +{ + nnom_layer_config_t super; + nnom_tensor_t *weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t *bias; + nnom_qformat_param_t q_dec_iw, q_dec_hw, q_dec_h; + nnom_activation_type_t act_type; // type of the activation + uint16_t units; +} nnom_simple_cell_config_t; + + +typedef struct _nnom_simple_cell_t +{ + nnom_rnn_cell_t super; + nnom_activation_type_t act_type; + + nnom_tensor_t* weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t* bias; + + // experimental, + // iw: input x weight + // hw: hidden state x recurrent weight + // h: hidden state + nnom_qformat_param_t q_dec_iw, q_dec_hw, q_dec_h; + nnom_qformat_param_t oshift_iw, oshift_hw, bias_shift; + +} nnom_simple_cell_t; + + +// RNN cells +// The shape for RNN input is (batch, timestamp, feature), where batch is always 1. 
+// +// SimpleCell +nnom_rnn_cell_t *simple_cell_s(const nnom_simple_cell_config_t* config); + +nnom_status_t simple_cell_free(nnom_rnn_cell_t* cell); +nnom_status_t simple_cell_build(nnom_rnn_cell_t* cell); +nnom_status_t simple_cell_run(nnom_rnn_cell_t* cell); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_SIMPLE_CELL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_softmax.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_softmax.h new file mode 100644 index 000000000..230be3277 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_softmax.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_SOFTMAX_H__ +#define __NNOM_SOFTMAX_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +typedef struct _nnom_softmax_config_t +{ + nnom_layer_config_t super; +} nnom_softmax_config_t; + + +// method +nnom_status_t softmax_run(nnom_layer_t *layer); +nnom_status_t softmax_build(nnom_layer_t *layer); + +// API +nnom_layer_t *softmax_s(const nnom_softmax_config_t * config); +nnom_layer_t *Softmax(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_SOFTMAX_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_sumpool.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_sumpool.h new file mode 100644 index 000000000..927615e82 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_sumpool.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_SUMPOOL_H__ +#define __NNOM_SUMPOOL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_maxpool.h" + +// Sum Pooling +typedef nnom_maxpool_layer_t nnom_sumpool_layer_t; + +// method +nnom_status_t sumpool_build(nnom_layer_t *layer); +nnom_status_t sumpool_run(nnom_layer_t *layer); + +// API +nnom_layer_t *sumpool_s(const nnom_pool_config_t * config); +nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_SUMPOOL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_upsample.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_upsample.h new file mode 100644 index 000000000..5db7c9708 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_upsample.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_UPSAMPLE_H__ +#define __NNOM_UPSAMPLE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// Up Sampling layer (UnPooling) +typedef struct _nnom_upsample_layer_t +{ + nnom_layer_t super; + nnom_3d_shape_t kernel; +} nnom_upsample_layer_t; + +typedef struct _nnom_upsample_config_t +{ + 
nnom_layer_config_t super; + nnom_shape_data_t kernel[2]; +} nnom_upsample_config_t; + +// API +nnom_layer_t *upsample_s(const nnom_upsample_config_t *config); +nnom_layer_t *UpSample(nnom_3d_shape_t kernel); + +// Methods +nnom_status_t upsample_build(nnom_layer_t *layer); +nnom_status_t upsample_run(nnom_layer_t *layer); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_UPSAMPLE_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_zero_padding.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_zero_padding.h new file mode 100644 index 000000000..9aefd6d03 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_zero_padding.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_ZERO_PADDING_H__ +#define __NNOM_ZERO_PADDING_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +typedef struct _nnom_zero_padding_config_t +{ + nnom_layer_config_t super; + nnom_border_t pad; +} nnom_zero_padding_config_t; + +// zero padding +typedef struct _nnom_zero_padding_layer_t +{ + nnom_layer_t super; + nnom_border_t pad; +} nnom_zero_padding_layer_t; + +// API +nnom_layer_t *zeropadding_s(const nnom_zero_padding_config_t* config); +nnom_layer_t *ZeroPadding(nnom_border_t pad); + +// method +nnom_status_t zero_padding_build(nnom_layer_t *layer); +nnom_status_t zero_padding_run(nnom_layer_t *layer); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_ZERO_PADDING_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom.h b/APP_Framework/Framework/knowing/nnom/inc/nnom.h new file mode 100644 index 000000000..ba802f0e5 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/nnom.h @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + * 2019-02-10 Jianjia Ma Compiler supports dense net connection + */ + +#ifndef __NNOM_H__ +#define __NNOM_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include +#include +#include + +#include "nnom_port.h" + +#define NNOM_ALIGN (sizeof(char*)) // alignment when doing memory ops. Equal to size of pointer in byte. +#define q7_t int8_t +#define q15_t int16_t +#define q31_t int32_t +#define q63_t int64_t + +/* version */ +#define NNOM_MAJORVERSION 0 /**< major version number */ +#define NNOM_SUBVERSION 4 /**< minor version number */ +#define NNOM_REVISION 3 /**< revise version number */ +#define NNOM_VERSION ((NNOM_MAJORVERSION * 10000) + (NNOM_SUBVERSION * 100) + NNOM_REVISION) + +#ifdef ARM_NN_TRUNCATE +#define NNOM_TRUNCATE +#endif + +#ifndef NNOM_TRUNCATE + #define NNOM_ROUND(out_shift) ((0x1 << out_shift) >> 1 ) +#else + #define NNOM_ROUND(out_shift) 0 +#endif + +typedef enum +{ + NN_SUCCESS = 0, /**< No error */ + NN_ARGUMENT_ERROR = -1, /**< One or more arguments are incorrect */ + NN_LENGTH_ERROR = -2, /**< Length of data buffer is incorrect */ + NN_SIZE_MISMATCH = -3, /**< Size of matrices is not compatible with the operation. */ + NN_NANINF = -4, /**< Not-a-number (NaN) or infinity is generated */ + NN_SINGULAR = -5, /**< Generated by matrix inversion if the input matrix is singular and cannot be inverted. 
*/ + NN_TEST_FAILURE = -6, /**< Test Failed */ + NN_NO_MEMORY = -7, + NN_MORE_TODO = -8 +} nnom_status_t; + +typedef enum +{ + NNOM_INVALID = 0, + NNOM_BASE, + NNOM_INPUT, + NNOM_OUTPUT, + NNOM_CONV_2D, + NNOM_DW_CONV_2D, + NNOM_CONV2D_TRANS, + NNOM_BATCHNORM, + NNOM_DENSE, + NNOM_ZERO_PADDING, + NNOM_CROPPING, + NNOM_RNN, + NNOM_ACTIVATION, + NNOM_RELU, + NNOM_LEAKY_RELU, + NNOM_ADV_RELU, + NNOM_SIGMOID, + NNOM_TANH, + NNOM_SOFTMAX, + NNOM_MAXPOOL, + NNOM_GLOBAL_MAXPOOL, + NNOM_AVGPOOL, + NNOM_GLOBAL_AVGPOOL, + NNOM_SUMPOOL, + NNOM_GLOBAL_SUMPOOL, + NNOM_UPSAMPLE, + NNOM_FLATTEN, + NNOM_RESHAPE, + NNOM_LAMBDA, + NNOM_CONCAT, + NNOM_ADD, + NNOM_SUB, + NNOM_MULT, + NNOM_TYPE_MAX + +} nnom_layer_type_t; + +#define DEFUALT_LAYER_NAMES \ + { \ + "Unknown", \ + "Base", \ + "Input", \ + "Output", \ + "Conv2D", \ + "DW_Conv2D", \ + "Conv2DTrsp", \ + "BatchNorm", \ + "Dense", \ + "ZeroPad", \ + "Cropping", \ + "RNN", \ + "Activation", \ + "ReLU", \ + "Leaky_ReLU", \ + "Adv_ReLU", \ + "Sigmoid", \ + "Tanh", \ + "Softmax", \ + "MaxPool", \ + "GL_MaxPool", \ + "AvgPool", \ + "GL_AvgPool", \ + "SumPool", \ + "GL_SumPool", \ + "UpSample", \ + "Flatten", \ + "Reshape", \ + "Lambda", \ + "Concat", \ + "Add", \ + "Sub", \ + "Mult", \ + } +extern const char default_layer_names[][12]; + +// We dont count softmax an activation here, softmax is instanced as a layer +typedef enum +{ + ACT_UNKNOWN = 0, + ACT_RELU, + ACT_LEAKY_RELU, + ACT_ADV_RELU, + ACT_TANH, + ACT_SIGMOID, + ACT_HARD_TANH, + ACT_HARD_SIGMOID +} nnom_activation_type_t; + +#define ACTIVATION_NAMES \ + { \ + "Unknown", \ + "ReLU", \ + "LkyReLU", \ + "AdvReLU", \ + "TanH", \ + "Sigmoid", \ + "HrdTanH", \ + "HrdSigd", \ + } +extern const char default_activation_names[][8]; + +// RNN cell type +typedef enum +{ + NNOM_UNKOWN_CELL = 0, + NNOM_SIMPLE_CELL, + NNOM_GRU_CELL, + NNOM_LSTM_CELL, + NNOM_CELL_TYPE_MAX +} nnom_rnn_cell_type_t; + +#define DEFUALT_CELL_NAMES \ + { \ + "Unknown", \ + "Simple", \ + "GRU", \ + "LSTM", \ + } +extern const char default_cell_names[][8]; + + +// parameters +typedef enum +{ + PADDING_VALID = 0, + PADDING_SAME +} nnom_padding_t; + +#define NNOM_TENSOR_BUF_NULL (0) // This buffer is not in used +#define NNOM_TENSOR_BUF_TEMP (1) // The memory in IO is temporary occupided, can be reused by other layer once the computation is done. +#define NNOM_TENSOR_BUF_RESERVED (2) // the mem is reserve for this layer only (not to be reused by other layer. + +// currently used in compiling. +#define NNOM_BUF_EMPTY (0) +#define NNOM_BUF_FILLED (1) + +// basic types +#define nnom_qformat_param_t int32_t // this should match the backend, need a better way to do it. 
+#define nnom_shape_data_t uint16_t + +typedef struct _nnom_3d_shape_t +{ + nnom_shape_data_t h, w, c; +} nnom_3d_shape_t; + +typedef struct _nnom_border_t +{ + nnom_shape_data_t top, bottom, left, right; +} nnom_border_t; + +// nnom_3d_shape_axis_t type provide the axis[] format access to nnom_3d_shape_t +typedef union { + nnom_3d_shape_t s; + nnom_shape_data_t axis[sizeof(nnom_3d_shape_t) / sizeof(nnom_shape_data_t)]; +} nnom_3d_shape_axis_t; + +// tensor quantisation types +typedef enum +{ + NNOM_QTYPE_PER_TENSOR = 0, + NNOM_QTYPE_PER_AXIS = 1 +} nnom_qtype_t; + +typedef struct _nnom_weights +{ + const void *p_value; + nnom_qformat_param_t shift; +} nnom_weight_t; + +typedef struct _nnom_bias +{ + const void *p_value; + nnom_qformat_param_t shift; +} nnom_bias_t; + +// experimental +typedef struct _nnom_tensor_t +{ + void* p_data; // value + nnom_shape_data_t *dim; // dimension of this tensor + nnom_qformat_param_t *q_dec; // number of decimal bit for Q format (scale) + nnom_qformat_param_t *q_offset; // offset for each channel + nnom_qtype_t qtype; // the quantisation type + uint8_t num_dim; // the number of dimension + uint8_t bitwidth; // the data bit width, only support 8bit now +} nnom_tensor_t; + +// nn wrappers +typedef struct _nnom_layer_t nnom_layer_t; +typedef struct _nnom_layer_io_t nnom_layer_io_t; +typedef struct _nnom_layer_hook_t nnom_layer_hook_t; +typedef struct _nnom_mem_block_t nnom_mem_block_t; + +// activation wrapper +typedef struct _nnom_activation_t nnom_activation_t; + +typedef struct _nnom_buf +{ + nnom_mem_block_t *mem; + size_t size; + uint8_t type; +} nnom_buf_t; + +// a memory block to store pre-assign memories during compiling. then assigned to each tensor after. +struct _nnom_mem_block_t +{ + void *blk; // data block location + size_t size; // the maximum size for this block + uint8_t owners; // how many layers own this block + uint8_t state; // empty? filled? for static nn, currently only used in compiling +}; + +typedef struct _nnom_stat_t +{ + size_t macc; //num. of mac operation + uint32_t time; +} nnom_layer_stat_t; + +struct _nnom_layer_hook_t +{ + nnom_layer_io_t *io; // hooked io + nnom_layer_hook_t *next; // next hook include secondary hooked layer +}; + +struct _nnom_layer_io_t +{ + nnom_layer_hook_t hook; // for example: (layer->out)--hook--(layer->in) + nnom_layer_io_t *aux; // point to auxilary I/O (multiple I/O layer) + nnom_tensor_t *tensor; // experimental + nnom_mem_block_t *mem; // memory blocks handles for compiling only. The memory are now pass by tensor. trying to remove it. + nnom_layer_t *owner; // which layer owns this io. + uint8_t type; +}; + +// structured configuration base type +typedef struct _nnom_layer_config_t +{ + char* name; // the name of the layer prequantiesd model (the model trained by user before converted to nnom) +} nnom_layer_config_t; + +// layers base +struct _nnom_layer_t +{ + nnom_layer_t *shortcut; // shortcut points to the next layer, applied on compiling + + nnom_status_t (*run)(nnom_layer_t *layer); // run method. required + nnom_status_t (*build)(nnom_layer_t *layer); // compute output buffer shape. can be left null, will call default_build() + nnom_status_t (*free)(nnom_layer_t *layer); // a callback to free private resources (comp buf not included) can be left null + nnom_buf_t *comp; // computational buf + nnom_activation_t *actail; // I have an activation, I have a tail, wooo haaaa, act-tail!!! + + nnom_layer_config_t *config; // point to the configuration of the layers. for machine api only. 
+ nnom_layer_type_t type; // layer types + nnom_layer_io_t *in; // IO buff, last*layer, states + nnom_layer_io_t *out; // IO buff, next*layer, states + nnom_layer_stat_t stat; // stats, timing, ops +}; + +// activation base +struct _nnom_activation_t +{ + nnom_status_t (*run)(struct _nnom_activation_t *act); + nnom_tensor_t *tensor; + nnom_activation_type_t type; +}; + +// local static functions when libc is not available +#ifdef NNOM_USING_STATIC_MEMORY + void nnom_set_static_buf(void* buf, size_t size); + void *nnom_malloc(size_t size); + void nnom_free(void* p); +#endif //NNOM_USING_STATIC_BUF + +typedef struct _nnom_model nnom_model_t; + +#include "nnom_tensor.h" +#include "nnom_layers.h" +#include "nnom_utils.h" + +// models, I dont want to make model class as a child of layer class yet +struct _nnom_model +{ + nnom_layer_t *head; + nnom_layer_t *tail; + + // model constructor + nnom_status_t (*add)(struct _nnom_model *m, nnom_layer_t *layer); // has too pass a raw value + nnom_layer_t *(*hook)(nnom_layer_t *curr, nnom_layer_t *last); // create hook between 2 layers' primary IO. + nnom_layer_t *(*merge)(nnom_layer_t *method, nnom_layer_t *in1, nnom_layer_t *in2); // an older interface of merge 2 inputs. + nnom_layer_t *(*mergex)(nnom_layer_t *method, int num, ...); // merge a few layers using mutiple input method (concate, add, ...) + nnom_layer_t *(*active)(nnom_activation_t *act, nnom_layer_t *target_layer); // add the activation to the existing layer's tail + + // callback + nnom_status_t (*layer_callback)(nnom_model_t *m, nnom_layer_t *layer); // layer callback will be called after each layer(after actail). + + // block memory for layers + nnom_mem_block_t blocks[NNOM_BLOCK_NUM]; + + size_t total_ops; + + bool is_inited; // is this structure initialized + bool is_allocated; // is this structure allocated by nnom (not by user) +}; + +#define NNOM_NULL_CHECK(p) \ + if ((p) == NULL) \ + { \ + NNOM_LOG("Error: NULL object.\n"); \ + return NN_ARGUMENT_ERROR; \ + } + + +// utils +size_t nnom_alignto(size_t value, uint32_t alignment); +size_t nnom_io_length(nnom_layer_io_t *io); +size_t nnom_hook_length(nnom_layer_hook_t *hook); + +// memory (malloc + memeset 0) +void *nnom_mem(size_t size); + +// get how much memory has been taken +size_t nnom_mem_stat(void); + +// Model APIs +// create or init a model +nnom_model_t *new_model(nnom_model_t *m); +// compile as sequencial model +nnom_status_t sequencial_compile(nnom_model_t *m); +// compile as functional model +nnom_status_t model_compile(nnom_model_t *m, nnom_layer_t *input, nnom_layer_t *output); +// run a prediction +nnom_status_t model_run(nnom_model_t *m); +// delete model. +void model_delete(nnom_model_t *m); +// check version +nnom_status_t check_model_version(unsigned long model_version); + +// callback, called after each layer has finished the calculation. +// this callback must return NN_SUCCESS for continually run the model. otherwise, model will be returned with the ERROR code. +// this function return NN_LENGTH_ERROR if the callback is already set to other. +nnom_status_t model_set_callback(nnom_model_t *m, nnom_status_t (*layer_callback)(nnom_model_t *m, nnom_layer_t *layer)); +// delete callback. 
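A small hedged example of the per-layer callback described above. The callback name and log text are illustrative; `NNOM_LOG` and `default_layer_names` are the symbols already referenced elsewhere in this patch, and `NNOM_LOG` is assumed here to be printf-compatible (it is provided by `nnom_port.h`, which is not part of this hunk).

```c
/* report each layer after it has run; returning anything other than
 * NN_SUCCESS makes model_run() stop and propagate the error code */
static nnom_status_t layer_trace_cb(nnom_model_t *m, nnom_layer_t *layer)
{
    NNOM_LOG("layer %s done\n", default_layer_names[layer->type]);
    return NN_SUCCESS;
}

/* registered once, e.g. after the model has been compiled:
 *   model_set_callback(model, layer_trace_cb);
 */
```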
+void model_delete_callback(nnom_model_t *m); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom_layers.h b/APP_Framework/Framework/knowing/nnom/inc/nnom_layers.h new file mode 100644 index 000000000..cba44874f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/nnom_layers.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + */ + +#ifndef __NNOM_LAYERS_H__ +#define __NNOM_LAYERS_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include + +#include "nnom.h" + +// properties +nnom_3d_shape_t shape(size_t h, size_t w, size_t c); +nnom_3d_shape_t kernel(size_t h, size_t w); +nnom_3d_shape_t stride(size_t h, size_t w); +nnom_3d_shape_t dilation(size_t h, size_t w); +nnom_border_t border(size_t top, size_t bottom, size_t left, size_t right); +//nnom_qformat_t qformat(int8_t m, int8_t n); +size_t shape_size(nnom_3d_shape_t* s); + +// this function is to add a new IO to current inited IO +// input, the targeted IO that the new IO will be added to +// output , the new IO +nnom_layer_io_t* io_add_aux(nnom_layer_io_t* targeted_io); +nnom_layer_io_t *io_init(void *owner_layer, nnom_layer_io_t *io); + +#define NN_CEILIF(x,y) ((x+y-1)/y) + +#include "layers/nnom_activation.h" +#include "layers/nnom_concat.h" +#include "layers/nnom_conv2d.h" +#include "layers/nnom_cropping.h" +#include "layers/nnom_conv2d_trans.h" +#include "layers/nnom_dense.h" +#include "layers/nnom_dw_conv2d.h" +#include "layers/nnom_flatten.h" +#include "layers/nnom_reshape.h" +#include "layers/nnom_global_pool.h" +#include "layers/nnom_input.h" +#include "layers/nnom_lambda.h" +#include "layers/nnom_matrix.h" +#include "layers/nnom_maxpool.h" +#include "layers/nnom_avgpool.h" +#include "layers/nnom_output.h" +#include "layers/nnom_rnn.h" +#include "layers/nnom_softmax.h" +#include "layers/nnom_sumpool.h" +#include "layers/nnom_upsample.h" +#include "layers/nnom_zero_padding.h" +#include "layers/nnom_rnn.h" +#include "layers/nnom_simple_cell.h" +#include "layers/nnom_lstm_cell.h" +#include "layers/nnom_gru_cell.h" + +// Layer APIs ****** +// (a summary for each individual layer's files) + +// input/output +nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf); +nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf); + +// Pooling +nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad); +nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad); +nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad); +nnom_layer_t *GlobalMaxPool(void); +nnom_layer_t *GlobalAvgPool(void); +nnom_layer_t *GlobalSumPool(void); + +// padding, cropping, upsample +nnom_layer_t *UpSample(nnom_3d_shape_t kernel); +nnom_layer_t *ZeroPadding(nnom_border_t pad); +nnom_layer_t *Cropping(nnom_border_t pad); + +// Activation +nnom_layer_t *Activation(nnom_activation_t *act); +nnom_layer_t *ReLU(void); +nnom_layer_t *LeakyReLU(float alpha); +nnom_layer_t *Softmax(void); +nnom_layer_t *Sigmoid(int32_t dec_bit); // input dec bit +nnom_layer_t *TanH(int32_t dec_bit); // input dec bit + +// Matrix +nnom_layer_t *Add(int16_t oshift); // output shift +nnom_layer_t *Sub(int16_t oshift); // output shift +nnom_layer_t *Mult(int16_t oshift); // output shift + +nnom_layer_t *Flatten(void); +nnom_layer_t *Concat(int8_t 
axis); +// -- NN Constructers -- +// conv2d +nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad, + const nnom_weight_t *w, const nnom_bias_t *b); + +// deconv2d +nnom_layer_t *Conv2DTrans(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad, + const nnom_weight_t *w, const nnom_bias_t *b); + +// depthwise_convolution +nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad, + const nnom_weight_t *w, const nnom_bias_t *b); + +// fully connected, dense +nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b); + + +// Lambda Layers +nnom_layer_t *Lambda(nnom_status_t (*run)(nnom_layer_t *), // run method, required + nnom_status_t (*build)(nnom_layer_t *), // optional, call default_build() if left null + nnom_status_t (*free)(nnom_layer_t *), // not required if no resources needs to be deleted, can be left null. + void *parameters); // user private parameters for run method, left null if not needed. + +// building methods +nnom_status_t default_build(nnom_layer_t* layer); +nnom_status_t input_build(nnom_layer_t* layer); + +nnom_status_t conv2d_build(nnom_layer_t* layer); +nnom_status_t dw_conv2d_build(nnom_layer_t* layer); +nnom_status_t conv2d_trans_build(nnom_layer_t* layer); +nnom_status_t dense_build(nnom_layer_t* layer); +nnom_status_t rnn_build(nnom_layer_t* layer); + +nnom_status_t upsample_build(nnom_layer_t* layer); +nnom_status_t zero_padding_build(nnom_layer_t* layer); +nnom_status_t cropping_build(nnom_layer_t* layer); + +nnom_status_t maxpool_build(nnom_layer_t* layer); +nnom_status_t avgpool_build(nnom_layer_t* layer); +nnom_status_t sumpool_build(nnom_layer_t* layer); +nnom_status_t global_pool_build(nnom_layer_t* layer); + +nnom_status_t flatten_build(nnom_layer_t* layer); +nnom_status_t reshape_build(nnom_layer_t* layer); +nnom_status_t concat_build(nnom_layer_t* layer); + +// run +nnom_status_t input_run(nnom_layer_t* layer); +nnom_status_t output_run(nnom_layer_t* layer); +nnom_status_t flatten_run(nnom_layer_t* layer); +nnom_status_t reshape_run(nnom_layer_t* layer); +nnom_status_t default_run(nnom_layer_t* layer); // simply copy data from input to output + +nnom_status_t dw_conv2d_run(nnom_layer_t* layer); +nnom_status_t conv2d_run(nnom_layer_t* layer); +nnom_status_t conv2d_trans_run(nnom_layer_t* layer); +nnom_status_t dense_run(nnom_layer_t* layer); +nnom_status_t rnn_run(nnom_layer_t* layer); + +nnom_status_t upsample_run(nnom_layer_t* layer); +nnom_status_t zero_padding_run(nnom_layer_t* layer); +nnom_status_t cropping_run(nnom_layer_t* layer); + +nnom_status_t activation_run(nnom_layer_t* layer); +nnom_status_t softmax_run(nnom_layer_t* layer); + +nnom_status_t maxpool_run(nnom_layer_t* layer); +nnom_status_t avgpool_run(nnom_layer_t* layer); +nnom_status_t sumpool_run(nnom_layer_t* layer); + +nnom_status_t concat_run(nnom_layer_t* layer); +nnom_status_t add_run(nnom_layer_t* layer); +nnom_status_t sub_run(nnom_layer_t* layer); +nnom_status_t mult_run(nnom_layer_t* layer); + +// Activation APIs +// Softmax is not considered as activation in NNoM, Softmax is in layer API. 
+nnom_activation_t* act_relu(void); +nnom_activation_t* act_leaky_relu(float alpha); +nnom_activation_t* act_sigmoid(int32_t dec_bit); +nnom_activation_t* act_tanh(int32_t dec_bit); + +// direct API +nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_LAYERS_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom_local.h b/APP_Framework/Framework/knowing/nnom/inc/nnom_local.h new file mode 100644 index 000000000..35845a564 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/nnom_local.h @@ -0,0 +1,974 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Notice: + * Code in this file inlcudes derivative works from CMSIS, which is released under alternative license. + * Please check the LICENSE file for detial. + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + * 2019-03-19 Jianjia Ma Local C implementation partly from CMSIS-NN + */ + +#ifndef __NNOM_LOCAL_H__ +#define __NNOM_LOCAL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include "stdint.h" +#include "nnom_port.h" + +#ifdef ARM_NN_TRUNCATE +#define NNOM_TRUNCATE +#endif + +// SSAT implementation with C code +#ifndef __NNOM_SSAT +static inline int __NNOM_SSAT(int32_t value, int32_t bit) { + int32_t min = -(1<<(bit-1)); + int32_t max = (1<<(bit-1)) - 1; + if (value < min) + return min; + else if (value > max) + return max; + else + return value; +} +#endif + +// USAT implementation with C code +#ifndef __NNOM_USAT +static inline int __NNOM_USAT(int32_t value, int32_t bit) { + int32_t max = (1<<(bit-1)) - 1; + if (value < 0) + return 0; + else if (value > max) + return max; + else + return value; +} +#endif + +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MIN(A, B) ((A) < (B) ? 
(A) : (B)) + + +// Those functions/tables below are partially modifed from CMSIS-NN lib +// https://github.com/ARM-software/CMSIS_5 +// +void local_avepool_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out); + +void local_avepool_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out); + +// modified from CMSIS-NN test_ref +void local_maxpool_q7_HWC(const q7_t * Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t * bufferA, // a buffer for local storage, NULL by now + q7_t * Im_out); + +void local_maxpool_q7_CHW(const q7_t * Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t * bufferA, // a buffer for local storage, NULL by now + q7_t * Im_out); + +void local_sumpool_q7_HWC(const q7_t * Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t 
padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t * bufferA, // a buffer for local storage, size = 4*output_size + q7_t * Im_out); + +void local_sumpool_q7_CHW(const q7_t * Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t * bufferA, // a buffer for local storage, size = 4*output_size + q7_t * Im_out); + +// customised up sample pooling +void local_up_sampling_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // NULL + q7_t *Im_out); + +void local_up_sampling_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // NULL + q7_t *Im_out); + +void local_convolve_HWC_q7_nonsquare(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_convolve_CHW_q7_nonsquare(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const 
uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_conv_trans_HWC_q7_nonsquare(const int8_t * Im_in, + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_depthwise_separable_conv_CHW_q7_nonsquare(const q7_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, 
i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_zero_padding_HWC_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_zero_padding_CHW_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_cropping_HWC_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_cropping_CHW_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_fully_connected_q7_opt(const q7_t * pV, // pointer to vector + const q7_t * pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const 
q7_t * bias, q7_t * pOut, // output operand + q15_t * vec_buffer); + + +void local_fully_connected_q7(const q7_t * pV, // pointer to vector + const q7_t * pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const q7_t * bias, q7_t * pOut, // output operand + q15_t * vec_buffer); + +// matrix dot, +// it takes reorderd weight as input, (see dense layer for detail. this is basiclly a dense opt without bias) +void local_dot_q7_opt(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t out_shift, // amount of right-shift for output + q7_t *pOut); // result buffer + +void local_dot_q7(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t out_shift, // amount of right-shift for output + q7_t *pOut); // output operand) + + + +// softmax +void local_softmax_q7(const q7_t * vec_in, const uint32_t dim_vec, q7_t * p_out); + +// sigmoid +void local_sigmoid_q7(q7_t * data, uint32_t size, int16_t int_width); + +// tanh +void local_tanh_q7(q7_t * data, uint32_t size, int16_t int_width); + +// relu +void local_relu_q7(q7_t * data, uint32_t size); + +// leaky relu +void local_leaky_relu_q7(q7_t *data, q7_t alpha, uint32_t size); + +// alpha in q7 format with dec_bit=7 +// max and threshold has the same Q format with the activation +void local_adv_relu_q7(q7_t *data, q7_t alpha, q7_t max, q7_t threshold, uint32_t size); + +// hard sigmoid, +// y=-1 if x < -2.5 +// y=1 if x > 2.5 +// otherwise y = 0.2 * x + 0.5 (y=0.20315 * x + 0.5) +void local_hard_sigmoid_q7(q7_t *data, uint32_t size, int16_t dec_bit); + +// hard tanh +// y=-1 if x < -1 +// y=1 if x > 1 +// otherwise y = x +void local_hard_tanh_q7(q7_t *data, uint32_t size, int16_t dec_bit); + +// matrix ops +void local_mult_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// add +void local_add_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// sub +void local_sub_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// take multiple blocks (>2) as input +void local_multiple_add_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src); + +void local_multiple_mult_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src); + +void local_multiple_sub_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src); + + +// Below tables credit to CMSIS +// For more info. 
check CMSIS-NN lib +// https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c +static const q7_t nnom_sigmoid_table_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, + 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, + 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, + 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, + 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, + 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, + 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, + 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, + 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, + 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, + 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, + 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + + +static const q7_t nnom_tanh_table_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, + 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, + 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, + 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, + 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, + 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, + 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, + 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, + 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, + 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + + +// ------------ 16bit ops -------------------- + +void local_avepool_q15_HWC(const q15_t *Im_in, // input image + 
const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_avepool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_maxpool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_maxpool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_sumpool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t 
dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q15_t *Im_out); + +void local_sumpool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q15_t *Im_out); + +void local_up_sampling_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + + void local_up_sampling_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_convolve_HWC_q15_nonsquare(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); +void local_convolve_CHW_q15_nonsquare(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // 
kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_conv_trans_HWC_q15_nonsquare(const int8_t * Im_in, + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_depthwise_separable_conv_HWC_q15_nonsquare(const q15_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_depthwise_separable_conv_CHW_q15_nonsquare(const q15_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter 
kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_zero_padding_HWC_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_zero_padding_CHW_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_cropping_HWC_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_cropping_CHW_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + + +void local_dot_q15(const q15_t *pV, // pointer to vector + const q15_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t out_shift, // amount of right-shift for output + q15_t *pOut); // output operand) + +void local_dot_q15_opt(const q15_t * pV, + const q15_t * pM, + const uint16_t dim_vec, + const 
uint16_t num_of_rows, + const uint16_t out_shift, + q15_t * pOut); + +// original implementation +// this support none bias, the it will perform like a dot. +// set the `bias=NULL` to work +void local_fully_connected_mat_q7_vec_q15(const q15_t * pV, // pointer to vector + const q7_t * pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const q7_t * bias, // bias + q15_t * pOut, // output + q15_t * vec_buffer); // not used but to keep the interface same as the ARM's version + +// work on recorder matrix +// this support none bias, set the bias=NULL to work +void local_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q15_t * pOut, + q15_t * vec_buffer); + +// matrix operation Q15 +void local_multiple_add_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src); + +void local_multiple_mult_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src); + +void local_multiple_sub_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src); + +void local_mult_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// add +void local_add_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// sub +void local_sub_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// Convert Q7 to Q15 +void local_q7_to_q15_no_shift(const q7_t *src, q15_t *des, uint32_t size); +void local_q7_to_q15(const q7_t *src, q15_t *des, uint32_t size); + +// q15 shift to q7 +void local_q15_to_q7(const q15_t *src, q7_t *des, uint32_t shift, uint32_t size); + +// y = 1 - x +void local_1_minor_z_q15(q15_t *src, q15_t *des, uint16_t dec_bit, uint32_t size); + +void local_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out); +void local_hard_sigmoid_q15(q15_t *data, uint32_t size, int16_t dec_bit); +void local_hard_tanh_q15(q15_t *data, uint32_t size, int16_t dec_bit); +void local_relu_q15(q15_t *data, uint32_t size); +void local_leaky_relu_q15(q15_t *data, q7_t alpha, uint32_t size); +void local_adv_relu_q15(q15_t *data, q7_t negative_slope, q15_t max, q15_t threshold, uint32_t size); +void local_sigmoid_q15(q15_t * data, uint32_t size, uint16_t int_width); +void local_tanh_q15(q15_t * data, uint32_t size, uint16_t int_width); + + +static const q15_t nnom_sigmoid_table_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, + 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb, + 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, + 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, + 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, + 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, + 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, + 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, + 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, + 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, + 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 
0x7f72, + 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, + 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, + 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, + 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, + 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, + 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, + 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, + 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, + 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, + 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, + 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, + 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, + 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, + 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, + 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, + 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, + 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, + 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, + 0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + + +static const q15_t nnom_tanh_table_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, + 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6, + 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, + 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, + 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, + 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, + 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, + 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, + 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, + 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, + 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, + 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, + 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, + 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, + 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, + 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, + 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, + 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, + 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, + 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, + 0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 
0xf803,
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __NNOM_LOCAL_H__ */
diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom_tensor.h b/APP_Framework/Framework/knowing/nnom/inc/nnom_tensor.h
new file mode 100644
index 000000000..6853da868
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/inc/nnom_tensor.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-02-05     Jianjia Ma   The first version
+ * 2019-02-10     Jianjia Ma   Compiler supports dense net connection
+ */
+
+#ifndef __NNOM_TENSOR_H__
+#define __NNOM_TENSOR_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "nnom.h"
+
+
+void delete_tensor(nnom_tensor_t* t);
+nnom_tensor_t* new_tensor(nnom_qtype_t type, uint32_t num_dim, uint32_t num_channel);
+// set tensor attributes by value
+// for tensors quantized with NNOM_QTYPE_PER_TENSOR
+nnom_tensor_t* tensor_set_attr_v(nnom_tensor_t* t,
+        nnom_qformat_param_t dec_bit, nnom_qformat_param_t offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth);
+nnom_tensor_t* tensor_set_attr(nnom_tensor_t* t,
+        nnom_qformat_param_t *dec_bit, nnom_qformat_param_t *offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth);
+nnom_tensor_t* tensor_cpy_attr(nnom_tensor_t* des, nnom_tensor_t* src);
+size_t tensor_get_num_channel(nnom_tensor_t* t);
+size_t tensor_size(nnom_tensor_t* t);
+size_t tensor_size_byte(nnom_tensor_t* t);
+
+// only supports 3D tensors
+// converts the data format from HWC to CHW
+// des: destination (output) tensor, src: source (input) tensor
+void tensor_hwc2chw_q7(nnom_tensor_t* des, nnom_tensor_t* src);
+
+// converts the data format from CHW to HWC
+// des: destination (output) tensor, src: source (input) tensor
+void tensor_chw2hwc_q7(nnom_tensor_t* des, nnom_tensor_t* src);
+
+// deprecated.
+void hwc2chw_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out);
+void chw2hwc_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*__NNOM_TENSOR_H__ */
diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom_utils.h b/APP_Framework/Framework/knowing/nnom/inc/nnom_utils.h
new file mode 100644
index 000000000..88c5067d3
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/inc/nnom_utils.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-02-05     Jianjia Ma   The first version
+ */
+
+#ifndef __NNOM_UTILS_H__
+#define __NNOM_UTILS_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#include
+#include
+#include
+
+#include "nnom.h"
+
+typedef struct _nnom_predict_t
+{
+    uint16_t *confusion_mat;    // confusion matrix
+    uint32_t *top_k;            // stores the number of predictions at each rank k, e.g. Top-2 = top_k[0]+top_k[1]
+    nnom_model_t *model;        // the model to run
+    int8_t *buf_prediction;     // pointer to the output of the softmax layer (normally the end of the classifier).
+
+    // setting
+    uint32_t label_num;         // number of classes in the classification
+    uint32_t top_k_size;        // number of top-k ranks to record.
+
+    // running
+    uint32_t predict_count;     // how many predictions have been run
+
+    // timing
+    uint32_t t_run_total;       // total running time
+    uint32_t t_predict_start;   // timestamp when the prediction test was started
+    uint32_t t_predict_total;   // total time of the whole test
+} nnom_predict_t;
+
+// create a prediction instance
+// inputs: the model, the buffer pointer to the softmax output (temporary, this can be extracted from the model),
+// the size of the softmax output (the number of labels),
+// and the number of top-k ranks to record.
+nnom_predict_t *prediction_create(nnom_model_t *m, int8_t *buf_prediction, size_t label_num, size_t top_k_size); // currently int8_t
+
+// after new data has been set in the input,
+// feed the data to the prediction
+// input the current true label (range from 0 to the total number of labels - 1)
+// (the current input data should be set by the user manually to the input buffer of the model.)
+// return NN_ARGUMENT_ERROR if a parameter is invalid
+nnom_status_t prediction_run(nnom_predict_t *pre, uint32_t true_label, uint32_t* predict_label, float* prob);
+
+// mark the prediction test as finished
+void prediction_end(nnom_predict_t *pre);
+
+// free all resources
+void prediction_delete(nnom_predict_t *pre);
+
+// print the confusion matrix
+void prediction_matrix(nnom_predict_t *pre);
+
+// print the top-k results
+void prediction_top_k(nnom_predict_t *pre);
+
+// print a summary of the prediction test
+void prediction_summary(nnom_predict_t *pre);
+
+// -------------------------------
+
+// stand-alone prediction API
+// this API tests one set of data and returns the prediction
+// returns the predicted label
+// returns NN_ARGUMENT_ERROR if a parameter is invalid
+nnom_status_t nnom_predict(nnom_model_t *m, uint32_t *label, float *prob);
+
+void model_stat(nnom_model_t *m);
+
+void model_io_format(nnom_model_t *m);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*__NNOM_UTILS_H__ */
diff --git a/APP_Framework/Framework/knowing/nnom/port/nnom_port.h b/APP_Framework/Framework/knowing/nnom/port/nnom_port.h
new file mode 100644
index 000000000..c9105431f
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/port/nnom_port.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-02-05     Jianjia Ma   The first version
+ */
+
+#ifndef __NNOM_PORT_H__
+#define __NNOM_PORT_H__
+
+#include
+#include
+#include
+
+/* use static memory */
+// the buffer must be set using "nnom_set_static_buf()" before creating a model.
+
+/* dynamic memory interfaces */
+/* when libc is not available, implement the memory interfaces below (libc equivalents). */
+#ifndef NNOM_USING_STATIC_MEMORY
+    #define nnom_malloc(n)      malloc(n)
+    #define nnom_free(p)        free(p)
+#endif
+
+/* memory interface */
+/* when libc is not available, implement your equivalent functions here */
+#define nnom_memset(p,v,s)      memset(p,v,s)
+#define nnom_memcpy(dst,src,len) memcpy(dst,src,len)
+
+/* runtime & debug */
+#define nnom_us_get()       0   // return a microsecond timestamp
+#define nnom_ms_get()       0   // return a millisecond timestamp
+#define NNOM_LOG(...)       printf(__VA_ARGS__)
+
+/* NNoM configuration */
+#define NNOM_BLOCK_NUM      (8)     // maximum number of memory blocks; increase it when the compilation log requests more.
+#define DENSE_WEIGHT_OPT    (1)     // set to 1 if the fully-connected layers use the optimized (reordered) weight format.
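+/*
+ * Usage sketch (illustrative only, not part of this port header): when NNOM_USING_STATIC_MEMORY
+ * is defined, a memory pool has to be handed to NNoM before any model is created, as noted above.
+ * The pool name and size below are placeholders, and nnom_model_create()/model_run() stand in for
+ * the usual NNoM model API (the constructor is typically produced by the NNoM code-generation scripts).
+ *
+ *     static uint8_t nnom_pool[16 * 1024];                    // application-chosen pool size
+ *     nnom_set_static_buf(nnom_pool, sizeof(nnom_pool));      // must be called before model creation
+ *     nnom_model_t *model = nnom_model_create();              // build and compile the generated model
+ *     model_run(model);                                       // run one inference
+ */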
+ +#endif + + + diff --git a/APP_Framework/Framework/knowing/nnom/scripts/README.MD b/APP_Framework/Framework/knowing/nnom/scripts/README.MD new file mode 100644 index 000000000..54a62afa7 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/README.MD @@ -0,0 +1,4 @@ +fully_connected_opt_weight_generation.py - is from https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN/Scripts/NNFunctions witch is not a part of NNoM + +Please refer to NNoM documents for its usages. + diff --git a/APP_Framework/Framework/knowing/nnom/scripts/__init__.py b/APP_Framework/Framework/knowing/nnom/scripts/__init__.py new file mode 100644 index 000000000..5bb534f79 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/__init__.py @@ -0,0 +1 @@ +# package diff --git a/APP_Framework/Framework/knowing/nnom/scripts/fully_connected_opt_weight_generation.py b/APP_Framework/Framework/knowing/nnom/scripts/fully_connected_opt_weight_generation.py new file mode 100644 index 000000000..f68382b1f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/fully_connected_opt_weight_generation.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +''' + This file is apart of CMSIS-NN release + https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN/Scripts/NNFunctions +''' + +import numpy as np + +def convert_to_x4_q7_weights(weights): + [r, h, w, c] = weights.shape + weights = np.reshape(weights, (r, h*w*c)) + num_of_rows = r + num_of_cols = h*w*c + new_weights = np.copy(weights) + new_weights = np.reshape(new_weights, (r*h*w*c)) + counter = 0 + for i in range(int(num_of_rows/4)): + # we only need to do the re-ordering for every 4 rows + row_base = 4*i + for j in range(int(num_of_cols/4)): + # for each 4 entries + column_base = 4*j + new_weights[counter] = weights[row_base ][column_base ] + new_weights[counter+1] = weights[row_base+1][column_base ] + new_weights[counter+2] = weights[row_base ][column_base+2] + new_weights[counter+3] = weights[row_base+1][column_base+2] + new_weights[counter+4] = weights[row_base+2][column_base ] + new_weights[counter+5] = weights[row_base+3][column_base ] + new_weights[counter+6] = weights[row_base+2][column_base+2] + new_weights[counter+7] = weights[row_base+3][column_base+2] + + new_weights[counter+8] = weights[row_base ][column_base+1] + new_weights[counter+9] = weights[row_base+1][column_base+1] + new_weights[counter+10] = weights[row_base ][column_base+3] + new_weights[counter+11] = weights[row_base+1][column_base+3] + new_weights[counter+12] = weights[row_base+2][column_base+1] + new_weights[counter+13] = weights[row_base+3][column_base+1] + new_weights[counter+14] = weights[row_base+2][column_base+3] + new_weights[counter+15] = weights[row_base+3][column_base+3] + counter = counter + 16 + # the remaining ones are in order + for j in range((int)(num_of_cols-num_of_cols%4), int(num_of_cols)): + new_weights[counter] = weights[row_base][j] + new_weights[counter+1] = weights[row_base+1][j] + new_weights[counter+2] = weights[row_base+2][j] + new_weights[counter+3] = weights[row_base+3][j] + counter = counter + 4 + return new_weights + +def convert_to_x4_q15_weights(weights): + [r, h, w, c] = weights.shape + weights = np.reshape(weights, (r, h*w*c)) + num_of_rows = r + num_of_cols = h*w*c + new_weights = np.copy(weights) + new_weights = np.reshape(new_weights, (r*h*w*c)) + counter = 0 + for i in range(int(num_of_rows/4)): + # we only need to do the re-ordering for every 4 rows + row_base = 4*i + for j in range(int(num_of_cols/2)): + # for each 2 entries + 
column_base = 2*j + new_weights[counter] = weights[row_base ][column_base ] + new_weights[counter+1] = weights[row_base ][column_base+1] + new_weights[counter+2] = weights[row_base+1][column_base ] + new_weights[counter+3] = weights[row_base+1][column_base+1] + new_weights[counter+4] = weights[row_base+2][column_base ] + new_weights[counter+5] = weights[row_base+2][column_base+1] + new_weights[counter+6] = weights[row_base+3][column_base ] + new_weights[counter+7] = weights[row_base+3][column_base+1] + + counter = counter + 8 + # the remaining ones are in order + for j in range((int)(num_of_cols-num_of_cols%2), int(num_of_cols)): + new_weights[counter] = weights[row_base][j] + new_weights[counter+1] = weights[row_base+1][j] + new_weights[counter+2] = weights[row_base+2][j] + new_weights[counter+3] = weights[row_base+3][j] + counter = counter + 4 + return new_weights + +def convert_q7_q15_weights(weights): + [r, h, w, c] = weights.shape + weights = np.reshape(weights, (r, h*w*c)) + num_of_rows = r + num_of_cols = h*w*c + new_weights = np.copy(weights) + new_weights = np.reshape(new_weights, (r*h*w*c)) + counter = 0 + for i in range(int(num_of_rows/4)): + # we only need to do the re-ordering for every 4 rows + row_base = 4*i + for j in range(int(num_of_cols/2)): + # for each 2 entries + column_base = 2*j + new_weights[counter] = weights[row_base ][column_base ] + new_weights[counter+1] = weights[row_base+1][column_base ] + new_weights[counter+2] = weights[row_base ][column_base+1] + new_weights[counter+3] = weights[row_base+1][column_base+1] + new_weights[counter+4] = weights[row_base+2][column_base ] + new_weights[counter+5] = weights[row_base+3][column_base ] + new_weights[counter+6] = weights[row_base+2][column_base+1] + new_weights[counter+7] = weights[row_base+3][column_base+1] + + counter = counter + 8 + # the remaining ones are in order + for j in range((int)(num_of_cols-num_of_cols%2), int(num_of_cols)): + new_weights[counter] = weights[row_base][j] + new_weights[counter+1] = weights[row_base+1][j] + new_weights[counter+2] = weights[row_base+2][j] + new_weights[counter+3] = weights[row_base+3][j] + counter = counter + 4 + return new_weights + + +if __name__ == "__main__": + # input dimensions + vec_dim = 127 + row_dim = 127 + + weight = np.zeros((row_dim,vec_dim), dtype=int) + + # generate random inputs + for i in range(row_dim): + for j in range(vec_dim): + weight[i][j] = np.random.randint(256)-128 + + weight = np.reshape(weight, (row_dim, vec_dim, 1, 1)) + + outfile = open("../Ref_Implementations/fully_connected_testing_weights.h", "w") + outfile.write("#define IP2_WEIGHT {") + weight.tofile(outfile,sep=",",format="%d") + outfile.write("}\n\n") + + new_weight = convert_to_x4_q7_weights(weight) + outfile.write("#define IP4_WEIGHT {") + new_weight.tofile(outfile,sep=",",format="%d") + outfile.write("}\n\n") + + new_weight = convert_q7_q15_weights(weight) + outfile.write("#define IP4_q7_q15_WEIGHT {") + new_weight.tofile(outfile,sep=",",format="%d") + outfile.write("}\n\n") + + new_weight = convert_to_x4_q15_weights(weight) + outfile.write("#define IP4_WEIGHT_Q15 {") + new_weight.tofile(outfile,sep=",",format="%d") + outfile.write("}\n\n") + + + outfile.close() diff --git a/APP_Framework/Framework/knowing/nnom/scripts/gen_config.py b/APP_Framework/Framework/knowing/nnom/scripts/gen_config.py new file mode 100644 index 000000000..d1b787abd --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/gen_config.py @@ -0,0 +1,561 @@ +''' + Copyright (c) 2018-2020 + Jianjia Ma + 
majianjia@live.com + SPDX-License-Identifier: Apache-2.0 + Change Logs: + Date Author Notes + 2020-05-22 Jianjia Ma The first version +''' +from tensorflow.keras.layers import * +import numpy as np + +def convert_tensor_name(t): + return 'tensor_'+t.name.replace('/', '_').replace(':', '_') + +def to_cstyle(data, integer=True): + #Convert an array to C style basket, not to be used for very large array. size > options['threshold'] will lead to ... + if(integer): + data = np.array(data, dtype=np.int).flatten() + else: + data = np.array(data).flatten() + s = np.array2string(data, separator=',') + s = s.replace("\n","").replace("\r","").replace(' ','') + s = s.replace(',', ', ') + s = s.replace('(', '[').replace(')', ']') + return s.replace('[', '{').replace(']', '}') + +def tensor_shape(tensor, is_io_tensor=False): + # inconsistance of TF1 and TF2 + # get tensor shape without None or ? + try: + shape = tensor.shape.as_list() # tf1 + except: + shape = tensor.get_shape().as_list() # tf2 + if(shape[0] == None or is_io_tensor): + shape = shape[1:] + else: + shape = shape + # for rnn input with timestamp = None, need a better implementation + for i in range(len(shape)): + shape[i] = shape[i] if shape[i] is not None else 1 + return shape + +def gen_base_config(layer): + config = '{.name = "%s"}' % (layer.name) + return config + +def gen_values(var_name, var, size='', dtype='const int8_t'): + s = ' [] = ;\n' + s = s.replace('', var_name).replace('', var).replace('', size).replace('', dtype) + return s + +# generate tensor by the tensor config +def gen_tensor(tensor, dec_bits, tensor_value='NULL', per_axis=False, is_io_tensor=False): + config = ''' +const nnom_shape_data_t _dim[] = ; +const nnom_qformat_param_t _dec[] = ; +const nnom_qformat_param_t _offset[] = ; +const nnom_tensor_t = { + .p_data = (void*), + .dim = (nnom_shape_data_t*)_dim, + .q_dec = (nnom_qformat_param_t*)_dec, + .q_offset = (nnom_qformat_param_t*)_offset, + .qtype = , + .num_dim = , + .bitwidth = +}; +''' + # inconsistance of TF1 and TF2 + shape = tensor_shape(tensor, is_io_tensor) + config = config.replace('', convert_tensor_name(tensor))#.name.replace('/','_').split(':')[0]) #conv2d/kernel:0 + config = config.replace('', '8') + config = config.replace('', tensor_value) + config = config.replace('', to_cstyle(shape)) + config = config.replace('', str(len(shape))) + if(type(dec_bits) == str): + config = config.replace('', dec_bits) + config = config.replace('', to_cstyle([0])) + else: + config = config.replace('', to_cstyle(dec_bits)) + config = config.replace('', to_cstyle([0])) + if(per_axis): + config = config.replace('', 'NNOM_QTYPE_PER_AXIS') + else: + config = config.replace('', 'NNOM_QTYPE_PER_TENSOR') + return config + +# create tensor by directly setting up the value +def gen_create_tensor(tensor_name, shape, dec_bits, tensor_value='NULL', per_axis=False): + config = ''' +const nnom_shape_data_t _dim[] = ; +const nnom_qformat_param_t _dec[] = ; +const nnom_qformat_param_t _offset[] = ; +const nnom_tensor_t = { + .p_data = (void*), + .dim = (nnom_shape_data_t*)_dim, + .q_dec = (nnom_qformat_param_t*)_dec, + .q_offset = (nnom_qformat_param_t*)_offset, + .qtype = , + .num_dim = , + .bitwidth = +}; +''' + config = config.replace('', tensor_name) + config = config.replace('', '8') + config = config.replace('', tensor_value) + config = config.replace('', to_cstyle(shape)) + config = config.replace('', str(len(shape))) + if(type(dec_bits) == str): + config = config.replace('', dec_bits) + config = config.replace('', 
to_cstyle([0])) + else: + config = config.replace('', to_cstyle(dec_bits)) + config = config.replace('', to_cstyle([0])) + if(per_axis): + config = config.replace('', 'NNOM_QTYPE_PER_AXIS') + else: + config = config.replace('', 'NNOM_QTYPE_PER_TENSOR') + return config + +def gen_conv2d_config(layer, output_shifts, bias_shifts): + c = ''' +const nnom_qformat_param_t _output_shift[] = ; +const nnom_qformat_param_t _bias_shift[] = ; +const nnom_conv2d_config_t _config = { + .super = , + .qtype = , + .weight = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .output_shift = (nnom_qformat_param_t *)&_output_shift, + .bias_shift = (nnom_qformat_param_t *)&_bias_shift, + .filter_size = , + .kernel_size = , + .stride_size = , + .padding_size = , + .dilation_size = , + .padding_type = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', "NNOM_QTYPE_PER_TENSOR") + c = c.replace('',convert_tensor_name(layer.weights[0])) + c = c.replace('',convert_tensor_name(layer.weights[1])) + c = c.replace('', output_shifts) + c = c.replace('', bias_shifts) + c = c.replace('', str(layer.filters) if layer.filters is not None else str(layer.depth_multiplier)) # output channel + c = c.replace('', to_cstyle(layer.kernel_size)) + c = c.replace('', to_cstyle(layer.strides)) + c = c.replace('', '{0, 0}') # not using it with keras, defined by padding type instead + c = c.replace('', to_cstyle(layer.dilation_rate)) + c = c.replace('', 'PADDING_'+layer.padding.upper()) + return c + +def gen_conv2d_trans_config(layer, output_shifts, bias_shifts): + c = ''' +const nnom_qformat_param_t _output_shift[] = ; +const nnom_qformat_param_t _bias_shift[] = ; +const nnom_conv2d_trans_config_t _config = { + .super = , + .qtype = , + .weight = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .output_shift = (nnom_qformat_param_t *)&_output_shift, + .bias_shift = (nnom_qformat_param_t *)&_bias_shift, + .filter_size = , + .kernel_size = , + .stride_size = , + .padding_size = , + .dilation_size = , + .padding_type = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', "NNOM_QTYPE_PER_TENSOR") + c = c.replace('',convert_tensor_name(layer.weights[0])) + c = c.replace('',convert_tensor_name(layer.weights[1])) + c = c.replace('', output_shifts) + c = c.replace('', bias_shifts) + c = c.replace('', str(layer.filters)) # output channel + c = c.replace('', to_cstyle(layer.kernel_size)) + c = c.replace('', to_cstyle(layer.strides)) + c = c.replace('', '{0, 0}') # not using it with keras, defined by padding type instead + c = c.replace('', to_cstyle(layer.dilation_rate)) + c = c.replace('', 'PADDING_'+layer.padding.upper()) + return c + +def gen_dense_config(layer, output_shifts, bias_shift): + c = ''' +const nnom_qformat_param_t _output_shift[] = ; +const nnom_qformat_param_t _bias_shift[] = ; +const nnom_dense_config_t _config = { + .super = , + .qtype = , + .weight = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .output_shift = (nnom_qformat_param_t *)&_output_shift, + .bias_shift = (nnom_qformat_param_t *)&_bias_shift +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', "NNOM_QTYPE_PER_TENSOR") + c = c.replace('', convert_tensor_name(layer.weights[0])) + c = c.replace('', convert_tensor_name(layer.weights[1])) + c = c.replace('', output_shifts) + c = c.replace('', bias_shift) + return c + +def gen_io_config(layer, tensor_name): + c = ''' +const nnom_io_config_t _config = { + .super = , + 
.tensor = (nnom_tensor_t*)& +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', tensor_name) + return c + +def gen_output_config(previous_layer, dec_bits, output_num, value_name='nnom_output_data'): #cheat at the moments + c = ''' +const nnom_shape_data_t _dim[] = ; +const nnom_qformat_param_t _dec[] = ; +const nnom_qformat_param_t _offset[] = ; +const nnom_tensor_t = { + .p_data = (void*), + .dim = (nnom_shape_data_t*)_dim, + .q_dec = (nnom_qformat_param_t*)_dec, + .q_offset = (nnom_qformat_param_t*)_offset, + .qtype = , + .num_dim = , + .bitwidth = 8 +}; + +const nnom_io_config_t _config = { + .super = , + .tensor = (nnom_tensor_t*)& +}; +''' + shape = tensor_shape(previous_layer.output, is_io_tensor=True) + + c = c.replace('', 'tensor_output'+str(output_num)) + c = c.replace('', 'output'+str(output_num)) + c = c.replace('', '{.name = "output'+str(output_num)+'"}') # cheating at the moment. + c = c.replace('', value_name) + c = c.replace('', 'NNOM_QTYPE_PER_TENSOR') + c = c.replace('', str(len(shape))) + c = c.replace('', to_cstyle(shape)) + c = c.replace('', '{'+dec_bits+'}') + c = c.replace('', to_cstyle([0])) + return c + + +def gen_pooling_config(layer, output_shifts='0'): + c = ''' +const nnom_pool_config_t _config = { + .super = , + .padding_type = , + .output_shift = , + .kernel_size = , + .stride_size = , + .num_dim = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', 'PADDING_'+layer.padding.upper()) + c = c.replace('', to_cstyle(layer.pool_size)) + c = c.replace('', to_cstyle(layer.strides)) + c = c.replace('', str(len(layer.pool_size))) + c = c.replace('', output_shifts) # not used at the moment + return c + +def gen_gl_pooling_config(layer, output_shifts='0'): + c = ''' +const nnom_global_pool_config_t _config = { + .super = , + .output_shift = , +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', output_shifts) + return c + + + +def gen_matrix_config(layer, output_shift_name='0'): + c = ''' +const nnom_matrix_config_t _config = { + .super = , + .output_shift = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', output_shift_name) # not used at the moment + return c + +def gen_zero_padding_config(layer): + c = ''' +const nnom_zero_padding_config_t _config = { + .super = , + .pad = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + try: + c = c.replace('', to_cstyle(sum(layer.padding, ()))) + except: + pad = ((0, 0), layer.padding) + c = c.replace('', to_cstyle(sum(pad, ()))) + return c + +def gen_cropping_config(layer): + c = ''' +const nnom_cropping_config_t _config = { + .super = , + .pad = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + try: + c = c.replace('', to_cstyle(sum(layer.cropping, ()))) #((top_crop, bottom_crop), (left_crop, right_crop)) + except: + pad = ((0, 0), layer.cropping) + c = c.replace('', to_cstyle(sum(pad, ()))) + return c + +def gen_upsampling_config(layer): + c = ''' +const nnom_upsample_config_t _config = { + .super = , + .kernel = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', to_cstyle(layer.size)) + return c + +def gen_softmax_config(layer): + c = ''' +const nnom_softmax_config_t _config = { + .super = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + return c + +def 
gen_flatten_config(layer): + c = ''' +const nnom_flatten_config_t _config = { + .super = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + return c + +def gen_reshape_config(layer): + c = ''' +const nnom_shape_data_t _targeted_shape[] = ; +const nnom_reshape_config_t _config = { + .super = , + .dim = (nnom_shape_data_t*)_targeted_shape, + .num_dim = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', to_cstyle(layer.output_shape[1:])) + c = c.replace('', str(len(layer.output_shape[1:]))) + return c + +def gen_concat_config(layer): + c = ''' +const nnom_concat_config_t _config = { + .super = , + .axis = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', str(layer.axis)) + return c + +def gen_lambda_config(layer, run_func_name='NULL', build_func_name='NULL', free_func_name='NULL', parameters_name='NULL'): + c = ''' +const nnom_lambda_config_t _config = { + .super = , + .run_func_name = , + .build_func_name = , + .free_func_name = , + .parameters = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', run_func_name) + c = c.replace('', build_func_name) + c = c.replace('', free_func_name) + c = c.replace('', parameters_name) + return c + +def gen_rnn_config(layer): + c = ''' +const nnom_rnn_config_t _config = { + .super = , + .return_sequence = , + .stateful = , + .go_backwards = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', 'true' if layer.stateful else 'false') + c = c.replace('', 'true' if layer.go_backwards else 'false') + c = c.replace('', 'true' if layer.return_sequences else 'false') + return c + +def gen_simple_cell_config(layer, q_list): + c = ''' +const nnom_simple_cell_config_t _simple_cell_config = { + .super = , + .weights = (nnom_tensor_t*)&, + .recurrent_weights = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .q_dec_iw = , + .q_dec_hw = , + .q_dec_h = , + .act_type = , + .units = +}; +''' + try: + cell_cfg = layer.get_config()['cell']['config'] + except: + cell_cfg = layer.get_config() + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', convert_tensor_name(layer.weights[0])) + c = c.replace('', convert_tensor_name(layer.weights[1])) + c = c.replace('', convert_tensor_name(layer.weights[2])) + c = c.replace('', str(q_list[1])) # the qfmt of input x weight + c = c.replace('', str(q_list[2])) # q of hidden x recurrent weight + c = c.replace('', str(q_list[0])) # output, if act != relu, should be 7 (consider delete it.) + c = c.replace('', 'ACT_' + cell_cfg['activation'].upper()) + c = c.replace('', str(cell_cfg['units'])) + return c + +def gen_lstm_cell_config(layer, q_list): + c = ''' +const nnom_lstm_cell_config_t _lstm_cell_config = { + .super = , + .weights = (nnom_tensor_t*)&, + .recurrent_weights = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .q_dec_z = , + .q_dec_h = , + .q_dec_c = , + .units = +}; +''' + try: + cell_cfg = layer.get_config()['cell']['config'] + except: + cell_cfg = layer.get_config() + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', convert_tensor_name(layer.weights[0])) + c = c.replace('', convert_tensor_name(layer.weights[1])) + c = c.replace('', convert_tensor_name(layer.weights[2])) + c = c.replace('', str(q_list[0])) # output and memory state, (should be q0.7. 
consider delete it) + c = c.replace('', str(q_list[1])) # cell state + c = c.replace('', str(q_list[2])) # input*weight + hidden*weight + bias + c = c.replace('', str(cell_cfg['units'])) + return c + + + +def gen_gru_cell_config(layer, q_list): + c = ''' +const nnom_gru_cell_config_t _gru_cell_config = { + .super = , + .weights = (nnom_tensor_t*)&, + .recurrent_weights = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .q_dec_z = , + .q_dec_h = , + .units = +}; +''' + try: + cell_cfg = layer.get_config()['cell']['config'] + except: + cell_cfg = layer.get_config() + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', convert_tensor_name(layer.weights[0])) + c = c.replace('', convert_tensor_name(layer.weights[1])) + c = c.replace('', convert_tensor_name(layer.weights[2])) + c = c.replace('', str(q_list[0])) # + c = c.replace('', str(q_list[1])) # + c = c.replace('', str(cell_cfg['units'])) + return c + + +if __name__ == "__main__": + # test only + from tensorflow.keras.models import load_model + model = load_model("../model.h5") + print(gen_tensor(model.layers[1].weights[0], dec_bits=(1, 2, 3, 4, 5))) + print(gen_tensor(model.layers[1].weights[1], dec_bits=(1, 2, 3, 4, 5))) + print(gen_conv2d_config(model.layers[1], (1,2,3), 3)) + + with open("test.h", 'w') as fp: + # fp.write(gen_tensor(model.layers[1].weights[0], dec_bits=(1, 2, 3, 4, 5))) + # fp.write(gen_tensor(model.layers[1].weights[1], dec_bits=(1, 2, 3, 4, 5))) + # fp.write(gen_conv2d_config(model.layers[1], (1,2,3,))) + + fp.write('#include "nnom.h"\n') + + # test all + for layer in model.layers: + if(type(layer) in [Conv2D, Conv1D]): + for w in layer.weights: + fp.write(gen_tensor(w, [3])) + fp.write(gen_conv2d_config(layer, {0}, 2)) + elif(type(layer) in [Dense]): + for w in layer.weights: + fp.write(gen_tensor(w, [3])) + fp.write(gen_dense_config(layer, 2, 2)) + elif(type(layer) in [Input]): + fp.write(gen_io_config(layer, [9,1,1])) + elif(type(layer) in [MaxPooling2D, GlobalMaxPooling2D, AveragePooling2D, GlobalAveragePooling2D]): + fp.write(gen_pooling_config(layer)) + elif(type(layer) in [Multiply, Add, Subtract]): + fp.write(gen_matrix_config(layer)) + elif(type(layer) in [ZeroPadding2D, ZeroPadding1D]): + fp.write(gen_zero_padding_config(layer)) + elif(type(layer) in [Cropping2D, Cropping1D]): + fp.write(gen_cropping_config(layer)) + elif(type(layer) in [Softmax]): + fp.write(gen_softmax_config(layer)) + elif(type(layer) in [Flatten]): + fp.write(gen_flatten_config(layer)) + elif(type(layer) in [Concatenate]): + fp.write(gen_concat_config(layer)) + elif(type(layer) in [Lambda]): + fp.write(gen_lambda_config(layer)) + elif(type(layer) in [UpSampling2D, UpSampling1D]): + fp.write(gen_upsampling_config(layer)) + + diff --git a/APP_Framework/Framework/knowing/nnom/scripts/nnom.py b/APP_Framework/Framework/knowing/nnom/scripts/nnom.py new file mode 100644 index 000000000..45e6b30a7 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/nnom.py @@ -0,0 +1,1198 @@ +''' + Copyright (c) 2018-2020 + Jianjia Ma + majianjia@live.com + + SPDX-License-Identifier: Apache-2.0 + + Change Logs: + Date Author Notes + 2019-02-05 Jianjia Ma The first version +''' + +import sklearn.metrics as skmetrics +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorflow.keras.backend as K +from tensorflow.keras import * +from tensorflow.keras.layers import * +from fully_connected_opt_weight_generation import * +from gen_config import * +import scipy.stats +import time +import warnings + 
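Most of the `gen_*_config()` generators above format their C initialisers through `to_cstyle()`; a quick sketch of its output follows (assumptions: TensorFlow must be installed because importing `gen_config` pulls in `tensorflow.keras`, and an older NumPy release is needed since `to_cstyle()` still uses the removed `np.int` alias):

```python
# Sketch of the C-literal helper from gen_config.py used by every layer config generator.
from gen_config import to_cstyle

print(to_cstyle([28, 28, 1]))    # -> "{28, 28, 1}"  brace initialiser for the generated C arrays
print(to_cstyle((3, 3)))         # -> "{3, 3}"       tuples (e.g. kernel_size/strides) work the same way
# convert_tensor_name() in the same file maps a Keras name such as
# "conv2d/kernel:0" to the C identifier "tensor_conv2d_kernel_0".
```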
+model_major_version = 0 +model_sub_version = 4 +model_reversion = 3 + +#define NNOM_MAJORVERSION 0L /**< major version number */ +#define NNOM_SUBVERSION 4L /**< minor version number */ +#define NNOM_REVISION 3L /**< revise version number */ +#define NNOM_VERSION (NNOM_MAJORVERSION * 10000) + (NNOM_SUBVERSION * 100) + NNOM_REVISION) + +def fuse_bn_to_conv(layer): + # try to fuse BN layer to convolutional + if ('conv' in layer.name) and \ + ('batch_normalization' in layer.outbound_nodes[0].outbound_layer.name): + print("fusing batch normalization to", layer.name) + bn_layer = layer._outbound_nodes[0].outbound_layer + c_w = layer.get_weights()[0] + c_b = layer.get_weights()[1] + print('original weight max', c_w.max(), 'min', c_w.min()) + print('original bias max', c_b.max(), 'min', c_b.min()) + bn_gamma = bn_layer.get_weights()[0] + bn_beta = bn_layer.get_weights()[1] + bn_mean = bn_layer.get_weights()[2] + bn_variance = bn_layer.get_weights()[3] + epsilon = 1e-3 # default epsilon for tf.slim.batch_norm + if ('conv2d' in layer.name): + if "depthwise" in layer.name: # depthwise batchnorm params are ordered differently + for l in range(c_w.shape[3]): + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + c_w[i][j][k][l] *= bn_gamma[k*c_w.shape[3]+l] / np.sqrt(bn_variance[k*c_w.shape[3]+l] + epsilon) + depth_dim = c_w.shape[2] * c_w.shape[3] # test needed + # normal conv + else: + for l in range(c_w.shape[3]): + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + c_w[i][j][k][l] *= bn_gamma[l] / np.sqrt(bn_variance[l] + epsilon) + depth_dim = c_w.shape[3] + for l in range(depth_dim): + c_b[l] = (bn_gamma[l] * (c_b[l] - bn_mean[l]) / np.sqrt(bn_variance[l] + epsilon)) + bn_beta[l] + # conv1d + else: + epsilon = 1e-3 # default epsilon for tf.slim.batch_norm + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + if "depthwise" in layer.name: # depthwise batchnorm params are ordered differently + c_w[i][j][k] *= bn_gamma[j] / np.sqrt(bn_variance[j] + epsilon) + else: + c_w[i][j][k] *= bn_gamma[k] / np.sqrt(bn_variance[k] + epsilon) + + if "depthwise" in layer.name: + depth_dim = c_w.shape[1]*c_w.shape[2] # need to be tested + else: + depth_dim = c_w.shape[2] + for l in range(depth_dim): + c_b[l] = (bn_gamma[l] * (c_b[l] - bn_mean[l]) / np.sqrt(bn_variance[l] + epsilon)) + bn_beta[l] + + print('fused weight max', c_w.max(), 'min', c_w.min()) + print('fused bias max', c_b.max(), 'min', c_b.min()) + # write the weights back to the layer + # after that, the model will be destroyed.. need a better way to pass the new weight + layer.set_weights([c_w, c_b]) + +def generate_test_bin(x, y, name='test_data_with_label.bin'): + ''' + this method generate the + :param x: input x data size + :param y: input label (one hot label) + :return: + ''' + # quantize input x + dec_bits = find_dec_bits_max_min(x, bit_width=8) + x = np.round(x*2**dec_bits).clip(-128, 127).astype(np.int8) + # get label + if(len(y.shape) >1): + test_label = np.argwhere(y == 1).astype(np.int8) # test data + test_label = test_label[:, 1] + else: + test_label = y + + # get data + dat = x.astype(dtype="byte") # test data + batch_size = dat.shape[0] # total pices of data + dat = dat.flatten() # flatten to get the total size. + block_size = int(dat.size / batch_size) # this must be integer but... 
just to confirm + + # write (label x 128) (data_block x 128) + label_batch = 128 # the Y-modem example uses 128 batch + with open(name, 'wb') as f: + start = 0 + while start <= (test_label.size - label_batch): + test_label[start: start + label_batch].tofile(f) + dat[block_size * start: block_size * (start + label_batch)].tofile(f) + start += label_batch + + # the rest data + if (start < test_label.size): + rest_len = test_label.size - start + new_labls = test_label[start:] + new_labls = np.pad(new_labls, (0, label_batch - rest_len), mode='constant') + new_labls.tofile(f) + dat[block_size * start:].tofile(f) + + print("binary test file generated:", name) + print("test data length:", test_label.size) + return + +def is_shift_layer(layer): + ''' layer which can change the output encoding''' + #FIXME: add more which will change the output shift + if('input' in layer.name or + 'conv2d' in layer.name or + 'conv1d' in layer.name or + 'dense' in layer.name or + 'softmax' in layer.name or + 'sigmoid' in layer.name or + 'tanh' in layer.name or + ('add' in layer.name and 'zero' not in layer.name) or # the name, zero_padding contains 'add' + 'subtract' in layer.name or + 'multiply' in layer.name or + ('activation' in layer.name and layer.get_config()['activation'] == 'softmax')or + ('activation' in layer.name and layer.get_config()['activation'] == 'hard_sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'tanh') or + ('activation' in layer.name and layer.get_config()['activation'] == 'hard_tanh') or + is_rnn_layer(layer) + ): + return True + return False + +def is_shift_fixed(layer): + ''' layer which shift to a fixed value''' + #FIXME: add more which will change the output shift + if('softmax' in layer.name or + 'sigmoid' in layer.name or + 'tanh' in layer.name or + ('activation' in layer.name and layer.get_config()['activation'] == 'softmax') or + ('activation' in layer.name and layer.get_config()['activation'] == 'sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'hard_sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'tanh') or + ('activation' in layer.name and layer.get_config()['activation'] == 'hard_tanh') or + is_rnn_layer(layer) + ): + return True + return False + +def is_lstm_layer(layer): + if type(layer) is LSTM or 'lstm' in layer.name: + return True + if(type(layer) is RNN or 'rnn' in layer.name): + if(type(layer.cell) is LSTMCell or 'lstm' in layer.cell.name): + return True + return False + +def is_gru_layer(layer): + if type(layer) is GRU or 'gru' in layer.name: + return True + if(type(layer) is RNN or 'rnn' in layer.name): + if(type(layer.cell) is GRUCell or 'gru' in layer.cell.name): + return True + return False + +def is_rnn_layer(layer): + if( 'rnn' in layer.name or + is_lstm_layer(layer) or + is_gru_layer(layer) + ): + return True + return False + +def find_offset(data): + """ + Offset of the original data before quantisation + :param data: + :return: offset of the data block + """ + return np.average(data) + + +def find_dec_bits_max_min(data, bit_width=8, maximum_bit=32): + """ + A ragular non-saturated shift-based quantisation mathod. Using max/min values + :param data: + :param bit_width: + :param maximum_bit: maximum decimal bit. Incase sometime bias is too small lead to very large size dec bit + :return: + """ + max_val = abs(data.max()) - abs(data.max()/pow(2, bit_width)) # allow very small saturation. 
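As a quick orientation, the dec-bit bookkeeping that `find_dec_bits_max_min()` here (and `quantize_data()` further below) implements reduces to the following toy int8 example; the small saturation allowance used in the real function is omitted:

```python
# Toy q7 example of shift-based quantisation: pick dec_bits from the data range,
# scale by 2**dec_bits, round and clip to int8, then divide back to recover values.
import numpy as np

data = np.array([-2.7, 0.3, 1.9])
int_bits = int(np.ceil(np.log2(max(abs(data.max()), abs(data.min())))))  # -> 2
dec_bits = 7 - int_bits                                                  # -> 5, i.e. Q2.5 for 8-bit data
q = np.clip(np.round(data * 2 ** dec_bits), -128, 127).astype(np.int8)
print(dec_bits, q)            # 5 [-86  10  61]
print(q / 2.0 ** dec_bits)    # [-2.6875  0.3125  1.90625] -- recovered from the int8 values
```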
+ min_val = abs(data.min()) - abs(data.min()/pow(2, bit_width)) + int_bits = int(np.ceil(np.log2(max(max_val, min_val)))) + dec_bits = (bit_width-1) - int_bits + return min(dec_bits, maximum_bit) + +def find_dec_bits_max_min_axis(data, axis=-1,bit_width=8, maximum_bit=32): + """ + A ragular non-saturated shift-based quantisation mathod. Using max/min values + :param data: + :param axis: + :param bit_width: + :return: + """ + dec_bits = [] + # if(len(data.shape) < np.abs(axis)): # for depthwise with axis = -2 while len(shape) =1 + # size = data.shape[0] + # axis = 0 # + # else: + # size = data.shape[axis] + for i in np.arange(0, data.shape[axis]): + d = np.take(data, indices=i, axis=axis) + max_val = abs(d.max()) - abs(d.max() / pow(2, bit_width)) # allow very small saturation. + min_val = abs(d.min()) - abs(d.min() / pow(2, bit_width)) + int_bit = int(np.ceil(np.log2(max(abs(max_val), abs(min_val))))) + dec_bit = (bit_width-1) - int_bit + dec_bits.append(min(dec_bit, maximum_bit)) + return dec_bits + +def find_dec_bits_kld(data, bit_width=8, scan_times=4, maximum_bit=16): + """ + # saturation shift, using KLD method (Kullback-Leibler divergence) + # Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + :param data: The data for looking for quantisation + :param bit_width: the bitwidth of the data + :param scan_times: the times to try the best kld (normally the second is the best.) + :return: dec bit width for this data + """ + # do a regular non-saturated quantisation + max_val = data.max() + min_val = data.min() + abs_max = max(abs(max_val), abs(min_val)) + int_bits = int(np.ceil(np.log2(max(abs(max_val), abs(min_val))))) + dec_bits = (bit_width-1) - int_bits + + # now looking for the best quantisation using KLD method + small_var = 1e-5 + bins = np.arange(-abs_max, abs_max, abs_max / 2048 * 2) + q_bins = np.arange(-abs_max, abs_max, abs_max / 256 * 2) + flat_hist = np.histogram(data.flatten(), bins=bins)[0] + kl_loss = [] + kl_shifts = [] + for shift in range(scan_times): + t = 2 ** (dec_bits + shift) # 2-based threshold + act = np.round(data.flatten() * t) + act = act / t + act = np.clip(act, -128 / t, 127 / t) + act = np.histogram(act, bins=q_bins)[0] + act_hist = np.zeros(2047) + chunk = int(2048 / 256) + for i in range(int(255)): + none_zero = np.count_nonzero(flat_hist[i * chunk:(i + 1) * chunk]) + if none_zero == 0: + continue + for j in range(chunk): + act_hist[i * chunk + j] = act[i] / none_zero if flat_hist[i * chunk + j] != 0 else 0 + flat_hist[flat_hist == 0] = small_var + act_hist[act_hist == 0] = small_var + kl = scipy.stats.entropy(flat_hist, act_hist) + kl_loss.append(kl) + kl_shifts.append(dec_bits + shift) + + # now get the least loss from the scaned kld shift + dec_bits = kl_shifts[np.argmin(kl_loss)] # set the dec_bit to the KLD results + return min(dec_bits, maximum_bit) + +# convert to [-128,128) or int8 +def quantize_data(data, dec_bits, axis=-1, per_axis=False, bitwith=8): + if (per_axis): + out = [] + for i in np.arange(0, data.shape[axis]): + d = np.take(data, indices=i, axis=axis) + d = np.round(d * 2 ** dec_bits[i]) + d = np.clip(d, -2**(bitwith-1), 2**(bitwith-1)-1) + d = np.expand_dims(d, axis=axis) + out.append(d) + out = np.concatenate(out, axis=axis) + return out + else: + return np.clip(np.round(data * 2 ** dec_bits), -2**(bitwith-1), 2**(bitwith-1) -1) + +def quantize_rnn_intermediate_output(layer, features): + def nnom_sigmoid(data): + return 1 / (1 + np.exp(-data)) + def nnom_tanh(data): + return 
np.tanh(data) + def split_array(d, num): + l = len(d) + if(num==4): + return d[:int(l/4)], d[int(l/4): int(l/2)], d[int(l/2):-int(l/4)], d[-int(l/4):] + elif(num==3): + return d[:int(l/3)], d[int(l/3): -int(l/3)], d[-int(l/3):] + lcfg = layer.get_config() + if(lcfg['go_backwards']): + features = features[:,::-1,:] # reverse timestamp + + if(type(layer.cell) is SimpleRNNCell): + cfg = layer.cell.get_config() + state = np.zeros(cfg['units']) + kernel = layer.get_weights()[0] + recurrent_kernel = layer.get_weights()[1] + bias = layer.get_weights()[2] + # replicate keras's implementation + def simple_cell_step(inputs, state, kernel, recurrent_kernel, bias, activation): + h = np.dot(inputs, kernel) + h = np.add(h, bias) + h2 = np.dot(state, recurrent_kernel) + output = h + h2 + output = activation(output) + return output, h, h2 + output_arrary = [] + h_array = [] + h2_array = [] + activation = nnom_tanh if cfg['activation'] is 'tanh' else nnom_sigmoid + state = np.zeros(cfg['units']) + for feature in features: + if(not layer.stateful): + state = np.zeros(cfg['units']) + for fe in feature: + output, h, h2 = simple_cell_step(fe, state, kernel, recurrent_kernel, bias, activation) + state = output + output_arrary.append(output) + h_array.append(h) + h2_array.append(h2) + output_arrary = np.array(output_arrary) + h_array = np.array(h_array) + h2_array = np.array(h2_array) + # qout = find_dec_bits_kld(output_arrary) + # qh = find_dec_bits_kld(h_array) + # qh2 = find_dec_bits_kld(h2_array) + qout = find_dec_bits_max_min(output_arrary) + qh = find_dec_bits_max_min(h_array) + qh2 = find_dec_bits_max_min(h2_array) + return [qout, qh, qh2] + + elif (type(layer.cell) is LSTMCell or 'lstm' in layer.cell.name): + cfg = layer.cell.get_config() + state = np.zeros(cfg['units']*2) + kernel = layer.get_weights()[0] + recurrent_kernel = layer.get_weights()[1] + bias = layer.get_weights()[2] + def lstm_cell_step(cell_inputs, cell_states, kernel, recurrent_kernel, bias): + h_tm1 = cell_states[0] # previous memory state + c_tm1 = cell_states[1] # previous carry state + z1 = np.dot(cell_inputs, kernel) + z1 = np.add(z1, bias) + z2 = np.dot(h_tm1, recurrent_kernel) + z = z1+z2 # -----> q_z + z0, z1, z2, z3 = split_array(z, 4) + i = nnom_sigmoid(z0) # q0.7 + f = nnom_sigmoid(z1) # q0.7 + c1 = f*c_tm1 + c2 = i*nnom_tanh(z2) # q0.7 + c = c1 + c2 # -----> q_c + o = nnom_sigmoid(z3) # q0.7 + tc = nnom_tanh(c) + h = o * tc # q0.7 + return h, [h, c], z ,z0, z1, z2, z3 + h_array = [] + c_array = [] + z_array = [] + z0_array = [] + z1_array = [] + z2_array = [] + z3_array = [] + state = [np.zeros(cfg['units']), np.zeros(cfg['units'])] + for feature in features: + if(not layer.stateful): + state = [np.zeros(cfg['units']), np.zeros(cfg['units']) ] + for fe in feature: + output, state, z, z0, z1, z2, z3 = lstm_cell_step(fe, state, kernel, recurrent_kernel, bias) + h_array.append(output) + c_array.append(state[1]) + z_array.append(z) + z0_array.append(z0) + z1_array.append(z1) + z2_array.append(z2) + z3_array.append(z3) + h_array = np.array(h_array) + c_array = np.array(c_array) + z_array = np.array(z_array) + z0_array = np.array(z0_array) + z1_array = np.array(z1_array) + z2_array = np.array(z2_array) + z3_array = np.array(z3_array) + # q_h = find_dec_bits_kld(h_array) + # q_c = find_dec_bits_kld(c_array) + # q_z = find_dec_bits_kld(z_array) + # q_z0 = find_dec_bits_kld(z0_array) + # q_z1 = find_dec_bits_kld(z1_array) + # q_z2 = find_dec_bits_kld(z2_array) + # q_z3 = find_dec_bits_kld(z3_array) + q_h = 
find_dec_bits_max_min(h_array) + q_c = find_dec_bits_max_min(c_array) + q_z = find_dec_bits_max_min(z_array) + q_z0 = find_dec_bits_max_min(z0_array) # not needed. + q_z1 = find_dec_bits_max_min(z1_array) + q_z2 = find_dec_bits_max_min(z2_array) + q_z3 = find_dec_bits_max_min(z3_array) + return [q_h, q_c, q_z] + + elif (type(layer.cell) is GRUCell or 'gru' in layer.cell.name): + cfg = layer.cell.get_config() + state = np.zeros(cfg['units']) + k = layer.get_weights()[0] + rk = layer.get_weights()[1] + bias = layer.get_weights()[2] + + def gru_cell_step(cell_inputs, cell_states, kernel, recurrent_kernel, input_bias, recurrent_bias): + h_tm1 = cell_states[0] + # inputs projected by all gate matrices at once + matrix_x = np.dot(cell_inputs, kernel) + input_bias + x_z, x_r, x_h = split_array(matrix_x, 3) + # hidden state projected by all gate matrices at once + matrix_inner = np.dot(h_tm1, recurrent_kernel) + recurrent_bias + recurrent_z, recurrent_r, recurrent_h = split_array(matrix_inner, 3) + z = nnom_sigmoid(x_z + recurrent_z) + r = nnom_sigmoid(x_r + recurrent_r) + hh = nnom_tanh(x_h + r * recurrent_h) + # previous and candidate state mixed by update gate + # h = z * h_tm1 + (1 - z) * hh + h1 = z*h_tm1 + h2 = 1-z + h3 = h2 * hh + h = h1 + h3 + return h, [h], matrix_x, matrix_inner + h_array = [] + z_array = [] + i_array=[] + state = [np.zeros(cfg['units'])] + for feature in features: + if (not layer.stateful): + state = [np.zeros(cfg['units'])] + for fe in feature: + output, state, z, i = gru_cell_step(fe, state, k, rk, bias[0], bias[1]) + h_array.append(output) + z_array.append(z) + i_array.append(i) + h_array = np.array(h_array) + i_array = np.array(i_array) + z_array = np.array(z_array) + # q_h = find_dec_bits_kld(h_array) + # q_i = find_dec_bits_kld(i_array) + # q_z = find_dec_bits_kld(z_array) + q_h = find_dec_bits_max_min(h_array) + q_i = find_dec_bits_max_min(i_array) + q_z = find_dec_bits_max_min(z_array) + q_z = min(q_i, q_z) + return [q_h, q_z] + return [] + +def quantize_output(model, x_test, quantize_method='max_min', layer_offset=False, calibrate_size=None): + # limit the test data size + if(calibrate_size is not None): + if (x_test.shape[0] > calibrate_size): + x_test = x_test[:calibrate_size] + # test, show the output ranges + layer_q_list = {} + # FIXME: only support one input + if (type(model.layers[0]) != InputLayer): + L = [model.input] + model.layers + else: + L = model.layers + + for layer in L: # layer loop + if ("input" in layer.name): + features = x_test + else: + # rnn need a further step to determine the intermediate q format + if (is_rnn_layer(layer)): + in_layer = layer.inbound_nodes[0].inbound_layers + layer_model = Model(inputs=model.input, outputs=in_layer.output) + bs = model.input.shape[0] + features = layer_model.predict(x_test, batch_size=bs) + intermediate_dec = quantize_rnn_intermediate_output(layer, features) + print(layer.name, 'dec bit', intermediate_dec) + layer_q_list['intermediate_' + layer.name] = intermediate_dec + + # batch_normalization will need to be handled differently, since we are fusing the weight to its previosu conv. 
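The batch-normalization handling mentioned here relies on the folding done by `fuse_bn_to_conv()` earlier in this file; below is a small NumPy check of that identity, using synthetic per-channel values and the normal-conv ordering only (not the depthwise case):

```python
# Check the BN-into-conv folding:
#   w' = w * gamma / sqrt(var + eps)                    (per output channel)
#   b' = gamma * (b - mean) / sqrt(var + eps) + beta
# so BatchNorm(conv(x)) == conv'(x) with the fused kernel and bias.
import numpy as np

ch = 4
z = np.random.randn(ch)                              # stand-in for the pre-bias conv response per channel
b = np.random.randn(ch)                              # conv bias
gamma, beta = np.random.randn(ch), np.random.randn(ch)
mean, var = np.random.randn(ch), np.abs(np.random.randn(ch)) + 0.1
eps = 1e-3                                           # the epsilon the script assumes

scale = gamma / np.sqrt(var + eps)
b_fused = scale * (b - mean) + beta
assert np.allclose(gamma * ((z + b) - mean) / np.sqrt(var + eps) + beta,   # BN applied after the conv
                   z * scale + b_fused)                                    # fused conv (kernel scaling shows up as z*scale)
```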
+ # sigmoid and tanh are different, their shift is fixed to 7 + if (is_shift_layer(layer) or + ('batch_normalization' in layer.name)): + layer_model = Model(inputs=model.input, outputs=layer.output) + bs = model.input.shape[0] + features = layer_model.predict(x_test, batch_size=bs) + else: + # leave the features not changed, so this layer shift will be the same as its inputs + pass + + # we currently only support one offset for a layer output. + if(layer_offset): + offset = find_offset(features) + features = features - offset + else: + offset = 0 + # saturated shift using KLD method OR non saturated shift using max-min + if ("kld" in quantize_method + and not is_shift_fixed(layer) + and "input" not in layer.name + and "dense" not in layer.name): # test, also do not use kld in input layer + dec_bits = find_dec_bits_kld(features, bit_width=8, scan_times=4) + print(layer.name,"Quantized method:", "KLD", "Values max:", np.max(features), "min:", np.min(features), "dec bit", dec_bits) + else: + dec_bits = find_dec_bits_max_min(features, bit_width=8) + print(layer.name,"Quantized method:","max-min"," Values max:", np.max(features), "min:", np.min(features), "dec bit", dec_bits) + # quantise offset + offset = int(np.round(offset * 2 ** dec_bits)) + # record the shift + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + layer_q_list[layer.name.split(':')[0]] = [dec_bits, offset] + else: + layer_q_list[layer.name] = [dec_bits, offset] + if ('batch_normalization' in layer.name): + layer_q_list[layer.inbound_nodes[0].inbound_layers.name] = [dec_bits, offset] # use the bn layer shift to update the last layer. + + # scan the layers backward, try to unify the dec bit in multiple input layers, (add, mult... concat...etc.) + LM = {} + for layer in model.layers: + LM[layer.name] = layer + L = [l for l in model.layers[1:]] + L.reverse() + def update_previous_layer_shift(layer, dec_bit): + if(type(layer.input) == list): + for inp in layer.input: + iname = inp.name.split('/')[0] + if('input' in iname): + continue + layer_q_list[iname][0] = dec_min + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], dec_bit) + else: + iname = layer.input.name.split('/')[0] + if('input' in iname): + return + layer_q_list[iname][0] = dec_min + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], dec_bit) + for layer in L: + if(type(layer.input) == list): + iname = layer.input[0].name.split('/')[0].split(':')[0] + dec_min = layer_q_list[iname][0] + # find min dec bit in these input + for inp in layer.input: + iname = inp.name.split('/')[0].split(':')[0] + if(layer_q_list[iname][0] < dec_min): + dec_min = layer_q_list[iname][0] + if(layer_q_list[iname][0] != dec_min): + bFlag = True + for inp in layer.input: + iname = inp.name.split('/')[0].split(':')[0] + layer_q_list[iname][0] = dec_min + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], dec_min) + print('set dec bit', dec_min, 'for the input of', layer.name, ':', [inp.name.split('/')[0] for inp in layer.input]) + if(not is_shift_layer(layer) or dec_min < layer_q_list[layer.name][0]): # update current layer's shift only when we cannot change the shift + layer_q_list[layer.name][0] = dec_min + # quantise offset + print("quantisation list", layer_q_list) + return layer_q_list + + +def layer_name_from_tensor(t): + return t.name.replace(':','/').split('/')[0] + + +def quantize_weights(model, name='weights.h', format='hwc', per_channel_quant=True, layer_q_list=None): + # Quantize 
weights to 8-bits using (min,max) and write to file + f = open(name, 'w') + f.write('#include "nnom.h"\n\n') + f.write('/* Weights, bias and Q format */\n') + f.close() + for curr_idx, layer in enumerate(model.layers): + if (not layer.weights): + continue + # before merging bn layer, check if the bn is "legally" after Conv + if('batch_normalization' in layer.name) and \ + ('conv' not in layer.inbound_nodes[0].inbound_layers.name): + raise Exception('Only support batch_normalization placed after conv', layer.name, + layer.inbound_nodes[0].inbound_layers.name) + # try to fuse BN layer to convolutional + if ('conv' in layer.name) and \ + ('batch_normalization' in layer.outbound_nodes[0].outbound_layer.name): + fuse_bn_to_conv(layer) + # generate weights and bias now + weight_dec_shift = 0 + print('quantizing weights for layer', layer.name) + layer_weights = layer.get_weights() + for idx, var in enumerate(layer_weights): + var_name = convert_tensor_name(layer.weights[idx]) + var_values = var + if("kernel" not in var_name and 'bias' not in var_name): # ignore batchnormalisation's parameters + continue + + if (per_channel_quant and type(layer) in [Conv2D, Conv1D, DepthwiseConv2D, Conv2DTranspose]): + if(type(layer) in [DepthwiseConv2D] and "kernel" in var_name): #depthwise kernel quantised by + shape = var_values.shape[:2] + (-1,) # need to combine the mult and channel first + var = var_values.reshape(shape) + dec_bits = find_dec_bits_max_min_axis(var, axis=-1, bit_width=8) + elif(type(layer) in [Conv2DTranspose]): + dec_bits = find_dec_bits_max_min_axis(var_values, axis=-2, bit_width=8) + else: + dec_bits = find_dec_bits_max_min_axis(var_values, bit_width=8) + else: + dec_bits = find_dec_bits_max_min(var_values, bit_width=8) + print(' ', var_name, "dec bit", dec_bits) + + # kernel dec, bias dec, bias shift, output shift + if(is_shift_layer(layer) and not is_rnn_layer(layer)): + inp = layer.input.name.replace(':','/').split('/')[0] + layer_input_dec = layer_q_list[inp][0] + layer_output_dec = layer_q_list[layer.name][0] + if ("kernel" in var_name): + weight_dec_shift = dec_bits + else: + # channel wise + if hasattr(dec_bits, '__len__'): + bias_shift = np.full(len(dec_bits), layer_input_dec)+weight_dec_shift-dec_bits + layer_output_shift = np.full(len(weight_dec_shift), layer_input_dec) + weight_dec_shift \ + - np.full(len(weight_dec_shift), layer_output_dec) + if (np.min(bias_shift) < 0): + for i, w_dec in enumerate(weight_dec_shift): + if (bias_shift[i] < 0): + dec_bits[i] = w_dec + bias_shift[i] = 0 + # layer wise + else: + bias_shift = layer_input_dec + weight_dec_shift - dec_bits + layer_output_shift = layer_input_dec + weight_dec_shift - layer_output_dec + if (bias_shift < 0): + dec_bits = weight_dec_shift + bias_shift = 0 + # RNN layer's kernel dec, bias dec, bias shift, output shift + if(is_rnn_layer(layer)): + inp = layer.input.name.replace(':','/').split('/')[0] + layer_input_dec = layer_q_list[inp][0] + layer_output_dec = layer_q_list[layer.name][0] + #if (type(layer.cell) is SimpleRNNCell): + if ("kernel" in var_name and 'recurrent' not in var_name): + weight_dec_shift = dec_bits + elif ('bias' in var_name): + bias_shift = layer_input_dec + weight_dec_shift - dec_bits + layer_output_shift = layer_input_dec + weight_dec_shift - layer_output_dec # this is not valid + if (bias_shift < 0): + dec_bits = weight_dec_shift + bias_shift = 0 + + # now quantise them + if(type(layer) in [Conv2D, Conv1D, DepthwiseConv2D, Conv2DTranspose]): + if(type(layer) in [DepthwiseConv2D] and "kernel" in 
var_name): + old_shape = var_values.shape + var_values = quantize_data(var_values.reshape(var_values.shape[:2] + (-1,)), + dec_bits, axis=-1, per_axis=per_channel_quant) # convert to [h, w, out x mult] + var_values = var_values.reshape(old_shape) # convert the shape back to [h, w, out, mult] + elif(type(layer) in [Conv2DTranspose] and "kernel" in var_name): + var_values = quantize_data(var_values, dec_bits, axis=-2, per_axis=per_channel_quant) # [h, w, out, in] + else: + var_values = quantize_data(var_values, dec_bits, per_axis=per_channel_quant) # [h, w, in, out] + else: + var_values = quantize_data(var_values, dec_bits, per_axis=False) + + # CHW format + if ('chw' in format): + if (is_lstm_layer(layer) or is_gru_layer(layer)): # currently we use 16 bit intermediate, use reorder optimation + transposed_wts = np.transpose(var_values) + if('kernel' in var_name): + transposed_wts = convert_q7_q15_weights(np.reshape(transposed_wts ,(transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + # dense and rnn still working under HWC format + elif ("dense" in var_name or is_rnn_layer(layer)) and "kernel" in var_name: + transposed_wts = np.transpose(var_values) + transposed_wts = convert_to_x4_q7_weights(np.reshape(transposed_wts, (transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + # all other kernels, bias stay the same + else: + transposed_wts = var_values + # HWC format (NNOM/CMSIS-NN use [out_ch, h, w, in_ch], in C order) + else: + if (len(var_values.shape) == 3): # 1D convolution layer weights + transposed_wts = np.transpose(var_values, (2, 0, 1)) + elif (len(var_values.shape) == 4): # 2D convolution layer weights + if(type(layer) == Conv2DTranspose): # test + transposed_wts = np.transpose(var_values, (2, 0, 1, 3)) + elif type(layer) == DepthwiseConv2D: + transposed_wts = var_values#np.transpose(var_values, (0, 1, 3, 2)) # [h, w, out, mult] test for multiplier + else: + transposed_wts = np.transpose(var_values, (3, 0, 1, 2)) + elif(is_lstm_layer(layer) or is_gru_layer(layer)): # currently we use 16 bit intermediate, use reorder optimation + if('kernel' in var_name): + transposed_wts = np.transpose(var_values) + transposed_wts = convert_q7_q15_weights(np.reshape(transposed_wts ,(transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + else: # bias will not need to be transposed (for GRU which has 2d bias) + transposed_wts = var_values + else: # fully connected layer weights or biases of any layer + # test, use opt weight reorder + transposed_wts = np.transpose(var_values) + if ("dense" in var_name or is_rnn_layer(layer)) and "kernel" in var_name: # and other RNN layers + transposed_wts = convert_to_x4_q7_weights(np.reshape(transposed_wts ,(transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + + with open(name, 'a') as f: + def write_weights(f, name, value): + f.write('#define ' + name + ' {') + value.tofile(f, sep=", ", format="%d") + f.write('}\n\n') + # weights or bias + write_weights(f, var_name.upper(), transposed_wts) + # dec bits + write_weights(f, var_name.upper()+'_DEC_BITS' , np.array(dec_bits)) + # for test + if( "bias" in var_name): + f.write('#define ' + layer.name.upper() + '_BIAS_LSHIFT '+to_cstyle(bias_shift) +'\n\n') + #f.write('#define ' + layer.name.upper() + '_OUTPUT_DEC '+ to_cstyle(layer_output_dec)+'\n\n') # not here + f.write('#define ' + layer.name.upper() + '_OUTPUT_RSHIFT ' + to_cstyle(layer_output_shift)+'\n\n') + + +def generate_model(model, x_test, per_channel_quant=False, name='weights.h', format='hwc', quantize_method='max_min'): + """ + :param 
model: + :param x_test: + :param name: + :param format: + :param quantize_method: "max_min" or "kld" + :return: + """ + # get the quantize output range/format + layer_q_list = quantize_output(model, x_test, layer_offset=False, quantize_method=quantize_method) + # quantize weights and output shift + quantize_weights(model, per_channel_quant=per_channel_quant, name=name, format=format, layer_q_list=layer_q_list) + # now generate the model + if (type(model.layers[0]) != InputLayer): + L = [model.input] + model.layers + else: + L = model.layers + with open(name, 'a') as fp: + # generate the list of output + fp.write('\n/* output q format for each layer */\n') + for layer in L: + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + iname = layer.name.split(':')[0] + else: + iname = layer.name + fp.write('#define %s_OUTPUT_DEC %s\n' % (iname.upper(), layer_q_list[iname][0])) + fp.write('#define %s_OUTPUT_OFFSET %s\n' % (iname.upper(), layer_q_list[iname][1])) + fp.write('\n/* bias shift and output shift for none-weighted layer */\n') + + # generate output shift for the layers without weights (weighted layers were generated in quantize_weights) + for layer in model.layers: + if (is_shift_layer(layer)): + iname = layer.name.upper() + # add, sub + if ('add' in layer.name or 'subtract' in layer.name): + # only consider the first, they have been set to same in out_put_range() + inp = layer.input[0].name.replace(':', '/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_DEC-{0}_OUTPUT_DEC)\n'.format( + iname, inp)) + fp.write( + '#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format( + iname)) + # mult is different, Q3.4 * Q3.4 = Q6.8. if mult out is Q4.3, then shift (Q.4+q.4)-Q.3=5. Am I right? + elif ('multiply' in layer.name): + inp = layer.input[0].name.replace(':', '/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_DEC*2-{0}_OUTPUT_DEC)\n'.format( + iname, inp)) + fp.write( + '#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format( + iname)) + + fp.write('\n/* tensors and configurations for each layer */\n') + LI = {} + ID = 0 + + def is_skipable_layer(layer): + # FIXME: add more that could be skiped + if ('lambda' in layer.name or + 'dropout' in layer.name or + 'gaussian_noise' in layer.name or + 'batch_normalization' in layer.name + #or ('flatten' in layer.name and 'chw' not in format) + ): # flatten layer can be skipped in HWC but needed in CHW + return True + return False + + output_num = 0 + for id, layer in enumerate(L): + if (is_skipable_layer(layer)): + inp = layer.input.name.replace(':', '/').split('/')[0] + LI[layer.name] = (LI[inp][0], layer) + else: + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + LI[layer.name.split(':')[0]] = (ID, layer) + else: + LI[layer.name] = (ID, layer) + ID += 1 + + def gen_weight_tensor(w, per_axis): + var_cname = convert_tensor_name(w) + '_data' + dec_bits_name = convert_tensor_name(w).upper() + '_DEC_BITS' + fp.write(gen_values(var_cname, convert_tensor_name(w).upper())) + fp.write(gen_tensor(w, dec_bits=dec_bits_name, tensor_value=var_cname, per_axis=per_axis)) + + # output the config of all layer + if (type(layer) in [InputLayer] or 'input' in layer.name): + if(type(layer) == tf.Tensor): + raise Exception('Not yet support tensor as input/or Sequential model. 
' + 'please use Input layer as your first layer in the model', layer.name, layer) + size = 1 + for s in layer.input.shape[1:]: + size *= s if s is not None else 1 + fp.write(gen_values('nnom_input_data', '{0}', size=str(size), dtype='static int8_t')) + fp.write(gen_tensor(layer.input, layer_q_list[layer.name][0], tensor_value='nnom_input_data', is_io_tensor=True)) + fp.write(gen_io_config(layer, tensor_name=convert_tensor_name(layer.input))) + elif (type(layer) in [Conv2D, Conv1D, DepthwiseConv2D]): + for w in layer.weights: + gen_weight_tensor(w, per_axis=per_channel_quant) + fp.write(gen_conv2d_config(layer, layer.name.upper() +'_OUTPUT_RSHIFT', layer.name.upper() +'_BIAS_LSHIFT')) + elif (type(layer) in [Conv2DTranspose]): + for w in layer.weights: + gen_weight_tensor(w, per_axis=per_channel_quant) + fp.write(gen_conv2d_trans_config(layer, layer.name.upper() +'_OUTPUT_RSHIFT', layer.name.upper() +'_BIAS_LSHIFT')) + elif (type(layer) in [Dense]): + for w in layer.weights: + gen_weight_tensor(w, per_axis=False) + fp.write(gen_dense_config(layer, layer.name.upper() +'_OUTPUT_RSHIFT', layer.name.upper() +'_BIAS_LSHIFT')) + elif (type(layer) in [MaxPooling2D, AveragePooling2D, MaxPooling1D, AveragePooling1D]): + fp.write(gen_pooling_config(layer)) + elif (type(layer) in [GlobalMaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling1D, GlobalAveragePooling1D]): + fp.write(gen_gl_pooling_config(layer)) + elif (type(layer) in [Multiply, Add, Subtract]): + fp.write(gen_matrix_config(layer, output_shift_name=layer.name.upper()+'_OUTPUT_RSHIFT')) + elif (type(layer) in [ZeroPadding2D, ZeroPadding1D]): + fp.write(gen_zero_padding_config(layer)) + elif (type(layer) in [Cropping2D, Cropping1D]): + fp.write(gen_cropping_config(layer)) + elif (type(layer) in [Softmax]): + fp.write(gen_softmax_config(layer)) + elif (type(layer) in [Flatten]): + fp.write(gen_flatten_config(layer)) + elif (type(layer) in [Reshape]): + fp.write(gen_reshape_config(layer)) + elif (type(layer) in [Concatenate]): + fp.write(gen_concat_config(layer)) + elif (type(layer) in [Lambda]): + fp.write(gen_lambda_config(layer)) + elif (type(layer) in [UpSampling2D, UpSampling1D]): + fp.write(gen_upsampling_config(layer)) + elif(is_rnn_layer(layer)): + if(type(layer.cell) is SimpleRNNCell): + for w in layer.weights: + gen_weight_tensor(w, per_axis=False) + fp.write(gen_simple_cell_config(layer, layer_q_list['intermediate_'+layer.name])) + elif(type(layer.cell) is GRUCell or 'gru' in layer.cell.name): + for w in layer.weights: + gen_weight_tensor(w, per_axis=False) + fp.write(gen_gru_cell_config(layer, layer_q_list['intermediate_'+layer.name])) + elif(type(layer.cell) is LSTMCell or 'lstm' in layer.cell.name): + for w in layer.weights: + gen_weight_tensor(w, per_axis=False) + fp.write(gen_lstm_cell_config(layer, layer_q_list['intermediate_'+layer.name])) + fp.write(gen_rnn_config(layer)) + + # test, multiple output layer + if(len(layer.outbound_nodes) == 0): + size=1 + for s in layer.output.shape[1:]: + size *= s if s is not None else 1 + if(output_num == 0): # the first output or the only output + fp.write(gen_values('nnom_output_data', '{0}', size=str(size), dtype='static int8_t')) + fp.write(gen_output_config(layer, dec_bits=layer.name.upper() + '_OUTPUT_DEC', output_num=output_num, value_name='nnom_output_data')) + output_num += 1 + else: + output_value_names = 'nnom_output_data'+str(output_num) + fp.write(gen_values(output_value_names, '{0}', size=str(size), dtype='static int8_t')) + fp.write(gen_output_config(layer, 
dec_bits=layer.name.upper() + '_OUTPUT_DEC', output_num=output_num, value_name=output_value_names)) + output_num += 1 + + # # last layer, attach the additional nnom output layer + # if(id == len(L)-1): + # size=1 + # for s in layer.output.shape[1:]: + # size *= s if s is not None else 1 + # fp.write(gen_values('nnom_output_data', '{0}', size=str(size), dtype='static int8_t')) + # fp.write(gen_output_config(layer, dec_bits=layer.name.upper()+'_OUTPUT_DEC', value_name='nnom_output_data')) + + # write version + fp.write('/* model version */\n') + fp.write('#define NNOM_MODEL_VERSION (10000*{0} + 100*{1} + {2})\n'.format(model_major_version, model_sub_version, model_reversion )) + + # model + fp.write('\n/* nnom model */\n') + fp.write('static nnom_model_t* nnom_model_create(void)\n{\n') + fp.write('\tstatic nnom_model_t model;\n') + if (ID > 32): + fp.write('\tnnom_layer_t **layer = (nnom_layer_t**)malloc(sizeof(nnom_layer_t *)*%d);\n' % (ID + 1)) + fp.write('\tif(NULL == layer) return NULL;\n') + else: + fp.write('\tnnom_layer_t* layer[%d];\n' % (ID + 1)) + fp.write('\n\tcheck_model_version(NNOM_MODEL_VERSION);') + fp.write('\n\tnew_model(&model);\n\n') + + # inverted order of output, very strange + output_num = (len(model.output) -1) if type(model.output) is list else 0 + for layer in L: + if (is_skipable_layer(layer)): + continue + # FIXME: need a better solution to seperate the input 'tensor' from other layers + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + id, _ = LI[layer.name.split(':')[0]] + else: + id, _ = LI[layer.name] + + if ('input' in layer.name): + fp.write('\tlayer[%d] = input_s(&%s_config);\n' % (id, layer.name)) + + # convlutional + elif ('conv1d' in layer.name + or 'conv2d' in layer.name): + inp = layer_name_from_tensor(layer.input) + if('transpose' in layer.name): + fp.write('\tlayer[{0}] = model.hook(conv2d_trans_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif('depthwise' in layer.name): + fp.write('\tlayer[{0}] = model.hook(dw_conv2d_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(conv2d_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('activation' in layer.name): + inp = layer_name_from_tensor(layer.input) + cfg = layer.get_config() + if (cfg['activation'] == 'relu'): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n' % (id, LI[inp][0])) + elif (cfg['activation'] == 'tanh'): + fp.write('\tlayer[%s] = model.active(act_hard_tanh(%s_OUTPUT_DEC), layer[%s]);\n' % ( + id, inp.upper(), LI[inp][0])) + elif (cfg['activation'] == 'sigmoid'): + fp.write('\tlayer[%s] = model.active(act_sigmoid(%s_OUTPUT_DEC), layer[%s]);\n' % ( + id, inp.upper(), LI[inp][0])) + elif (cfg['activation'] == 'hard_sigmoid'): + fp.write('\tlayer[%s] = model.active(act_hard_sigmoid(%s_OUTPUT_DEC), layer[%s]);\n' % ( + id, inp.upper(), LI[inp][0])) + elif (cfg['activation'] == 'softmax'): + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n' % (id, LI[inp][0])) + elif ('leaky_re_lu' in layer.name): + inp = layer_name_from_tensor(layer.input) + cfg = layer.get_config() + fp.write('\tlayer[%s] = model.active(act_leaky_relu(%ff), layer[%s]);\n' % (id, cfg["alpha"],LI[inp][0])) + elif ('re_lu' in layer.name): + inp = layer_name_from_tensor(layer.input) + cfg = layer.get_config() + if(cfg['max_value'] is None and cfg['negative_slope'] == 0 and cfg['threshold'] == 0): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n' % 
(id, LI[inp][0])) + else: + if(cfg['max_value'] is None): + max_v = 'INFINITY ' + else: + max_v = str(cfg['max_value']) + fp.write('\tlayer[%s] = model.active(act_adv_relu(%f,%s,%f), layer[%s]);\n' + % (id, cfg['negative_slope'], max_v, cfg['threshold'], LI[inp][0])) + # pooling + elif ('max_pooling' in layer.name): + inp = layer_name_from_tensor(layer.input) + if ('global' in layer.name): + fp.write('\tlayer[{0}] = model.hook(global_maxpool_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(maxpool_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('average_pooling' in layer.name): + inp = layer_name_from_tensor(layer.input) + if ('global' in layer.name): + fp.write('\tlayer[{0}] = model.hook(global_avgpool_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(avgpool_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('up_sampling' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(upsample_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + # zero padding + elif ('zero_padding' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(zeropadding_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + # Cropping + elif ('cropping' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(cropping_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + + # others + elif ('flatten' in layer.name): # flatten is needed in CHW backend but not needed in HWC + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(flatten_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('reshape' in layer.name): # flatten is needed in CHW backend but not needed in HWC + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(reshape_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('concatenate' in layer.name): + inps = [layer_name_from_tensor(input) for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]' % (LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(concat_s(&%s_config), %s%s);\n' % ( + id, layer.name, len(inps), inX)) + elif ('add' in layer.name): + inps = [layer_name_from_tensor(input) for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]' % (LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(add_s(&%s_config), %s%s);\n' % ( + id, layer.name, len(inps), inX)) + elif ('subtract' in layer.name): + inps = [layer_name_from_tensor(input) for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]' % (LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(sub_s(&%s_config), %s%s);\n' % ( + id, layer.name, len(inps), inX)) + elif ('multiply' in layer.name): + inps = [layer_name_from_tensor(input) for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]' % (LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(mult_s(&%s_config), %s%s);\n' % ( + id, layer.name, len(inps), inX)) + elif ('dense' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(dense_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('softmax' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = 
model.hook(softmax_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + + elif (is_rnn_layer(layer)): + inp = layer_name_from_tensor(layer.input) + line = '\tlayer[{0}] = model.hook(rnn_s(, &{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0]) + if (type(layer.cell) is SimpleRNNCell): + line = line.replace('', 'simple_cell_s(&%s_simple_cell_config)' %(layer.name)) + elif (type(layer.cell) is GRUCell or 'gru' in layer.cell.name): + line = line.replace('', 'gru_cell_s(&%s_gru_cell_config)' % (layer.name)) + elif (type(layer.cell) is LSTMCell or 'lstm' in layer.cell.name): + line = line.replace('', 'lstm_cell_s(&%s_lstm_cell_config)' % (layer.name)) + fp.write(line) + else: + raise Exception('unsupported layer', layer.name, layer) + + # test, multiple output layer (not yet working with multiple outputs) + if(len(layer.outbound_nodes) == 0): + fp.write('\tlayer[{0}] = model.hook(output_s(&{1}_config), layer[{2}]);\n'.format(id + 1, 'output'+str(output_num), LI[inp][0] + 1)) + output_num -=1 # the num is inverted in keras, not a good solution yet. + + """ + # temporary fixed for activations attached into layers in construction + def is_activation_attached(layer): + if(("Softmax" in layer.output.name and "softmax" not in layer.name)or + ("Relu" in layer.output.name and "re_lu" not in layer.name) or + ("Sigmoid" in layer.output.name and "sigmoid" not in layer.name) or + ("Tanh" in layer.output.name and "tanh" not in layer.name)): + return True + return False + if "input" not in layer.name and is_activation_attached(layer): + inp = layer.output.name.replace(':', '/').split('/')[0] + cfg = layer.get_config() + if(cfg['activation'] == 'relu'): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n'%(id, LI[inp][0])) + if(cfg['activation'] == 'tanh'): + fp.write('\tlayer[%s] = model.active(act_tanh(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + if(cfg['activation'] == 'sigmoid'): + fp.write('\tlayer[%s] = model.active(act_sigmoid(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + elif(cfg['activation'] == 'softmax'): + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n'%(id, LI[inp][0])) + """ + # generate final output layer + #fp.write('\tlayer[{0}] = model.hook(output_s(&{1}_config), layer[{2}]);\n'.format(id+1, 'output', LI[inp][0]+1)) + fp.write('\tmodel_compile(&model, layer[0], layer[%s]);\n' % (id + 1)) + if (ID > 32): + fp.write('\tfree(layer);\n') + fp.write('\treturn &model;\n}\n') + with open('.layer_q_list', 'w') as fp: + fp.write(str(layer_q_list)) + +def evaluate_model(model, x_test, y_test, running_time=False, to_file='evaluation.txt'): + # Score trained model. 
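+    # Editor's note (assumption, not in the original script): Keras model.evaluate()
+    # returns [loss, metric_1, ...] in the order given at compile time, so scores[1]
+    # below is the top-1 accuracy only if the model was compiled with an accuracy metric.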
+ scores = model.evaluate(x_test, y_test, verbose=2) + print('Test loss:', scores[0]) + print('Top 1:', scores[1]) + + if(len(y_test.shape)>1): + bs = model.input.shape[0] + predictions = model.predict(x_test, batch_size=bs) + matrix = skmetrics.confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1)) + print(matrix) + + run_time = 0 + if running_time: + # try to calculate the time + T = time.time() + bs = model.input.shape[0] + for i in range(10): + model.predict(x_test, batch_size=bs) + T = time.time() - T + run_time = round((T / 10 / x_test.shape[0] * 1000 * 1000), 2) + print("Runing time:",run_time , "us" ) + # + with open(to_file, 'w') as f: + f.write("Runing time: "+ str(run_time) + "us" + "\n") + f.write('Test loss:'+ str(scores[0]) + "\n") + f.write('Top 1:'+ str(scores[1])+ "\n") + if (len(y_test.shape) > 1): + for row in matrix: + row.tofile(f, sep=',') + f.write("\n") + return scores + +def f2q(d, Q): + '''To convert a number from floating point to Qm.n format: + 1. Multiply the floating point number by 2n + 2. Round to the nearest integer + ''' + return np.round(d*2**Q) + + +def q2f(d, Q): + '''To convert a number from Qm.n format to floating point: + 1. Convert the number to floating point as if it were an integer, in other words remove the binary point + 2. Multiply by 2-n + ''' + return d*2**-Q + +def show_weights(w, name): + sz = 1 + for s in w.shape: + sz = sz*s + aL = w.reshape(sz,) + MIN,MAX=min(aL),max(aL) + Q = int(np.ceil(np.log2(max(abs(MIN),abs(MAX))))) + Q = 7-Q + qL = f2q(aL,Q) + qL = q2f(qL,Q) + plt.figure(figsize=(18, 3)) + plt.subplot(131) + plt.title(name) + plt.plot(aL) + plt.grid() + aL.sort() + plt.plot(aL,'r') + plt.grid() + plt.subplot(132) + plt.title('Q%s'%(Q)) + qL.sort() + plt.plot(aL,'r') + plt.plot(qL,'g') + plt.grid() + plt.subplot(133) + plt.hist(aL,100) + plt.title('hist') + plt.grid() + plt.show() + +def compare(a,b,name): + sz = 1 + for s in a.shape: + sz = sz*s + aL = a.reshape(sz,) + bL = b.reshape(sz,) + assert(len(aL) == len(bL)) + Z = list(zip(aL,bL)) + Z.sort(key=lambda x: x[0]) + aL1,bL1=zip(*Z) + plt.figure(figsize=(18, 3)) + plt.subplot(131) + plt.plot(aL) + plt.plot(aL1,'r') + plt.grid() + plt.title('tf-%s'%(name)) + plt.subplot(133) + plt.plot(bL1,'g') + plt.plot(aL1,'r') + plt.grid() + plt.title('compare') + plt.subplot(132) + bL1=list(bL1) + bL1.sort() + plt.plot(bL) + plt.plot(bL1,'g') + plt.grid() + plt.title('nn-%s'%(name)) + plt.show() + diff --git a/APP_Framework/Framework/knowing/nnom/scripts/nnom_utils.py b/APP_Framework/Framework/knowing/nnom/scripts/nnom_utils.py new file mode 100644 index 000000000..32868ac81 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/nnom_utils.py @@ -0,0 +1,845 @@ +''' + Copyright (c) 2018-2020 + Jianjia Ma + majianjia@live.com + + SPDX-License-Identifier: Apache-2.0 + + Change Logs: + Date Author Notes + 2019-02-05 Jianjia Ma The first version + + + This file provides: + -> fake_quantisation layers which simulate the output quantisation on fixed-point NN models. + -> weights/bias quantisation of Convolution and Dense Layer. "weight.h" file generations + -> export "testing set" binary data file. + -> print output ranges of each layers. + + Currently, this script does not support RNN (type) layers. 
+''' + +import matplotlib.pyplot as plt +import tensorflow as tf +from tensorflow.keras.layers import InputLayer +from tensorflow.keras.models import Model + +from sklearn import metrics +from .fully_connected_opt_weight_generation import * +import time +import warnings + +""" +this is the generate the test set data to a bin file +bin file can be used to validate the implementation in MCU + +""" +def generate_test_bin(x, y, name='test_data_with_label.bin'): + ''' + this method generate the + :param x: input x data size + :param y: input label (one hot label) + :return: + ''' + # quantize input x + min_value = np.min(x) + max_value = np.max(x) + + int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value))))) + dec_bits = 7 - int_bits + x = np.round(x*2**dec_bits).astype(np.int8) + # get label + if(len(y.shape) >1): + test_label = np.argwhere(y == 1).astype(np.int8) # test data + test_label = test_label[:, 1] + else: + test_label = y + + # get data + dat = x.astype(dtype="byte") # test data + batch_size = dat.shape[0] # total pices of data + dat = dat.flatten() # flatten to get the total size. + block_size = int(dat.size / batch_size) # this must be integer but... just to confirm + + # write (label x 128) (data_block x 128) + label_batch = 128 # the Y-modem example uses 128 batch + with open(name, 'wb') as f: + start = 0 + while start <= (test_label.size - label_batch): + test_label[start: start + label_batch].tofile(f) + dat[block_size * start: block_size * (start + label_batch)].tofile(f) + start += label_batch + + # the rest data + if (start < test_label.size): + rest_len = test_label.size - start + new_labls = test_label[start:] + new_labls = np.pad(new_labls, (0, label_batch - rest_len), mode='constant') + new_labls.tofile(f) + dat[block_size * start:].tofile(f) + + print("binary test file generated:", name) + print("test data length:", test_label.size) + return + +def is_shift_layer(layer): + ''' layer which can change the output encoding''' + #FIXME: add more which will change the output shift + if('input' in layer.name or + 'conv2d' in layer.name or + 'conv1d' in layer.name or + 'dense' in layer.name or + 'softmax' in layer.name or + 'sigmoid' in layer.name or + 'tanh' in layer.name or + ('add' in layer.name and 'zero' not in layer.name) or # the name, zero_padding contains 'add' + 'subtract' in layer.name or + 'multiply' in layer.name or + ('activation' in layer.name and layer.get_config()['activation'] == 'softmax')or + ('activation' in layer.name and layer.get_config()['activation'] == 'sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'tanh') + ): + return True + return False + +def is_shift_fixed(layer): + ''' layer which shift to a fixed value''' + #FIXME: add more which will change the output shift + if('softmax' in layer.name or + 'sigmoid' in layer.name or + 'tanh' in layer.name or + ('activation' in layer.name and layer.get_config()['activation'] == 'softmax') or + ('activation' in layer.name and layer.get_config()['activation'] == 'sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'tanh') + ): + return True + return False + +def fuse_bn_to_conv(layer): + # try to fuse BN layer to convolutional + if ('conv' in layer.name) and \ + ('batch_normalization' in layer._outbound_nodes[0].outbound_layer.name): + + print("fusing batch normalization to", layer.name) + bn_layer = layer._outbound_nodes[0].outbound_layer + c_w = layer.get_weights()[0] + c_b = layer.get_weights()[1] + print('original weight max', 
c_w.max(), 'min', c_w.min()) + print('original bias max', c_b.max(), 'min', c_b.min()) + bn_gamma = bn_layer.get_weights()[0] + bn_beta = bn_layer.get_weights()[1] + bn_mean = bn_layer.get_weights()[2] + bn_variance = bn_layer.get_weights()[3] + + if ('conv2d' in layer.name): + epsilon = 1e-3 # default epsilon for tf.slim.batch_norm + for l in range(c_w.shape[3]): + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + if "depthwise" in layer.name: # depthwise batchnorm params are ordered differently + c_w[i][j][k][l] *= bn_gamma[k] / np.sqrt(bn_variance[k] + epsilon) + else: + c_w[i][j][k][l] *= bn_gamma[l] / np.sqrt(bn_variance[l] + epsilon) + + if "depthwise" in layer.name: + depth_dim = c_w.shape[2] + else: + depth_dim = c_w.shape[3] + for l in range(depth_dim): + c_b[l] = (bn_gamma[l] * (c_b[l] - bn_mean[l]) / np.sqrt(bn_variance[l] + epsilon)) + bn_beta[l] + # conv1d + else: + epsilon = 1e-3 # default epsilon for tf.slim.batch_norm + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + if "depthwise" in layer.name: # depthwise batchnorm params are ordered differently + c_w[i][j][k] *= bn_gamma[j] / np.sqrt(bn_variance[j] + epsilon) + else: + c_w[i][j][k] *= bn_gamma[k] / np.sqrt(bn_variance[k] + epsilon) + + if "depthwise" in layer.name: + depth_dim = c_w.shape[1] + else: + depth_dim = c_w.shape[2] + for l in range(depth_dim): + c_b[l] = (bn_gamma[l] * (c_b[l] - bn_mean[l]) / np.sqrt(bn_variance[l] + epsilon)) + bn_beta[l] + + print('fused weight max', c_w.max(), 'min', c_w.min()) + print('fused bias max', c_b.max(), 'min', c_b.min()) + # write the weights back to the layer + # after that, the model will be destroyed.. need a better way to pass the new weight + layer.set_weights([c_w, c_b]) + +def generate_weights(model, name='weights.h', format='hwc', shift_list=None): + # Quantize weights to 8-bits using (min,max) and write to file + f = open(name, 'w') + f.write('#include "nnom.h"\n\n') + f.close() + + for curr_idx, layer in enumerate(model.layers): + if (not layer.weights): + continue + + # before merging bn layer, check if the bn is "legally" after Conv + if('batch_normalization' in layer.name) and \ + ('conv' not in layer.inbound_nodes[0].inbound_layers.name): + raise Exception('Currently only support batch_normalization after conv', layer.name, + layer._inbound_nodes[0].inbound_layers[0].name) + + # try to fuse BN layer to convolutional + if ('conv' in layer.name) and \ + ('batch_normalization' in layer.outbound_nodes[0].outbound_layer.name): + fuse_bn_to_conv(layer) + + # generate weights and bias now + weight_dec_shift = 0 + print('weights for layer', layer.name) + for var in layer.weights: + var_name = str(var.name) + if("kernel" in var_name ): + var_values = layer.get_weights()[0] # weight + print(" weight:", var_name) + elif("bias" in var_name): + var_values = layer.get_weights()[1] # bias + print(" bias: ",var_name) + else: + continue + + print(" original shape: ", var_values.shape) + min_value = np.min(var_values) + max_value = np.max(var_values) + + int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value))))) + dec_bits = 7 - int_bits + print(" dec bit", dec_bits) + bSameAsKernel = False + if(is_shift_layer(layer)): + bSameAsKernel = False + inp = layer.input.name.replace(':','/').split('/')[0] + input_encoding = shift_list[inp] + if ("kernel" in var_name): + weight_dec_shift = dec_bits + else: + shift = input_encoding+weight_dec_shift-dec_bits + if(shift < 0): + bSameAsKernel 
= True + if(shift_list is None or bSameAsKernel): + # check if bias shift > weight shift, then reduce bias shift to weight shift + if ("kernel" in var_name): + weight_dec_shift = dec_bits + else: + if(dec_bits > weight_dec_shift): + dec_bits = weight_dec_shift + print(" new dec bit", dec_bits) + + # convert to [-128,128) or int8 + var_values = np.round(var_values * 2 ** dec_bits) + var_name = var_name.replace('/', '_') + var_name = var_name.replace(':', '_') + with open(name, 'a') as f: + f.write('#define ' + var_name.upper() + ' {') + # CHW format + if ('chw' in format): + if "dense" in var_name and "kernel" in var_name: + transposed_wts = np.transpose(var_values) + transposed_wts = convert_to_x4_q7_weights( + np.reshape(transposed_wts, (transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + # all other kernels, bias stay the same + else: + transposed_wts = var_values + # HWC format + else: + if (len(var_values.shape) == 3): # 1D convolution layer weights + transposed_wts = np.transpose(var_values, (2, 0, 1)) + elif (len(var_values.shape) == 4): # 2D convolution layer weights + transposed_wts = np.transpose(var_values, (3, 0, 1, 2)) + else: # fully connected layer weights or biases of any layer + # test, use opt weight reorder + if "dense" in var_name and "kernel" in var_name: + transposed_wts = np.transpose(var_values) + transposed_wts = convert_to_x4_q7_weights(np.reshape(transposed_wts ,(transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + else: + transposed_wts = np.transpose(var_values) + + print(" reshape to:",transposed_wts.shape) + + with open(name, 'a') as f: + transposed_wts.tofile(f, sep=", ", format="%d") + f.write('}\n\n') + if ("bias" in var_name): + f.write('#define ' + var_name.upper() + '_SHIFT ' + '(' + str(dec_bits) + ')\n\n\n') + if ("kernel" in var_name ): + f.write('#define ' + var_name.upper() + '_SHIFT ' + '(' + str(dec_bits) + ')\n\n') + """ + # for checking the quantised and dequantised range. + with K.tf.Session() as session: + # convert back original range but quantized to 8-bits or 256 levels + var_values = var_values / (2 ** dec_bits) + var_values = session.run(K.tf.assign(var, var_values)) + print(' '+var_name + ' number of wts/bias: ' + str(var_values.shape) + \ + ' dec bits: ' + str(dec_bits) + \ + ' max: (' + str(np.max(var_values)) + ',' + str(max_value) + ')' + \ + ' min: (' + str(np.min(var_values)) + ',' + str(min_value) + ')') + """ + +def layers_output_ranges(model, x_test, quantize_method='max_min', calibrate_size=1000): + # limit the test data size + np.random.shuffle(x_test) + if(x_test.shape[0] > calibrate_size): + x_test = x_test[:1000] + # test, show the output ranges + shift_list = {} + # FIXME: only support one input + if(type(model.layers[0]) != InputLayer): + L = [model.input] + model.layers + else: + L = model.layers + last_layer = None + + for layer in L: # layer loop + if("input" in layer.name): + features = x_test + else: + # batch_normalization will need to be handled differently, since we are fusing the weight to its predecessor. 
+ # sigmoid and tanh are different, their shift is fixed to 7 + if(is_shift_layer(layer) or + ('batch_normalization' in layer.name)): + layer_model = Model(inputs=model.input, outputs=layer.output) + features = layer_model.predict(x_test) + else: + # leave the features not changed, so this layer shift will be the same + # as its inputs + pass + # calculate no saturation shift + max_val = features.max() + min_val = features.min() + int_bits = int(np.ceil(np.log2(max(abs(max_val), abs(min_val))))) + dec_bits = 7 - int_bits + + # saturation shift, using KLD method + # Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + if('kld' in quantize_method and not is_shift_fixed(layer) and "input" not in layer.name and "dense" not in layer.name): # test, also do not use kld in input layer + import scipy.stats + abs_max = max(abs(max_val), abs(min_val)) + small_var = 1e-5 + bins = np.arange(-abs_max, abs_max, abs_max/2048*2) + q_bins = np.arange(-abs_max, abs_max, abs_max/256*2) + flat_hist = np.histogram(features.flatten(), bins=bins)[0] + kl_loss = [] + kl_shifts = [] + for shift in range(4): + t = 2 ** (dec_bits + shift) # 2-based threshold + act = np.round(features.flatten() * t) + act = act / t + act = np.clip(act, -128/t, 127/t) + act = np.histogram(act, bins=q_bins)[0] + act_hist = np.zeros(2047) + chunk = int(2048/256) + for i in range(int(255)): + none_zero = np.count_nonzero(flat_hist[i*chunk:(i+1)*chunk]) + if none_zero == 0: + continue + for j in range(chunk): + act_hist[i*chunk+j] = act[i]/none_zero if flat_hist[i*chunk+j] != 0 else 0 + flat_hist[flat_hist==0] = small_var + act_hist[act_hist==0] = small_var + kl = scipy.stats.entropy(flat_hist, act_hist) + kl_loss.append(kl) + kl_shifts.append(dec_bits + shift) + """ + ax = plt.subplot(8, 1, shift+1) + ax.plot(flat_hist) + ax.plot(act_hist) + """ + new_dec = kl_shifts[np.argmin(kl_loss)] # set the dec_bit to the KLD results + #plt.show() + print("KLD loss", kl_loss) + print("KLD shift", kl_shifts) + if(new_dec != dec_bits): + print(layer.name,"is using KLD method, original shift",dec_bits, "KLD results", new_dec) + dec_bits = new_dec + + print( layer.name, "max value:", max_val, "min value:", min_val,"dec bit", dec_bits) + # record the shift + if(type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + shift_list[layer.name.split(':')[0]] = dec_bits + else: + shift_list[layer.name] = dec_bits + if ('batch_normalization' in layer.name): + shift_list[last_layer.name] = dec_bits # use the bn layer shift to update the last layer. 
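+            # Illustrative example (editor's addition, not part of the original script):
+            # for Q7 data the fraction bits are chosen so the observed range just fits,
+            # e.g. max(|min_val|, |max_val|) = 5.2 gives int_bits = ceil(log2(5.2)) = 3 and
+            # dec_bits = 7 - 3 = 4, i.e. values are stored as round(x * 2**4) in int8
+            # (5.2 * 16 = 83, which still fits below 127).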
+ last_layer = layer + + LM = {} + for layer in model.layers: + LM[layer.name] = layer + L = [l for l in model.layers[1:]] + L.reverse() + + def update_previous_layer_shift(layer, Q): + if(type(layer.input) == list): + for inp in layer.input: + iname = inp.name.split('/')[0] + if('input' in iname): + continue + shift_list[iname] = Qmin + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], Q) + else: + iname = layer.input.name.split('/')[0] + if('input' in iname): + return + shift_list[iname] = Qmin + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], Q) + for layer in L: + if(type(layer.input) == list): + iname = layer.input[0].name.split('/')[0] + Qmin = shift_list[iname] + for inp in layer.input: + iname = inp.name.split('/')[0] + if(shift_list[iname] < Qmin): + Qmin = shift_list[iname] + if(shift_list[iname] != Qmin): + bFlag = True + for inp in layer.input: + iname = inp.name.split('/')[0] + shift_list[iname] = Qmin + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], Qmin) + print('set shift', Qmin, 'for the input of', layer.name, ':', [inp.name.split('/')[0] for inp in layer.input]) + if(not is_shift_layer(layer) or Qmin < shift_list[layer.name]): # update current layer's shift only when we cannot change the shift + shift_list[layer.name] = Qmin + print("shift list", shift_list) + return shift_list + +def generate_model(model, x_test, name='weights.h', format='hwc', quantize_method='max_min'): + shift_list = layers_output_ranges(model, x_test, quantize_method=quantize_method) + generate_weights(model, name=name, format=format, shift_list=shift_list) + if(type(model.layers[0]) != InputLayer): + L = [model.input] + model.layers + else: + L = model.layers + with open(name,'a') as fp: + fp.write('\n/* output enconding for each layer */\n') + for layer in L: + if(type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + iname = layer.name.split(':')[0] + else: + iname = layer.name + fp.write('#define %s_OUTPUT_SHIFT %s\n'%(iname.upper(), shift_list[iname])) + fp.write('\n/* bias shift and output shift for each layer */\n') + for layer in model.layers: + if(is_shift_layer(layer)): + iname = layer.name.upper() + if(len(layer.weights) == 2 and + 'kernel' in layer.weights[0].name and + 'bias' in layer.weights[1].name): + kname = layer.weights[0].name.upper().replace('/', '_').replace(':', '_') + bname = layer.weights[1].name.upper().replace('/', '_').replace(':', '_') + inp = layer.input.name.replace(':','/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_SHIFT+{2}_SHIFT-{0}_OUTPUT_SHIFT)\n'.format( + iname, inp, kname)) + fp.write('#define {0}_BIAS_LSHIFT ({1}_OUTPUT_SHIFT+{2}_SHIFT-{3}_SHIFT)\n'.format( + iname, inp, kname, bname)) + fp.write('#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format(iname)) + fp.write('#if {0}_BIAS_LSHIFT < 0\n#error {0}_BIAS_RSHIFT must be bigger than 0\n#endif\n'.format(iname)) + # add, sub + elif ('add' in layer.name or + 'subtract' in layer.name): + # only consider the first, they have been set to same in out_put_range() + inp = layer.input[0].name.replace(':','/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_SHIFT-{0}_OUTPUT_SHIFT)\n'.format( + iname, inp)) + fp.write('#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format(iname)) + # mult is different, Q3.4 * Q3.4 = Q6.8. if mult out is Q4.3, then shift (Q.4+q.4)-Q.3=5. Am I right? 
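+                # Worked example (editor's addition): fraction bits add under multiplication,
+                # e.g. Q.4 * Q.4 -> Q.8; if the recorded output format is Q.3 the product must
+                # be right-shifted by 4 + 4 - 3 = 5, which matches the
+                # {input}_OUTPUT_SHIFT*2 - {layer}_OUTPUT_SHIFT macro emitted just below.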
+ elif ('multiply' in layer.name ): + inp = layer.input[0].name.replace(':','/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_SHIFT*2-{0}_OUTPUT_SHIFT)\n'.format( + iname, inp)) + fp.write('#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format(iname)) + + fp.write('\n/* weights for each layer */\n') + LI = {} + ID = 0 + def is_skipable_layer(layer): + # FIXME: add more that could be skiped + if('lambda' in layer.name or + 'dropout' in layer.name or + 'batch_normalization' in layer.name or + ('flatten' in layer.name and 'chw' not in format)): # flatten layer can be skipped in HWC but have to present in CHW + return True + return False + for id,layer in enumerate(L): + if(is_skipable_layer(layer)): + inp = layer.input.name.replace(':','/').split('/')[0] + LI[layer.name] = (LI[inp][0], layer) + else: + if(type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + LI[layer.name.split(':')[0]] = (ID, layer) + else: + LI[layer.name] = (ID, layer) + ID += 1 + + if ('input' in layer.name or not layer.weights): + continue + for var in layer.weights: + var_name = str(var.name).replace('/', '_').replace(':', '_') + if("kernel" in var_name): + fp.write('static const int8_t %s_weights[] = %s;\n'%(layer.name, var_name.upper())) + fp.write('static const nnom_weight_t %s_w = { (const void*)%s_weights, %s_OUTPUT_RSHIFT};\n'%(layer.name,layer.name, layer.name.upper())) + elif("bias" in var_name): + fp.write('static const int8_t %s_bias[] = %s;\n'%(layer.name, var_name.upper())) + fp.write('static const nnom_bias_t %s_b = { (const void*)%s_bias, %s_BIAS_LSHIFT};\n'%(layer.name,layer.name, layer.name.upper())) + fp.write('\n/* nnom model */\n') + # FIXME: now only support one input and one output + sz = 1 + for d in model.input.shape[1:]: + sz = sz*d + fp.write('static int8_t nnom_input_data[%d];\n'%(sz)) + sz = 1 + for d in model.output.shape[1:]: + sz = sz*d + fp.write('static int8_t nnom_output_data[%d];\n'%(sz)) + fp.write('static nnom_model_t* nnom_model_create(void)\n{\n') + fp.write('\tstatic nnom_model_t model;\n') + if(ID>32): + fp.write('\tnnom_layer_t ** layer = malloc(sizeof(nnom_layer_t *)*%d);\n'%(ID+1)) + fp.write('\tif(NULL == layer) return NULL;\n') + else: + fp.write('\tnnom_layer_t* layer[%d];\n'%(ID+1)) + fp.write('\n\tnew_model(&model);\n\n') + for layer in L: + if(is_skipable_layer(layer)): + continue + #FIXME: need a better solution to seperate the input 'tensor' from other layers + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + id,_ = LI[layer.name.split(':')[0]] + else: + id,_ = LI[layer.name] + + if('input' in layer.name): + try: + inshape = layer.input_shape[0][1:] # new changes in tf2? 
+ except: + inshape = layer.shape[1:] + if (len(inshape) == 1): # 1-D input + fp.write('\tlayer[%d] = Input(shape(%d,1,1), nnom_input_data);\n' % (id, inshape[0])) + elif (len(inshape) == 2): # 1-D input + fp.write('\tlayer[%d] = Input(shape(1,%d,%d), nnom_input_data);\n' % (id, inshape[0], inshape[1])) + else: + fp.write('\tlayer[%d] = Input(shape%s, nnom_input_data);\n' % (id, inshape)) + + # convlutional + elif('conv1d' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if('depthwise' in layer.name): + fp.write('\tlayer[{0}] = model.hook(DW_Conv2D({1}, kernel(1,{2}), stride(1,{3}), dilation(1,{4}), PADDING_{5}, &{6}_w, &{6}_b), layer[{7}]);\n'.format( + id, 1, cfg['kernel_size'][0], cfg['strides'][0], cfg['dilation_rate'][0], cfg['padding'].upper(), + layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(Conv2D({1}, kernel(1,{2}), stride(1,{3}), dilation(1,{4}), PADDING_{5}, &{6}_w, &{6}_b), layer[{7}]);\n'.format( + id, cfg['filters'], cfg['kernel_size'][0], cfg['strides'][0], cfg['dilation_rate'][0], cfg['padding'].upper(), + layer.name, LI[inp][0])) + elif('conv2d' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if ('depthwise' in layer.name): + fp.write('\tlayer[{0}] = model.hook(DW_Conv2D({1}, kernel{2}, stride{3}, dilation{4}, PADDING_{5}, &{6}_w, &{6}_b), layer[{7}]);\n'.format( + id, 1, cfg['kernel_size'], cfg['strides'], cfg['dilation_rate'], cfg['padding'].upper(), + layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(Conv2D({1}, kernel{2}, stride{3}, dilation{4}, PADDING_{5}, &{6}_w, &{6}_b), layer[{7}]);\n'.format( + id, cfg['filters'], cfg['kernel_size'], cfg['strides'], cfg['dilation_rate'], cfg['padding'].upper(), + layer.name, LI[inp][0])) + # activations + elif('activation' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if(cfg['activation'] == 'relu'): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n'%(id, LI[inp][0])) + if(cfg['activation'] == 'tanh'): + fp.write('\tlayer[%s] = model.active(act_tanh(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + if(cfg['activation'] == 'sigmoid'): + fp.write('\tlayer[%s] = model.active(act_sigmoid(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + elif(cfg['activation'] == 'softmax'): + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n'%(id, LI[inp][0])) + elif('re_lu' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n'%(id, LI[inp][0])) + # pooling + elif('max_pooling' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if ('global' in layer.name): + fp.write('\tlayer[%s] = model.hook(GlobalMaxPool(), layer[%s]);\n' % (id, LI[inp][0])) + elif('2d' in layer.name): + fp.write('\tlayer[%s] = model.hook(MaxPool(kernel%s, stride%s, PADDING_%s), layer[%d]);\n'%( + id, cfg['pool_size'], cfg['strides'], cfg['padding'].upper(), LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(MaxPool(kernel(1,{1}), stride(1,{2}), PADDING_{3}), layer[{4}]);\n'.format( + id, cfg['pool_size'][0], cfg['strides'][0], cfg['padding'].upper(), LI[inp][0])) + elif('average_pooling' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if ('global' in layer.name): + # a global avg pool before softmax can be 
replace by sumpool in MCU (recommend) + if(layer == model.layers[-2] and 'Softmax' in model.layers[-1].output.name): + print(layer.name, 'has been replaced by GlobalSumPool()') + fp.write('\tlayer[%s] = model.hook(GlobalSumPool(), layer[%s]);\n' % (id, LI[inp][0])) + else: + fp.write('\tlayer[%s] = model.hook(GlobalAvgPool(), layer[%s]);\n' % (id, LI[inp][0])) + elif('2d' in layer.name): + fp.write('\tlayer[%s] = model.hook(AvgPool(kernel%s, stride%s, PADDING_%s), layer[%d]);\n'%( + id, cfg['pool_size'], cfg['strides'], cfg['padding'].upper(), LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(AvgPool(kernel(1,{1}), stride(1,{2}), PADDING_{3}), layer[{4}]);\n'.format( + id, cfg['pool_size'][0], cfg['strides'][0], cfg['padding'].upper(), LI[inp][0])) + elif ('up_sampling' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if('2d' in layer.name): + fp.write('\tlayer[%s] = model.hook(UpSample(kernel%s), layer[%d]);\n'%(id, cfg['size'], LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(UpSample(kernel(1,{1})), layer[{2}]);\n'.format( + id, cfg['size'][0], LI[inp][0])) + # zero padding + elif ('zero_padding' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if('2d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(ZeroPadding(border({1},{2},{3},{4})), layer[{5}]);\n'.format( + id, cfg['padding'][0][0], cfg['padding'][0][1], cfg['padding'][1][0],cfg['padding'][1][1], LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(ZeroPadding(border(0,0,{1},{2})), layer[{3}]);\n'.format( + id, cfg['padding'][0], cfg['padding'][1], LI[inp][0])) + # Cropping + elif ('cropping' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if('2d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(Cropping(border({1},{2},{3},{4})), layer[{5}]);\n'.format( + id, cfg['cropping'][0][0], cfg['cropping'][0][1], cfg['cropping'][1][0],cfg['cropping'][1][1], LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(Cropping(border(0,0,{1},{2})), layer[{3}]);\n'.format( + id, cfg['cropping'][0], cfg['cropping'][1], LI[inp][0])) + + # others + elif('flatten' in layer.name): # flatten is needed in CHW backend but not needed in HWC + inp = layer.input.name.replace(':', '/').split('/')[0] + fp.write('\tlayer[%s] = model.hook(Flatten(), layer[%s]);\n'%(id, LI[inp][0])) + elif('concatenate' in layer.name): + inps = [input.name.replace(':','/').split('/')[0] for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]'%(LI[inp][0]) + cfg = layer.get_config() + fp.write('\tlayer[%s] = model.mergex(Concat(%s), %s%s);\n'%( + id, cfg['axis'], len(inps), inX)) + elif('add' in layer.name): + inps = [input.name.replace(':','/').split('/')[0] for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]'%(LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(Add(%s_OUTPUT_RSHIFT), %s%s);\n'%( + id, layer.name.upper(), len(inps), inX)) + elif('subtract' in layer.name): + inps = [input.name.replace(':','/').split('/')[0] for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]'%(LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(Sub(%s_OUTPUT_RSHIFT), %s%s);\n'%( + id, layer.name.upper(), len(inps), inX)) + elif('multiply' in layer.name): + warnings.warn("Warning mutiply is under testing") + inps = [input.name.replace(':','/').split('/')[0] 
for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]'%(LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(Mult(%s_OUTPUT_RSHIFT), %s%s);\n'%( + id, layer.name.upper(), len(inps), inX)) + elif('dense' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + fp.write('\tlayer[{0}] = model.hook(Dense({1}, &{2}_w, &{2}_b), layer[{3}]);\n'.format( + id, cfg['units'], layer.name, LI[inp][0])) + elif('softmax' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n'%(id, LI[inp][0])) + else: + raise Exception('unsupported layer', layer.name, layer) + + """ + # temporary fixed for activations attached into layers in construction + def is_activation_attached(layer): + if(("Softmax" in layer.output.name and "softmax" not in layer.name)or + ("Relu" in layer.output.name and "re_lu" not in layer.name) or + ("Sigmoid" in layer.output.name and "sigmoid" not in layer.name) or + ("Tanh" in layer.output.name and "tanh" not in layer.name)): + return True + return False + if "input" not in layer.name and is_activation_attached(layer): + inp = layer.output.name.replace(':', '/').split('/')[0] + cfg = layer.get_config() + if(cfg['activation'] == 'relu'): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n'%(id, LI[inp][0])) + if(cfg['activation'] == 'tanh'): + fp.write('\tlayer[%s] = model.active(act_tanh(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + if(cfg['activation'] == 'sigmoid'): + fp.write('\tlayer[%s] = model.active(act_sigmoid(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + elif(cfg['activation'] == 'softmax'): + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n'%(id, LI[inp][0])) + """ + + # FIXME, test later. + if('softmax' in layer.name + or ('activation' in layer.name and layer.get_config()['activation'] == 'softmax')): + fp.write('\tlayer[%s] = model.hook(Output(shape(%s,1,1), nnom_output_data), layer[%s]);\n'%(id+1, layer.output.shape[1], id)) + elif len(layer.output.shape) == 4: + fp.write('\tlayer[%s] = model.hook(Output(shape%s, nnom_output_data), layer[%s]);\n'%(id+1, layer.output.shape[1:], id)) + elif len(layer.output.shape) == 3: + fp.write('\tlayer[%s] = model.hook(Output(shape(1,%s,%s), nnom_output_data), layer[%s]);\n'%(id+1, layer.output.shape[1], layer.output.shape[2], id)) + elif len(layer.output.shape) == 2: + fp.write('\tlayer[%s] = model.hook(Output(shape(%s,1,1), nnom_output_data), layer[%s]);\n'%(id+1, layer.output.shape[1], id)) + else: + raise Exception('unsupported output shape of the last layer', layer.name, layer) + fp.write('\tmodel_compile(&model, layer[0], layer[%s]);\n'%(id+1)) + if(ID>32): + fp.write('\tfree(layer);\n') + fp.write('\treturn &model;\n}\n') + with open('.shift_list','w') as fp: + fp.write(str(shift_list)) + +def evaluate_model(model, x_test, y_test, running_time=False, to_file='evaluation.txt'): + # Score trained model. 
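+    # Editor's note: when y_test is 2-D it is treated as one-hot encoded labels; the
+    # confusion matrix below recovers class indices with argmax(axis=1) on both the
+    # labels and the predictions.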
+ scores = model.evaluate(x_test, y_test, verbose=2) + print('Test loss:', scores[0]) + print('Top 1:', scores[1]) + + if(len(y_test.shape)>1): + # predictions = model.predict(x_test) + # output = tf.keras.metrics.top_k_categorical_accuracy(y_test, predictions, k=2) + # # with tf.Session() as sess: + # # result = sess.run(output) + # result = + # print("Top 2:",result) + + predictions = model.predict(x_test) + matrix = metrics.confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1)) + print(matrix) + + run_time = 0 + if running_time: + # try to calculate the time + T = time.time() + for i in range(10): + model.predict(x_test) + T = time.time() - T + run_time = round((T / 10 / x_test.shape[0] * 1000 * 1000), 2) + print("Runing time:",run_time , "us" ) + # + with open(to_file, 'w') as f: + f.write("Runing time: "+ str(run_time) + "us" + "\n") + f.write('Test loss:'+ str(scores[0]) + "\n") + f.write('Top 1:'+ str(scores[1])+ "\n") + if (len(y_test.shape) > 1): + #f.write("Top 2:"+ str(result)+ "\n") + #f.write(str(matrix)) + for row in matrix: + row.tofile(f, sep=',') + f.write("\n") + + # try to check the weight and bias dec ranges + for layer in model.layers: + if (not layer.weights): + continue + for var in layer.weights: + var_name = str(var.name) + if ("kernel" in var_name): + var_values = layer.get_weights()[0] # weight + else: + var_values = layer.get_weights()[1] # bias + min_value = np.min(var_values) + max_value = np.max(var_values) + intt = int(np.ceil(np.log2(max(abs(min_value), abs(max_value))))) + dec = 7 - intt + print(var_name, "Dec num:", dec) + return scores + +def f2q(d, Q): + '''To convert a number from floating point to Qm.n format: + 1. Multiply the floating point number by 2n + 2. Round to the nearest integer + ''' + return np.round(d*2**Q) + + +def q2f(d, Q): + '''To convert a number from Qm.n format to floating point: + 1. Convert the number to floating point as if it were an integer, in other words remove the binary point + 2. 
Multiply by 2-n + ''' + return d*2**-Q + +def show_weights(w, name): + sz = 1 + for s in w.shape: + sz = sz*s + aL = w.reshape(sz,) + MIN,MAX=min(aL),max(aL) + Q = int(np.ceil(np.log2(max(abs(MIN),abs(MAX))))) + Q = 7-Q + qL = f2q(aL,Q) + qL = q2f(qL,Q) + plt.figure(figsize=(18, 3)) + plt.subplot(131) + plt.title(name) + plt.plot(aL) + plt.grid() + aL.sort() + plt.plot(aL,'r') + plt.grid() + plt.subplot(132) + plt.title('Q%s'%(Q)) + qL.sort() + plt.plot(aL,'r') + plt.plot(qL,'g') + plt.grid() + plt.subplot(133) + plt.hist(aL,100) + plt.title('hist') + plt.grid() + plt.show() + +def compare(a,b,name): + sz = 1 + for s in a.shape: + sz = sz*s + aL = a.reshape(sz,) + bL = b.reshape(sz,) + assert(len(aL) == len(bL)) + Z = list(zip(aL,bL)) + Z.sort(key=lambda x: x[0]) + aL1,bL1=zip(*Z) + plt.figure(figsize=(18, 3)) + plt.subplot(131) + plt.plot(aL) + plt.plot(aL1,'r') + plt.grid() + plt.title('tf-%s'%(name)) + plt.subplot(133) + plt.plot(bL1,'g') + plt.plot(aL1,'r') + plt.grid() + plt.title('compare') + plt.subplot(132) + bL1=list(bL1) + bL1.sort() + plt.plot(bL) + plt.plot(bL1,'g') + plt.grid() + plt.title('nn-%s'%(name)) + plt.show() + diff --git a/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local.c b/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local.c new file mode 100644 index 000000000..5c514b21b --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local.c @@ -0,0 +1,1689 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Notice: + * Code in this file inlcudes derivative works from CMSIS + * Please check the LICENSE file for detial. + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + * 2019-03-19 Jianjia Ma Local C implementation partly from CMSIS-NN + * 2019-06-19 Jianjia Ma Implement CHW functions + */ + +#include "nnom.h" +#include "nnom_local.h" + +// modified from CMSIS-NN test_ref +void local_avepool_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift); + } + } + } +} + +void local_avepool_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input 
image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t ch_offset; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y; + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[ch_offset + (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + Im_out[i_ch_in*dim_im_out_x*dim_im_out_y + (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift); + } + } + } +} + +// modified from CMSIS-NN test_ref +void local_maxpool_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int max = -129; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max) + { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max; + } + } + } +} + +void local_maxpool_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // 
output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t ch_offset; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + ch_offset = i_ch_in * dim_im_out_x * dim_im_out_y; + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int max = -129; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + if (Im_in[i_ch_in * dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)] > max) + { + max = Im_in[i_ch_in * dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)]; + } + } + } + } + Im_out[ch_offset+(i_x + i_y * dim_im_out_x)] = max; + } + } + } +} + +// temporary for the thesis +// shift according to the maximum +void local_sumpool_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t *buf = (int32_t *)bufferA; + // stage2 + // int32_t max_abs = 0; + // int32_t output_shift; + // size_t output_size = dim_im_out_x * dim_im_out_x * ch_im_in; + + // save in 32bit + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + } + } + } + // 32bit + buf[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum; + } + } + } + + // // find max amount results + // for (int i = 0; i < output_size; i++) + // { + // int32_t val = buf[i]; + // if (val < 0) + // val = -val; + // if (val > max_abs) + // max_abs = val; + // } + // // find best shift to cover the max + // for (output_shift = 0;; output_shift++) + // { + // if (127 * (1 + output_shift) >= max_abs) + // break; + // } + + // // shift the results + // for (int i = 0; i < output_size; i++) + // { + // Im_out[i] = buf[i] >> output_shift; + // } + //return output_shift; +} + +// temporary for the thesis +// shift according to the maximum +void local_sumpool_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const 
uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t *buf = (int32_t *)bufferA; + int32_t i_ch_offset, o_ch_offset; + // stage2 + // int32_t max_abs = 0; + // int32_t output_shift; + // size_t output_size = dim_im_out_x * dim_im_out_x * ch_im_in; + + // save in 32bit + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + i_ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y; + o_ch_offset = i_ch_in*dim_im_out_x*dim_im_out_y; + + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_offset + (k_x + k_y * dim_im_in_x)]; + } + } + } + // 32bit + buf[o_ch_offset + (i_x + i_y * dim_im_out_x)] = sum; + } + } + } + + // // find max amount results + // for (int i = 0; i < output_size; i++) + // { + // int32_t val = buf[i]; + // if (val < 0) + // val = -val; + // if (val > max_abs) + // max_abs = val; + // } + // // find best shift to cover the max + // for (output_shift = 0;; output_shift++) + // { + // if (127 * (1 + output_shift) >= max_abs) + // break; + // } + + // // shift the results + // for (int i = 0; i < output_size; i++) + // { + // Im_out[i] = buf[i] >> output_shift; + // } + //return output_shift; +} + +// customised up sample pooling +void local_up_sampling_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_x, i_y; + + // for loop for each pixel in input image. + for (i_y = 0; i_y < dim_im_in_y; i_y++) + { + for (i_x = 0; i_x < dim_im_in_x; i_x++) + { + // copy all the channels together. 
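+            /* Editor's note (illustrative): this implements nearest-neighbour up-sampling,
+             * e.g. with a 2x2 kernel every HWC input pixel (i_y, i_x) appears to be
+             * replicated into a 2x2 block of the output, keeping all ch_im_in channel
+             * values unchanged. */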
+ const q7_t *p_in = Im_in + (i_y * dim_im_in_x + i_x ) * ch_im_in; + q7_t *pout = Im_out + (i_y * dim_im_in_x * dim_kernel_x * dim_kernel_y + i_x * dim_kernel_y) * ch_im_in; + + // copy along x axis + for(int i = 0; i> out_shift[shift_idx]), 8); + } + } + } +} + +void local_convolve_CHW_q7_nonsquare(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i, j, k, l, m, n; + long conv_out; + int in_row, in_col; + int shift_idx, shift_steps; + if(q_type == NNOM_QTYPE_PER_AXIS) + shift_steps = 1; + else + shift_steps = 0; + + for(i = 0, shift_idx = 0; i < ch_im_out; i++, shift_idx += shift_steps) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + if(bias) + conv_out = ((q31_t)(bias[i]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t) NNOM_ROUND(out_shift[shift_idx]); + + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m * dilation_y - padding_y; + in_col = stride_x * k + n * dilation_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) + l * dim_im_in_x * dim_im_in_y] * + wt[(m * dim_kernel_x + n) * ch_im_in * ch_im_out + l * ch_im_out + i]; + } + } + } + } + Im_out[i * dim_im_out_x * dim_im_out_y + (j * dim_im_out_x + k)] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8); + } + } + } +} + +#define FALSE 0 +#define TRUE 1 + +static int alg_deconv2d_calculate_position( + int pos, + int stride, + int padding, + int dim_kernel, + int dim_in, + int* in_start, + int* kernel_start, + int* kernel_end) +{ + int is_zero = FALSE; + int of, adj; + is_zero = FALSE; + *in_start = pos/stride; + of = pos%stride; + *kernel_start = padding - of; + if(*kernel_start >= 0) { + adj = MIN(*in_start, *kernel_start/stride); + *kernel_start -= adj*stride; + *in_start -= adj; + } else { + adj = -*kernel_start + dim_kernel; + if(adj<=stride) { + is_zero = TRUE; + } else { + adj = MIN(dim_in-1-*in_start, adj/stride); + *kernel_start += adj*stride; + *in_start += adj; + } + } + of = dim_kernel - 1 - *kernel_start; + adj = MIN(dim_in-1-*in_start, of/stride); + *kernel_end = *kernel_start + adj*stride; + + return is_zero; +} + +void local_conv_trans_HWC_q7_nonsquare(const int8_t * Im_in, + const uint16_t dim_im_in_x, // input image dimention x + const 
uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +// { +// int ox, oy, oc, ky, kx, kc, ix, iy; +// int conv_out; +// int in_pix_loc, wt_loc; + +// (void)dilation_y; +// (void)dilation_x; + +// // padding and stride are applied to output +// for (oc = 0; oc < ch_im_out; oc++) +// { +// for (oy = 0; oy < dim_im_out_y; oy++) +// { +// for (ox = 0; ox < dim_im_out_x; ox++) +// { +// conv_out = ((q31_t)(bias[oc]) << bias_shift) + NNOM_ROUND(out_shift); + +// for (ky = 0; ky < dim_kernel_y; ky++) +// { +// for (kx = 0; kx < dim_kernel_x; kx++) +// { +// // input y, input x location +// iy = oy / stride_y + ky - padding_y; +// ix = ox / stride_x + kx - padding_x; + +// if(ix >= 0 && iy >= 0 && ix < dim_im_in_y && iy< dim_im_in_y) +// { +// in_pix_loc = (iy * dim_im_in_x + ix) * ch_im_in; +// wt_loc = oc * ch_im_in * dim_kernel_y * dim_kernel_x + (ky * dim_kernel_x + kx) * ch_im_in; + +// for (kc = 0; kc < ch_im_in; kc++) +// { +// conv_out += Im_in[in_pix_loc + kc] * wt[wt_loc + kc]; +// } +// } +// } +// } + +// Im_out[oc + (oy * dim_im_out_x + ox) * ch_im_out] = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8); +// } +// } +// } +// } + +{ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + int kernel_start_x,kernel_end_x; + int kernel_start_y,kernel_end_y; + int in_row_start, in_col_start; + int is_zero; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out_y; j++) { + is_zero = alg_deconv2d_calculate_position(j, stride_y, padding_y, dim_kernel_y, + dim_im_in_y, &in_row_start, &kernel_start_y, &kernel_end_y); + + if(is_zero) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + conv_out = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8); + for (k = 0; k < dim_im_out_x; k++) { + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) conv_out; + } + continue; + } + + for (k = 0; k < dim_im_out_x; k++) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + + is_zero = alg_deconv2d_calculate_position(k, stride_x, padding_x, dim_kernel_x, + dim_im_in_x, &in_col_start, &kernel_start_x, &kernel_end_x); + + if(is_zero) { + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = conv_out; + continue; + } + + for (m = kernel_start_y, in_row = in_row_start; m <= kernel_end_y; m+=stride_y, in_row++) { + for (n = kernel_start_x, in_col = in_col_start; n <= kernel_end_x; n+=stride_x, in_col++) { + if ((in_row >= 0) && (in_col >= 0) && + (in_row < dim_im_in_y) && (in_col < dim_im_in_x)) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l]; + } + } + } + } + + Im_out[i 
+ (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8); + } + } + } +} + +void local_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult; + int i_ker_y, i_ker_x; + int i_out = 0; + int shift_idx, shift_steps; + int ch_mult = ch_im_out / ch_im_in; + q31_t conv_out; + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + const int32_t base_idx_y = stride_y * i_out_y - padding_y; + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - padding_x; + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for(i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + i_ch_out = i_ch_mult + i_ch_in * ch_mult; + int32_t ker_y_start = MAX(0, -(base_idx_y-(dilation_y-1))/dilation_y); + int32_t ker_x_start = MAX(0, -(base_idx_x-(dilation_x-1))/dilation_x); + int32_t ker_y_end = MIN(dim_kernel_y, (dim_im_in_y - base_idx_y + (dilation_y-1))/dilation_y); + int32_t ker_x_end = MIN(dim_kernel_x, (dim_im_in_x - base_idx_x + (dilation_x-1))/dilation_x); + + shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? 
i_ch_out : 0; + if (bias) + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y * dilation_y; + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x * dilation_x; + int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) * ch_im_in + i_ch_in; + int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * (ch_im_in * ch_mult) + i_ch_out; + conv_out += Im_in[in_pix_loc] * wt[wt_loc]; + } + } + Im_out[i_out++] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8); + } + } + } + } +} + +void local_depthwise_separable_conv_CHW_q7_nonsquare(const q7_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult; + int i_ker_y, i_ker_x; + int i_out = 0; + int shift_idx, shift_steps; + int ch_mult = ch_im_out / ch_im_in; + q31_t conv_out; + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + const int32_t base_idx_y = stride_y * i_out_y - padding_y; + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - padding_x; + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + i_ch_out = i_ch_mult + i_ch_in * ch_mult; + int32_t ker_y_start = MAX(0, -(base_idx_y-(dilation_y-1))/dilation_y); + int32_t ker_x_start = MAX(0, -(base_idx_x-(dilation_x-1))/dilation_x); + int32_t ker_y_end = MIN(dim_kernel_y, (dim_im_in_y - base_idx_y + (dilation_y-1))/dilation_y); + int32_t ker_x_end = MIN(dim_kernel_x, (dim_im_in_x - base_idx_x + (dilation_x-1))/dilation_x); + + shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? 
i_ch_out : 0; + if (bias) + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y * dilation_y; + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x * dilation_x; + int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) + i_ch_in * dim_im_in_x * dim_im_in_y; + int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out; + conv_out += Im_in[in_pix_loc] * wt[wt_loc]; + } + } + Im_out[i_ch_out * dim_im_out_x * dim_im_out_y + (i_out_y * dim_im_out_x + i_out_x)] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8); + } + } + } + } + +} + + +void local_zero_padding_HWC_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y) // output image dimension y +{ + int i, size; + q7_t * p_out = Im_out; + + // top rows + size = dim_im_out_x*ch_im_in*padding_top; + nnom_memset(p_out, 0, size); + p_out += size; + + // middle + for(i=0; i> 2; + const q7_t *pB = pM; + const q7_t *pA; + q7_t *pO = pOut; + + while (rowCnt) + { + pA = pV; + q31_t sum = (q31_t) NNOM_ROUND(out_shift); + q31_t sum2 = (q31_t) NNOM_ROUND(out_shift); + q31_t sum3 = (q31_t) NNOM_ROUND(out_shift); + q31_t sum4 = (q31_t) NNOM_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + while (colCnt) + { + q7_t inA1 = *pA++; + q7_t inA3 = *pA++; + q7_t inA2 = *pA++; + q7_t inA4 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum += inA3 * inB1 + inA4 * inB2; + sum2 += inA3 * inB3 + inA4 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA3 * inB1 + inA4 * inB2; + sum4 += inA3 * inB3 + inA4 * inB4; + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q7_t)__NNOM_SSAT((sum >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum2 >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum3 >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out = (q31_t) NNOM_ROUND (out_shift); + pA = pV; + for (int j = 0; j < dim_vec; j++) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + + rowCnt--; + } +} + +void local_dot_q7(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t 
num_of_rows, // numCol of A + const uint16_t out_shift, // amount of right-shift for output + q7_t *pOut) // output operand) +{ + for (int i = 0; i < num_of_rows; i++) + { + int ip_out = (q31_t) NNOM_ROUND(out_shift); + for (int j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + } +} + +void local_fully_connected_q7_opt(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const q7_t *bias, q7_t *pOut, // output operand + q15_t *vec_buffer) +{ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q7_t *pA; + q7_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) + { + pA = pV; + q31_t sum; + q31_t sum2; + q31_t sum3; + q31_t sum4; + uint16_t colCnt = dim_vec >> 2; + + if(bias) + { + sum = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum2 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum3 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum4 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + } + else + { + sum = (q31_t) NNOM_ROUND(out_shift); + sum2 = (q31_t) NNOM_ROUND(out_shift); + sum3 = (q31_t) NNOM_ROUND(out_shift); + sum4 = (q31_t) NNOM_ROUND(out_shift); + } + + while (colCnt) + { + q7_t inA1 = *pA++; + q7_t inA3 = *pA++; + q7_t inA2 = *pA++; + q7_t inA4 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum += inA3 * inB1 + inA4 * inB2; + sum2 += inA3 * inB3 + inA4 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA3 * inB1 + inA4 * inB2; + sum4 += inA3 * inB3 + inA4 * inB4; + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q7_t)__NNOM_SSAT((sum >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum2 >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum3 >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out; + if(bias) + ip_out=((q31_t)(*bias++) << bias_shift) + NNOM_ROUND(out_shift); + else + ip_out=(q31_t)NNOM_ROUND(out_shift); + + pA = pV; + for (int j = 0; j < dim_vec; j++) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + + rowCnt--; + } +} + +void local_fully_connected_q7(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const q7_t *bias, q7_t *pOut, // output operand + q15_t *vec_buffer) +{ + if(bias) + { + for (int i = 0; i < num_of_rows; i++) + { + int ip_out = ((q31_t)(*bias++) << 
bias_shift) + NNOM_ROUND(out_shift); + for (int j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + } + } + else + { + for (int i = 0; i < num_of_rows; i++) + { + int ip_out = (q31_t)NNOM_ROUND(out_shift); + for (int j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + } + } +} + + +void local_softmax_q7(const q7_t *vec_in, const uint32_t dim_vec, q7_t *p_out) +{ + q31_t sum; + int32_t i; + uint8_t shift; + q15_t base; + base = -257; + + /* We first search for the maximum */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + base = vec_in[i]; + } + } + + /* + * So the base is set to max-8, meaning + * that we ignore really small values. + * anyway, they will be 0 after shrinking to q7_t. + */ + base = base - 8; + + sum = 0; + + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + shift = (uint8_t)__NNOM_USAT(vec_in[i] - base, 5); + sum += 0x1 << shift; + } + } + + /* This is effectively (0x1 << 20) / sum */ + int output_base = 0x100000 / sum; + + /* + * Final confidence will be output_base >> ( 13 - (vec_in[i] - base) ) + * so 128 (0x1<<7) -> 100% confidence when sum = 0x1 << 8, output_base = 0x1 << 12 + * and vec_in[i]-base = 8 + */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + /* Here minimum value of 13+base-vec_in[i] will be 5 */ + shift = (uint8_t)__NNOM_USAT(13 + base - vec_in[i], 5); + p_out[i] = (q7_t)__NNOM_SSAT((output_base >> shift), 8); + } + else + { + p_out[i] = 0; + } + } +} + + +// hard sigmoid, +// y=-1 if x < -2.5 +// y=1 if x > 2.5 +// otherwise y = 0.2 * x + 0.5 (y=0.20315 * x + 0.5) +void local_hard_sigmoid_q7(q7_t *data, uint32_t size, int16_t dec_bit) +{ + int16_t limit = 2.5f * (1 << dec_bit)-1; + int16_t offset = 64; // 0.5 * 128 + int16_t mult = 26; // 0.2 * 128 + + // int bit >= 0 + for(int i=0; i= limit) + data[i] = 127; + else + { + data[i] = ((int16_t)(data[i] * mult) >> dec_bit) + offset; + } + } + } + +// hard tanh +// y=-1 if x < -1 +// y=1 if x > 1 +// otherwise y = x +void local_hard_tanh_q7(q7_t *data, uint32_t size, int16_t dec_bit) +{ + int16_t int_bit = 7 - dec_bit; + int16_t limit = 1 << dec_bit; + + if(dec_bit == 7) + return; + + // int bit < 0 + if(int_bit < 0) + for(int i=0; i= limit) + data[i] = 127; + else + { + data[i] = data[i] >> (-int_bit); + } + } + else + // int bit >= 0 + for(int i=0; i= limit) + data[i] = 127; + else + { + data[i] = data[i] << int_bit; + } + } +} + +void local_sigmoid_q7(q7_t *data, uint32_t size, int16_t int_width) +{ + uint32_t i = size; + q7_t *pIn = data; + q7_t *pOut = data; + q7_t in; + q7_t out; + uint16_t shift_size = 3 - int_width; + // saturation if int bit too large + if(int_width > 3) + { + while (i) + { + if(*pIn++ > 0) + *pOut++ = 127; + else + *pOut++ = 0; + i--; + } + } + // otherwise search table + else + { + while (i) + { + in = *pIn++; + out = nnom_sigmoid_table_q7[(uint8_t)(in >> shift_size)]; + *pOut++ = out; + i--; + } + } +} + +void local_tanh_q7(q7_t *data, uint32_t size, int16_t int_width) +{ + uint32_t i = size; + q7_t *pIn = data; + q7_t *pOut = data; + q7_t in; + q7_t out; + uint16_t shift_size = 3 - int_width; + + // saturation if int bit too large + if(int_width > 3) + { + while (i) + { + in = *pIn++; + if(in > 0) + *pOut++ = 127; + else if ( in == 0) + *pOut++ = 0; + else + *pOut++ = -128; + i--; + } + } + // otherwise search table + else + { + while (i) + { + in = *pIn++; + out = 
nnom_tanh_table_q7[(uint8_t)(in >> shift_size)]; + *pOut++ = out; + i--; + } + } +} + +void local_relu_q7(q7_t *data, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + data[i] = 0; + } +} + +// alpha in q7 format with dec_bit=7 +void local_leaky_relu_q7(q7_t *data, q7_t alpha, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + { + data[i] = data[i] * alpha / 128; + } + } +} + +// alpha in q7 format with dec_bit=7 +// max and threshold has the same Q format with the activation +void local_adv_relu_q7(q7_t *data, q7_t negative_slope, q7_t max, q7_t threshold, uint32_t size) +{ + uint32_t i; + for (i = 0; i < size; i++) + { + // `f(x) = max_value` for `x >= max_value`, + // `f(x) = x` for `threshold <= x < max_value`, + // `f(x) = alpha * (x - threshold)` otherwise. + + if(data[i] > max) + data[i] = max; + if (data[i] < threshold) + data[i] = (data[i] - threshold) * negative_slope / 128; + } +} + +// matrix ops +void local_mult_q7(q7_t *pSrcA, + q7_t *pSrcB, + q7_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t product = pSrcA[i] * pSrcB[i]; + pDst[i] = (q7_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + +void local_add_q7(q7_t *pSrcA, + q7_t *pSrcB, + q7_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t sum = pSrcA[i] + pSrcB[i]; + pDst[i] = (q7_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + +void local_sub_q7(q7_t *pSrcA, + q7_t *pSrcB, + q7_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t sub = pSrcA[i] - pSrcB[i]; + pDst[i] = (q7_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + + + +void local_multiple_add_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src) +{ + uint32_t i, blk; + q31_t sum; + + for (i = 0; i < block_size; i++) + { + sum = 0; + for(blk=0; blk < num_block; blk++) + sum += p_src[blk][i]; + p_dst[i] = (q7_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + +void local_multiple_mult_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src) +{ + uint32_t i, blk; + q31_t product; + + for (i = 0; i < block_size; i++) + { + product = 1; + for(blk=0; blk < num_block; blk++) + product *= p_src[blk][i]; + p_dst[i] = (q7_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + +void local_multiple_sub_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src) +{ + uint32_t i, blk; + q31_t sub; + + for (i = 0; i < block_size; i++) + { + sub = p_src[0][i]; + for(blk=1; blk < num_block; blk++) + sub -= p_src[blk][i]; + p_dst[i] = (q7_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + + +void local_q7_to_q15_no_shift(const q7_t *src, q15_t *des, uint32_t size) +{ + // simple unloop + uint32_t count = size/8; + while (count-- > 0) + { + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + } + count = size%8; + while(count-- > 0) + *des++ = (q15_t)*src++; +} + +void local_q7_to_q15(const q7_t *src, q15_t *des, uint32_t size) +{ + // simple unloop + 
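// note: the left shift of 8 promotes each q7 sample to q15 without changing the
+ // value it represents, e.g. a q0.7 sample of 64 (0.5) becomes 16384 (0.5) in q0.15;
+ // the _no_shift variant above only widens the storage type and leaves the Q format
+ // for the caller to track. +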
uint32_t count = size/8; + while (count-- > 0) + { + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + } + count = size%8; + while(count-- > 0) + *des++ = (q15_t)*src++<<8; +} + +// right shift q15 to q7 +void local_q15_to_q7(const q15_t *src, q7_t *des, uint32_t shift, uint32_t size) +{ + while(size-- >0) + { + *des = *src >> shift; + des++; + src++; + } +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local_q15.c b/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local_q15.c new file mode 100644 index 000000000..d78c3efc0 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local_q15.c @@ -0,0 +1,1602 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Notice: + * Code in this file inlcudes derivative works from CMSIS + * Please check the LICENSE file for detial. + * + * Change Logs: + * Date Author Notes + * 2020-10-05 Jianjia Ma The first version + */ + +#include "nnom.h" +#include "nnom_local.h" + +// modified from CMSIS-NN test_ref +void local_avepool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift); + } + } + } +} + +void local_avepool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + 
int16_t k_x, k_y; + int32_t ch_offset; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y; + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[ch_offset + (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + Im_out[i_ch_in*dim_im_out_x*dim_im_out_y + (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift); + } + } + } +} + +// modified from CMSIS-NN test_ref +void local_maxpool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int max = -32768; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max) + { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max; + } + } + } +} + +void local_maxpool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t ch_offset; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + ch_offset = i_ch_in * dim_im_out_x * dim_im_out_y; + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int max = -32768; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + if (Im_in[i_ch_in * 
dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)] > max) + { + max = Im_in[i_ch_in * dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)]; + } + } + } + } + Im_out[ch_offset+(i_x + i_y * dim_im_out_x)] = max; + } + } + } +} + +// shift according to the maximum +void local_sumpool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t *buf = (int32_t *)bufferA; + // stage2 + // int32_t max_abs = 0; + // int32_t output_shift; + // size_t output_size = dim_im_out_x * dim_im_out_x * ch_im_in; + + // save in 32bit + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + } + } + } + // 32bit + buf[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = (q15_t)__NNOM_SSAT((sum >> output_shift), 16); + } + } + } + + // // find max amount results + // for (int i = 0; i < output_size; i++) + // { + // int32_t val = buf[i]; + // if (val < 0) + // val = -val; + // if (val > max_abs) + // max_abs = val; + // } + // // find best shift to cover the max + // for (output_shift = 0;; output_shift++) + // { + // if (127 * (1 + output_shift) >= max_abs) + // break; + // } + + // // shift the results + // for (int i = 0; i < output_size; i++) + // { + // Im_out[i] = buf[i] >> output_shift; + // } + //return output_shift; +} + +// temporary for the thesis +// shift according to the maximum +void local_sumpool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t *buf = (int32_t *)bufferA; + int32_t i_ch_offset, o_ch_offset; + // // stage2 + // int32_t max_abs = 0; + // int32_t output_shift; + // size_t output_size = dim_im_out_x * dim_im_out_x * 
ch_im_in; + + // save in 32bit + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + i_ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y; + o_ch_offset = i_ch_in*dim_im_out_x*dim_im_out_y; + + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_offset + (k_x + k_y * dim_im_in_x)]; + } + } + } + // 32bit + buf[o_ch_offset + (i_x + i_y * dim_im_out_x)] = (q15_t)__NNOM_SSAT((sum >> output_shift), 16); + } + } + } + + // // find max amount results + // for (int i = 0; i < output_size; i++) + // { + // int32_t val = buf[i]; + // if (val < 0) + // val = -val; + // if (val > max_abs) + // max_abs = val; + // } + // // find best shift to cover the max + // for (output_shift = 0;; output_shift++) + // { + // if (127 * (1 + output_shift) >= max_abs) + // break; + // } + + // // shift the results + // for (int i = 0; i < output_size; i++) + // { + // Im_out[i] = buf[i] >> output_shift; + // } + //return output_shift; +} + +// customised up sample pooling +void local_up_sampling_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_x, i_y; + + // for loop for each pixel in input image. + for (i_y = 0; i_y < dim_im_in_y; i_y++) + { + for (i_x = 0; i_x < dim_im_in_x; i_x++) + { + // copy all the channels together. 
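+ // note: this implements nearest-neighbour up-sampling: every input pixel, with all
+ // of its channels, is replicated into a dim_kernel_y x dim_kernel_x block of the
+ // output, so a 2x2 "kernel" simply doubles both spatial dimensions.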
+ const q15_t *p_in = Im_in + (i_y * dim_im_in_x + i_x ) * ch_im_in; + q15_t *pout = Im_out + (i_y * dim_im_in_x * dim_kernel_x * dim_kernel_y + i_x * dim_kernel_y) * ch_im_in; + + // copy along x axis + for(int i = 0; i> out_shift[shift_idx]), 16); + } + } + } +} + +void local_convolve_CHW_q15_nonsquare(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i, j, k, l, m, n; + int64_t conv_out; + int in_row, in_col; + int shift_idx, shift_steps; + if(q_type == NNOM_QTYPE_PER_AXIS) + shift_steps = 1; + else + shift_steps = 0; + + for(i = 0, shift_idx = 0; i < ch_im_out; i++, shift_idx += shift_steps) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + if(bias) + conv_out = ((q31_t)(bias[i]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m * dilation_y - padding_y; + in_col = stride_x * k + n * dilation_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) + l * dim_im_in_x * dim_im_in_y] * + wt[(m * dim_kernel_x + n) * ch_im_in * ch_im_out + l * ch_im_out + i]; + } + } + } + } + Im_out[i * dim_im_out_x * dim_im_out_y + (j * dim_im_out_x + k)] = (q15_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 16); + } + } + } +} + +#define FALSE 0 +#define TRUE 1 + +static int alg_deconv2d_calculate_position( + int pos, + int stride, + int padding, + int dim_kernel, + int dim_in, + int* in_start, + int* kernel_start, + int* kernel_end) +{ + int is_zero = FALSE; + int of, adj; + is_zero = FALSE; + *in_start = pos/stride; + of = pos%stride; + *kernel_start = padding - of; + if(*kernel_start >= 0) { + adj = MIN(*in_start, *kernel_start/stride); + *kernel_start -= adj*stride; + *in_start -= adj; + } else { + adj = -*kernel_start + dim_kernel; + if(adj<=stride) { + is_zero = TRUE; + } else { + adj = MIN(dim_in-1-*in_start, adj/stride); + *kernel_start += adj*stride; + *in_start += adj; + } + } + of = dim_kernel - 1 - *kernel_start; + adj = MIN(dim_in-1-*in_start, of/stride); + *kernel_end = *kernel_start + adj*stride; + + return is_zero; +} + +void local_conv_trans_HWC_q15_nonsquare(const int8_t * Im_in, + const uint16_t dim_im_in_x, // input image dimention x + 
const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +// { +// int ox, oy, oc, ky, kx, kc, ix, iy; +// int conv_out; +// int in_pix_loc, wt_loc; + +// (void)dilation_y; +// (void)dilation_x; + +// // padding and stride are applied to output +// for (oc = 0; oc < ch_im_out; oc++) +// { +// for (oy = 0; oy < dim_im_out_y; oy++) +// { +// for (ox = 0; ox < dim_im_out_x; ox++) +// { +// conv_out = ((q31_t)(bias[oc]) << bias_shift) + NNOM_ROUND(out_shift); + +// for (ky = 0; ky < dim_kernel_y; ky++) +// { +// for (kx = 0; kx < dim_kernel_x; kx++) +// { +// // input y, input x location +// iy = oy / stride_y + ky - padding_y; +// ix = ox / stride_x + kx - padding_x; + +// if(ix >= 0 && iy >= 0 && ix < dim_im_in_y && iy< dim_im_in_y) +// { +// in_pix_loc = (iy * dim_im_in_x + ix) * ch_im_in; +// wt_loc = oc * ch_im_in * dim_kernel_y * dim_kernel_x + (ky * dim_kernel_x + kx) * ch_im_in; + +// for (kc = 0; kc < ch_im_in; kc++) +// { +// conv_out += Im_in[in_pix_loc + kc] * wt[wt_loc + kc]; +// } +// } +// } +// } + +// Im_out[oc + (oy * dim_im_out_x + ox) * ch_im_out] = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8); +// } +// } +// } +// } + +{ + int i, j, k, l, m, n; + int64_t conv_out; + int in_row, in_col; + int kernel_start_x,kernel_end_x; + int kernel_start_y,kernel_end_y; + int in_row_start, in_col_start; + int is_zero; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out_y; j++) { + is_zero = alg_deconv2d_calculate_position(j, stride_y, padding_y, dim_kernel_y, + dim_im_in_y, &in_row_start, &kernel_start_y, &kernel_end_y); + + if(is_zero) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + conv_out = (q15_t) __NNOM_SSAT((conv_out >> out_shift), 16); + for (k = 0; k < dim_im_out_x; k++) { + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t) conv_out; + } + continue; + } + + for (k = 0; k < dim_im_out_x; k++) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + + is_zero = alg_deconv2d_calculate_position(k, stride_x, padding_x, dim_kernel_x, + dim_im_in_x, &in_col_start, &kernel_start_x, &kernel_end_x); + + if(is_zero) { + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = conv_out; + continue; + } + + for (m = kernel_start_y, in_row = in_row_start; m <= kernel_end_y; m+=stride_y, in_row++) { + for (n = kernel_start_x, in_col = in_col_start; n <= kernel_end_x; n+=stride_x, in_col++) { + if ((in_row >= 0) && (in_col >= 0) && + (in_row < dim_im_in_y) && (in_col < dim_im_in_x)) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l]; + } + } + } + 
} + + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t) __NNOM_SSAT((conv_out >> out_shift), 16); + } + } + } +} + + + + +void local_depthwise_separable_conv_HWC_q15_nonsquare(const q15_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult; + int i_ker_y, i_ker_x; + int i_out = 0; + int shift_idx; + int ch_mult = ch_im_out / ch_im_in; + int64_t conv_out; + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + const int32_t base_idx_y = stride_y * i_out_y - padding_y; + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - padding_x; + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for(i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + i_ch_out = i_ch_mult + i_ch_in * ch_mult; + int32_t ker_y_start = MAX(0, -base_idx_y); + int32_t ker_x_start = MAX(0, -base_idx_x); + int32_t ker_y_end = MIN(dim_kernel_y, dim_im_in_y - base_idx_y); + int32_t ker_x_end = MIN(dim_kernel_x, dim_im_in_x - base_idx_x); + + shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? 
i_ch_out : 0; + if (bias) + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y * dilation_y; + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x * dilation_x; + int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) * ch_im_in + i_ch_in; + int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * (ch_im_in * ch_mult) + i_ch_out; + conv_out += Im_in[in_pix_loc] * wt[wt_loc]; + } + } + Im_out[i_out++] = (q15_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 16); + } + } + } + } +} + +void local_depthwise_separable_conv_CHW_q15_nonsquare(const q15_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult; + int i_ker_y, i_ker_x; + int shift_idx; + int ch_mult = ch_im_out / ch_im_in; + int64_t conv_out; + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + const int32_t base_idx_y = stride_y * i_out_y - padding_y; + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - padding_x; + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + i_ch_out = i_ch_mult + i_ch_in * ch_mult; + int32_t ker_y_start = MAX(0, -base_idx_y); + int32_t ker_x_start = MAX(0, -base_idx_x); + int32_t ker_y_end = MIN(dim_kernel_y, dim_im_in_y - base_idx_y); + int32_t ker_x_end = MIN(dim_kernel_x, dim_im_in_x - base_idx_x); + + shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? 
i_ch_out : 0; + if (bias) + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y * dilation_y; + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x * dilation_x; + int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) + i_ch_in * dim_im_in_x * dim_im_in_y; + int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out; + conv_out += Im_in[in_pix_loc] * wt[wt_loc]; + } + } + Im_out[i_ch_out * dim_im_out_x * dim_im_out_y + (i_out_y * dim_im_out_x + i_out_x)] = + (q15_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 16); + } + } + } + } + +} + +void local_zero_padding_HWC_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y) // output image dimension y +{ + int i, size; + q15_t * p_out = Im_out; + + // top rows + size = dim_im_out_x*ch_im_in*padding_top; + nnom_memset(p_out, 0, size*sizeof(q15_t)); + p_out += size; + + // middle + for(i=0; i> out_shift), 16); + } +} + +void local_dot_q15_opt(const q15_t * pV, + const q15_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t out_shift, + q15_t * pOut) +{ + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q15_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + + while (rowCnt) + { + int64_t sum = (q31_t) NNOM_ROUND(out_shift); + int64_t sum2 = (q31_t) NNOM_ROUND(out_shift); + int64_t sum3 = (q31_t) NNOM_ROUND(out_shift); + int64_t sum4 = (q31_t) NNOM_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 1; + pA = pV; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + q15_t inB1 = *pB++; + q15_t inB2 = *pB++; + sum += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum2 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum3 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum4 += inA1 * inB1 + inA2 * inB2; + + colCnt--; + } + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inA = *pA++; + q15_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + colCnt--; + } + *pO++ = (q15_t) __NNOM_SSAT((sum >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int64_t ip_out = (q31_t) + NNOM_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q15_t inA = *pA++; + q15_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t) __NNOM_SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } +} + +void local_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t 
num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q15_t * pOut, + q15_t * vec_buffer) +{ + + (void)vec_buffer; + + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) + { + int64_t sum; + int64_t sum2; + int64_t sum3; + int64_t sum4; + uint16_t colCnt = dim_vec >> 1; + + // quick and dirty to support none bias fully connected + if(bias) + { + sum = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum2 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum3 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum4 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + } + else + { + sum = (q31_t) NNOM_ROUND(out_shift); + sum2 = (q31_t) NNOM_ROUND(out_shift); + sum3 = (q31_t) NNOM_ROUND(out_shift); + sum4 = (q31_t) NNOM_ROUND(out_shift); + } + + pA = pV; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + colCnt--; + } + + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q15_t) __NNOM_SSAT((sum >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int64_t ip_out; + int j; + + // quick and dirty to support none bias fully connected + if(bias) + ip_out = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + else + ip_out = (q31_t)NNOM_ROUND(out_shift); + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q15_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t) __NNOM_SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } +} + +void local_fully_connected_mat_q7_vec_q15(const q15_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q15_t * pOut, + q15_t * vec_buffer) +{ + int i, j; + + // a quick solution for none-bias dot. + if(bias == NULL) + { + for (i = 0; i < num_of_rows; i++) + { + int64_t ip_out = (q31_t) NNOM_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t) __NNOM_SSAT((ip_out >> out_shift), 16); + } + } + else + { + for (i = 0; i < num_of_rows; i++) + { + int64_t ip_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t) __NNOM_SSAT((ip_out >> out_shift), 16); + } + } +} + +// This softmax is a copy from ARM CMSIS implimentation as it was efficient and written in pure-C. 
+// original implementation: https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q15.c +void local_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out) +{ + q31_t sum; + int16_t i; + uint8_t shift; + q31_t base; + base = -1 * 0x100000; + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + base = vec_in[i]; + } + } + + /* we ignore really small values + * anyway, they will be 0 after shrinking + * to q15_t + */ + base = base - 16; + + sum = 0; + + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + shift = (uint8_t)__NNOM_USAT(vec_in[i] - base, 5); + sum += 0x1 << shift; + } + } + + /* This is effectively (0x1 << 32) / sum */ + int64_t div_base = 0x100000000LL; + int output_base = (int32_t)(div_base / sum); + + /* Final confidence will be output_base >> ( 17 - (vec_in[i] - base) ) + * so 32768 (0x1<<15) -> 100% confidence when sum = 0x1 << 16, output_base = 0x1 << 16 + * and vec_in[i]-base = 16 + */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + /* Here minimum value of 17+base-vec[i] will be 1 */ + shift = (uint8_t)__NNOM_USAT(17+base-vec_in[i], 5); + p_out[i] = (q15_t) __NNOM_SSAT((output_base >> shift), 16); + } else + { + p_out[i] = 0; + } + } + +} + + +// hard sigmoid, +// y=-1 if x < -2.5 +// y=1 if x > 2.5 +// otherwise y = 0.2 * x + 0.5 (y=0.20315 * x + 0.5) +void local_hard_sigmoid_q15(q15_t *data, uint32_t size, int16_t dec_bit) +{ + int16_t limit = 2.5f * (1 << dec_bit)-1; + int16_t offset = 16384; // 0.5 * 32768 + int16_t mult = 6554; // 0.2 * 32768 + + // int bit >= 0 + for(int i=0; i= limit) + data[i] = 32767; + else + { + data[i] = ((int32_t)(data[i] * mult) >> dec_bit) + offset; + } + } + } + +// hard tanh +// y=-1 if x < -1 +// y=1 if x > 1 +// otherwise y = x +void local_hard_tanh_q15(q15_t *data, uint32_t size, int16_t dec_bit) +{ + int16_t int_bit = 15 - dec_bit; + int16_t limit = 1 << dec_bit; + + if(dec_bit == 15) + return; + + // int bit < 0 + if(int_bit < 0) + for(int i=0; i= limit) + data[i] = 32767; + else + { + data[i] = data[i] >> (-int_bit); + } + } + else + // int bit >= 0 + for(int i=0; i= limit) + data[i] = 32767; + else + { + data[i] = data[i] << int_bit; + } + } +} + +void local_relu_q15(q15_t *data, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + data[i] = 0; + } +} + +// alpha in q7 format with dec_bit=7 +void local_leaky_relu_q15(q15_t *data, q7_t alpha, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + { + data[i] = data[i] * alpha / 128; + } + } +} + +// alpha in q7 format with dec_bit=7 +// max and threshold has the same Q format with the activation +void local_adv_relu_q15(q15_t *data, q7_t negative_slope, q15_t max, q15_t threshold, uint32_t size) +{ + uint32_t i; + for (i = 0; i < size; i++) + { + // `f(x) = max_value` for `x >= max_value`, + // `f(x) = x` for `threshold <= x < max_value`, + // `f(x) = alpha * (x - threshold)` otherwise. + + if(data[i] > max) + data[i] = max; + if (data[i] < threshold) + data[i] = (data[i] - threshold) * negative_slope / 128; + } +} + +// ARM's CMSIS implementation. 
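+ // note: the helper below approximates sigmoid/tanh by indexing a q15 lookup table
+ // with the high bits of the input and linearly interpolating between the two
+ // neighbouring entries; int_width is the number of integer bits of the input
+ // Q format. A minimal usage sketch (buffer name and Q format are only assumptions):
+ //   q15_t buf[64];                  // activations stored as q3.12 -> int_width = 3
+ //   local_sigmoid_q15(buf, 64, 3);  // in-place table-based sigmoid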
+static void local_activation_q15(q15_t * data, uint32_t size, uint16_t int_width, const q15_t*lookup_table) +{ + uint32_t i = size; + q15_t *pIn = data; + q15_t *pOut = data; + uint16_t shift_size = 8 + 3 - int_width; + uint32_t bit_mask = 0x7FF >> int_width; + uint32_t full_frac = bit_mask + 1; + while (i) + { + q15_t out; + q15_t in = *pIn++; + q15_t frac = (uint32_t) in & bit_mask; + q15_t value = lookup_table[(uint8_t)(in >> shift_size)]; + if ((in >> shift_size) != 0x7f) + { + q15_t value2 = lookup_table[(uint8_t)(1 + ((uint8_t)(in >> shift_size)))]; + /* doing the interpolation here for better accuracy */ + out = ((q31_t) (full_frac - frac) * value + (q31_t) value2 * frac) >> shift_size; + } else + { + /* the largest positive value does not have a right side for linear interpolation */ + out = value; + } + *pOut++ = out; + i--; + } +} + +void local_sigmoid_q15(q15_t * data, uint32_t size, uint16_t int_width) +{ + local_activation_q15(data, size, int_width, nnom_sigmoid_table_q15); +} + +void local_tanh_q15(q15_t * data, uint32_t size, uint16_t int_width) +{ + local_activation_q15(data, size, int_width, nnom_tanh_table_q15); +} + +// matrix ops q15 +void local_mult_q15(q15_t *pSrcA, + q15_t *pSrcB, + q15_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t product = pSrcA[i] * pSrcB[i]; + pDst[i] = (q15_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +void local_add_q15(q15_t *pSrcA, + q15_t *pSrcB, + q15_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t sum = pSrcA[i] + pSrcB[i]; + pDst[i] = (q15_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +void local_sub_q15(q15_t *pSrcA, + q15_t *pSrcB, + q15_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t sub = pSrcA[i] - pSrcB[i]; + pDst[i] = (q15_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + + +void local_multiple_add_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src) +{ + uint32_t i, blk; + q31_t sum; + + for (i = 0; i < block_size; i++) + { + sum = 0; + for(blk=0; blk < num_block; blk++) + sum += p_src[blk][i]; + p_dst[i] = (q15_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +void local_multiple_mult_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src) +{ + uint32_t i, blk; + q63_t product; + + for (i = 0; i < block_size; i++) + { + product = 1; + for(blk=0; blk < num_block; blk++) + product *= p_src[blk][i]; + p_dst[i] = (q15_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +void local_multiple_sub_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src) +{ + uint32_t i, blk; + q31_t sub; + + for (i = 0; i < block_size; i++) + { + sub = p_src[0][i]; + for(blk=1; blk < num_block; blk++) + sub -= p_src[blk][i]; + p_dst[i] = (q15_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +// y = 1 - x +void local_1_minor_z_q15(q15_t* src, q15_t*des, uint16_t dec_bit, uint32_t size) +{ + int32_t one = (1 << dec_bit)-1; + for(int i=0; i +#include +#include +#include +#include "nnom.h" + +const char default_layer_names[][12] = DEFUALT_LAYER_NAMES; +const char default_activation_names[][8] = ACTIVATION_NAMES; 
+const char default_cell_names[][8] = DEFUALT_CELL_NAMES; +size_t nnom_memory_taken = 0; + +// local static functions (when libc/dynamic memory interfaces are not avaiable. ) +#ifdef NNOM_USING_STATIC_MEMORY +static uint8_t *nnom_static_buf = NULL; //pointer to static buffer +static size_t nnom_static_buf_size = 0; //static buf size +static size_t nnom_static_buf_curr = 0; +void nnom_set_static_buf(void* buf, size_t size) +{ + nnom_static_buf = buf; + nnom_static_buf_size = size; + nnom_static_buf_curr = 0; +} +void* nnom_malloc(size_t size) +{ + size = nnom_alignto(size, NNOM_ALIGN); + if(size + nnom_static_buf_curr < nnom_static_buf_size) + { + uint8_t* new_block = nnom_static_buf_curr + nnom_static_buf; + nnom_static_buf_curr += size; + return new_block; + } + else + { + if(nnom_static_buf_size == 0) + NNOM_LOG("Please set static memory using 'nnom_set_static_buf()' before calling model created."); + else + NNOM_LOG("No memory! Static buffer size(%d) not big enough, please increase buffer size!", + (uint32_t)nnom_static_buf_size); + return NULL; + } +} +void nnom_free(void* p){;} +#endif // NNOM_USING_STATIC_MEMORY + +void *nnom_mem(size_t size) +{ + size = nnom_alignto(size, NNOM_ALIGN); + void *p = nnom_malloc(size); + if (p) + { + nnom_memory_taken += size; //test + nnom_memset(p, 0, size); + } + return p; +} + +size_t nnom_mem_stat(void) +{ + return nnom_memory_taken; +} + +// get the size of an IO module +static size_t io_mem_size(nnom_layer_io_t *io) +{ + size_t size = 0; + if (io != NULL) + { + while (io) + { + size += tensor_size(io->tensor); + io = io->aux; + } + } + return size; +} + +size_t nnom_alignto(size_t value, uint32_t alignment) +{ + if (value % alignment == 0) + return value; + value += alignment - value % alignment; + return value; +} + +static nnom_layer_t *find_last(nnom_layer_t *layer) +{ + if (layer == NULL) + return NULL; + // iterate every layer until the last one on the list, then return the last instance + while (layer->out->hook.io != NULL) + layer = layer->out->hook.io->owner; + return layer; +} +// input start layer, return layer num +static uint32_t find_index(nnom_layer_t *start, nnom_layer_t *layer) +{ + uint32_t i = 1; + if (start == NULL) + return 0; + // iterate every layer until the last one on the list, then return the index number + while (start->out->hook.io != NULL) + { + i++; + if (layer == start) + return i; + start = start->out->hook.io->owner; + } + return 0; +} + +static nnom_status_t model_add(nnom_model_t *model, nnom_layer_t *layer) +{ + nnom_layer_t *last = NULL; + nnom_layer_t *curr = NULL; + + if (layer == NULL) + { + NNOM_LOG("Error: added a NULL layer, could be no memory while creating layer.\n"); + return NN_NO_MEMORY; + } + + last = find_last(model->head); + curr = layer; + + // when the layer list is empty, the find_last() return model->head. + if (last == NULL) + { + model->head = curr; + } + else + { + // hook the current layer with the last layer. + last->out->hook.io = curr->in; // hook IO + curr->in->hook.io = last->out; + } + return NN_SUCCESS; +} + +// find an available hook on the io module, normally used by output io module. +// input, the output io module that wants to hook on +// output, the new hook that added to the end of the hook list on the io +static nnom_layer_hook_t *allocate_hook(nnom_layer_io_t *io) +{ + nnom_layer_hook_t *hook; + if (io == NULL) + return NULL; + hook = &io->hook; + + // if the primary hook is empty, reture it directly. 
+ if (hook->io == NULL) + { + return hook; + } + else + { + // find the empty place and allocate new hook for us + while (hook->next != NULL) + { + hook = hook->next; + } + hook->next = nnom_mem(sizeof(nnom_layer_hook_t)); + if (hook->next == NULL) + return NULL; + return hook->next; + } +} + +// to check if an io is hooked to other layer +// input the primary io of a layer's input or output +// return, the new io that added to the io list. +static nnom_layer_io_t *allocate_io(nnom_layer_io_t *io) +{ + if (io == NULL) + return NULL; + + // if the io is free to used + if (io->hook.io == NULL) + { + return io; + } + else + { + // find the empty place and allocate new hook for us + while (io->aux != NULL) + { + io = io->aux; + } + io->aux = nnom_mem(sizeof(nnom_layer_io_t)); + if (io->aux == NULL) + return NULL; + // the owner for new io is inherited + io->aux->owner = io->owner; + return io->aux; + } +} + +// hook the current layer to the input layer +// this function only to connect (single output layer) to (single input layer). +static nnom_layer_t *model_hook(nnom_layer_t *curr, nnom_layer_t *last) +{ + nnom_layer_io_t *curr_in_io; + nnom_layer_hook_t *last_io_hook; + + if (last == NULL || curr == NULL) + return NULL; + + // add a new hook to the output io of the last layer + last_io_hook = allocate_hook(last->out); + // add a new input io to the current layer's input list. + curr_in_io = allocate_io(curr->in); + + // manually hook them togeter. + last_io_hook->io = curr_in_io; + curr_in_io->hook.io = last->out; + + return curr; +} + +// merge a few layers using specified method +// num = the number of layer that will be merged +// method = functional layer such as (concat(), mult(), add(), sub()) +static nnom_layer_t *model_mergex(nnom_layer_t *method, int num, ...) +{ + nnom_layer_t *layer_in; + va_list valist; + + if (method == NULL) + return NULL; + + va_start(valist, num); + for (int i = 0; i < num; i++) + { + // get the input layer + layer_in = va_arg(valist, nnom_layer_t *); + model_hook(method, layer_in); + } + va_end(valist); + return method; +} + +// merge 2 input +// this is an older interface +// method = functional layer such as (concat(), mult(), add(), sub()) +static nnom_layer_t *model_merge(nnom_layer_t *method, nnom_layer_t *in1, nnom_layer_t *in2) +{ + return model_mergex(method, 2, in1, in2); +} + +// This api will merge activation to layer's actail to avoid the extra layer for activation +static nnom_layer_t *model_active(nnom_activation_t *act, nnom_layer_t *target) +{ + // simple and easy + target->actail = act; + return target; +} + +// when model=NULL, it create a new sequential model +nnom_model_t *new_model(nnom_model_t *model) +{ + nnom_model_t *m = model; + if (m == NULL) + { + m = nnom_mem(sizeof(nnom_model_t)); + m->is_allocated = true; + } + else + { + nnom_memset(m, 0, sizeof(nnom_model_t)); + m->is_allocated = false; + } + + // set methods + m->add = model_add; + m->hook = model_hook; + m->merge = model_merge; + m->mergex = model_mergex; + m->active = model_active; + + return m; +} + +static void io_tensor_delete(nnom_layer_io_t* io) +{ + while (io) + { + nnom_free(io->tensor); + io = io->aux; + } +} + +// delete all the aux hooks +// delete aux io only, keep the primary io. 
+static void io_list_delete(nnom_layer_io_t *io) +{ + nnom_layer_hook_t *hook, *next_hook; + nnom_layer_io_t *next_io; + while (io) + { + // store the next io + next_io = io->aux; + + // release hooks list first + hook = io->hook.next; + while (hook) + { + next_hook = hook->next; + nnom_free(hook); + hook = next_hook; + } + + // now we can release the aux io itself + // but if this io is the primary input/out of the layer, it will be freed with they layer's instance since they are allocated together. + if (io != io->owner->in && io != io->owner->out) + nnom_free(io); + + // next aux io + io = next_io; + } +} + +// there are 2 type of memory in a layer +// *primary memory* is allocated when a layer instance is created, they are created by layer API (Conv2D()...). +// it includes the layer instance, primary input, primary output, and an optional computational memory buffer instance +// each io module also has one primary hook. +// *secondary memory* are axiliary io modules, axiliary hooks and activations which created by model.xx() APIs (model.hook(), model.active()...) +// it includes the list of aux io modules, the list of aux hooks. +// +// Additionaly, layer's private free method must be called to free layer's private resources +// Such as activation instance passed to Activation() layer, and private memory allcated within Lambda layer. +// +// A layer is consist of a few io modules. primary io are allocated with layers instance. +// each of the io has a few hooks. primary hooks are included in the io module. +// so only "aux" hooks and ios need to be freed separately. +static void layer_delete(nnom_layer_t *layer) +{ + if (layer == NULL) + return; + + // call private free of the layer + if (layer->free) + layer->free(layer); + + // delete the tensors first. only input layer should delete input + if (layer->type == NNOM_INPUT) + io_tensor_delete(layer->in); + io_tensor_delete(layer->out); + + // release secondary memory on the layers. + // they are io lists and hooks list + io_list_delete(layer->in); + io_list_delete(layer->out); + + // release activations (it takes null too) + nnom_free(layer->actail); + + // release primary memory + nnom_free(layer); + return; +} + +void model_delete(nnom_model_t *m) +{ + nnom_layer_t *layer; + nnom_layer_t *next; + if (m == NULL) + return; + + // uses shortcut list to iterate the model, + // start from head + layer = m->head; + while (layer) + { + // get the next before releasing current + next = layer->shortcut; + // your term + layer_delete(layer); + // who's next! + layer = next; + } + + // free the memory blocks for the network's buffer + nnom_free(m->blocks->blk); + + // free model instance itself + if (m->is_allocated) + nnom_free(m); + else + nnom_memset(m, 0, sizeof(nnom_model_t)); + + nnom_memory_taken = 0; + return; +} + +// find an available memory block. +static nnom_mem_block_t *allocate_block(nnom_mem_block_t *list) +{ + nnom_mem_block_t *free = NULL; + uint32_t idx; + + for (idx = 0; idx < NNOM_BLOCK_NUM; idx++) + { + if (list[idx].owners == 0) + break; + } + if(idx == NNOM_BLOCK_NUM) + { + NNOM_LOG("\nERROR! 
No enough memory block for parallel buffers, please increase the 'NNOM_BLOCK_NUM' in 'nnom_port.h'\n"); + return NULL; + } + + free = &list[idx]; + return free; +} + +static void release_block(nnom_mem_block_t *block) +{ + if (block->owners > 0) + block->owners -= 1; + if (block->owners == 0) + block->state = NNOM_BUF_EMPTY; +} + +static void release_input_mem(nnom_layer_t *layer) +{ + nnom_layer_io_t *in; + // release all input of buf + in = layer->in; + while (in != NULL) + { + release_block(in->mem); + in = in->aux; + } +} +static void release_comp_mem(nnom_layer_t *layer) +{ + // release computational buf if exist + if (layer->comp != NULL) + { + release_block(layer->comp->mem); + } +} + +// return the length of the io lists +size_t nnom_io_length(nnom_layer_io_t *io) +{ + size_t num = 0; + if (io == NULL) + return 0; + while (io != NULL) + { + num++; + io = io->aux; + } + return num; +} + +// return the length of the hook lists +size_t nnom_hook_length(nnom_layer_hook_t *hook) +{ + size_t num = 0; + if (hook == NULL) + return 0; + while (hook != NULL) + { + num++; + hook = hook->next; + } + return num; +} + +// The shortcut version of find_last() method. +// must be used after compiling. +static nnom_layer_t *layer_shortcut_find_last(nnom_layer_t *start) +{ + nnom_layer_t *layer = start; + if (start == NULL) + return NULL; + while (layer->shortcut != NULL) + layer = layer->shortcut; + return layer; +} + +// call while compiling. +// the shorcut is for fast running and fast iliterating. +// simply link every layer as a list. ordered by its runing order +static nnom_status_t layer_shortcut_add(nnom_layer_t *start, nnom_layer_t *curr) +{ + nnom_layer_t *layer = start; + // first one, return + if (start == curr) + { + return NN_SUCCESS; + } + // find the end of the list, and add curr layer to the end of it. + while (layer->shortcut != NULL) + { + // if the layer is already in shortcut list, tell upper. 
+ if (curr == layer) + return NN_ARGUMENT_ERROR; + layer = layer->shortcut; + } + layer->shortcut = curr; + + return NN_SUCCESS; +} + +// input the layer number, +static void print_layer_info(nnom_layer_t *layer, uint32_t layer_count) +{ + size_t in_size = io_mem_size(layer->in); + size_t out_size = io_mem_size(layer->out); + size_t compsize; + size_t mac = layer->stat.macc; + if (layer->comp != NULL) + compsize = layer->comp->size; + else + compsize = 0; + // names + if(layer->type != NNOM_RNN) + NNOM_LOG("#%-3d %-10s - ", layer_count, default_layer_names[layer->type]); + else + { + NNOM_LOG("#%-3d %-3s/", layer_count, default_layer_names[layer->type]); + NNOM_LOG("%-6s - ", default_cell_names[((nnom_rnn_layer_t*)layer)->cell->type]); + } + + // activations + if (layer->actail != NULL) + NNOM_LOG("%-8s - ", default_activation_names[layer->actail->type]); + else + NNOM_LOG(" - "); + + NNOM_LOG("("); + for (int i = 0; i < 3; i++) + { + if (layer->out->tensor->num_dim > i) + NNOM_LOG("%4d,", layer->out->tensor->dim[i]); + else + NNOM_LOG(" "); + } + NNOM_LOG(") "); + + // MAC operation + if(mac == 0) + NNOM_LOG(" "); + else if (mac < 10000) + NNOM_LOG("%7d ", (uint32_t)mac); + else if (mac < 1000*1000) + NNOM_LOG("%6dk ", (uint32_t)(mac/1000)); + else if (mac < 1000*1000*1000) + NNOM_LOG("%3d.%02dM ", (uint32_t)(mac/(1000*1000)), (uint32_t)(mac%(1000*1000)/(10*1000))); // xxx.xx M + else + NNOM_LOG("%3d.%02dG ", (uint32_t)(mac/(1000*1000*1000)), (uint32_t)(mac%(1000*1000*1000)/(10*1000*1000))); // xxx.xx G + + // memory + NNOM_LOG("(%6d,%6d,%6d)", (uint32_t)in_size, (uint32_t)out_size,(uint32_t) compsize); +} + +static void print_memory_block_info(nnom_mem_block_t *block_pool) +{ + // show the memory blocks's lifetime (number of owners) + NNOM_LOG(" "); + for (int i = 0; i < NNOM_BLOCK_NUM; i++) + { + if (i % 4 == 0) + NNOM_LOG(" "); + if (block_pool[i].owners) + NNOM_LOG("%d ", block_pool[i].owners); + else + NNOM_LOG("- "); + } + NNOM_LOG("\n"); +} + +// This is a nested called functions. +// to analyse the topology of the model, calculate the output_shape of each layer and create shortcut lists. +// Nest will happend when a layer have multiple output module or mutiple output hooks. +// This function will return when +// 1) if the layer has multiple input but not all of them are filled by last layers. returns NN_MORE_TODO +// 2) if all the output hooked are nested called. return NN_SUCCESS +// 3) if the layer is output layer. return NN_SUCCESS +nnom_status_t compile_layers(nnom_layer_t* first, nnom_layer_t *curr, nnom_mem_block_t *block_pool, uint32_t *layer_count) +{ + size_t mem_size = 0; + nnom_layer_t *layer = curr; + nnom_layer_io_t *in; + nnom_layer_io_t *out; + nnom_layer_hook_t *hook; + + nnom_mem_block_t *in_blk; + nnom_mem_block_t *out_blk; + + uint32_t local_layer_count = 1; + + if(layer_count == NULL) + layer_count = &local_layer_count; + + in = layer->in; + out = layer->out; + + while (layer) + { + // check input + in = layer->in; + + // check if this layer is the input layer + // the first layer has no input hooked, and the io is not initialized + if (in->hook.io == NULL) + { + // if the input is not initalized + if (in->mem == NULL) + { + in_blk = allocate_block(block_pool); + in_blk->owners += 1; // add 1 + mem_size = nnom_alignto(tensor_size(in->tensor), NNOM_ALIGN); + in_blk->size = mem_size > in_blk->size ? 
mem_size : in_blk->size; + // set the blk to the layer IO + in->mem = in_blk; + in->mem->state = NNOM_BUF_FILLED; //mark input buff filled + } + } + else + { + // get the mem for every input from its hooked output. + while (in != NULL) + { + in->mem = in->hook.io->mem; + in = in->aux; + } + } + + // if there are mutiple inputs, wait utill all blocks filled + in = layer->in; + if (in != NULL && in->aux != NULL) + { + while (in != NULL) + { + // if the mem (of its hooked output) is not allocated or is not filled. + // It not the time to run the layer yet, return and waits for next nested called. + if (in->mem == NULL || in->mem->state != NNOM_BUF_FILLED) + return NN_MORE_TODO; + in = in->aux; + } + } + + // if run to this point, then it is the right time to compile(run) this layer. + // compiling are seperated into the steps below. + // 1. to calculate the output shape. + // 2. to put the current layer to the end of shortcut list. + // 3. allocate computational buffer. + // 4. allocate output buffer for each output module. + // 5.1 if there is only one layer hooked to the output. we dont use nested call, but continue in this big while loop. + // 5.2 nested call the hooked output layers (if there are > 1 hooked to the output of this layer) + + // 1. calculate output shape while all inputs are filled + layer->build(layer); + + // 2. add to shortcut list. + layer_shortcut_add(first, layer); + + // 3. assign for computational buf + if (layer->comp != NULL) + { + layer->comp->mem = allocate_block(block_pool); + layer->comp->mem->owners += 1; // add us to buffer users + layer->comp->mem->state = NNOM_BUF_FILLED; + // record maximum mem size in this block + mem_size = nnom_alignto(layer->comp->size, NNOM_ALIGN); + layer->comp->mem->size = + mem_size > layer->comp->mem->size ? mem_size : layer->comp->mem->size; + } + + // print current layer's info. + // show names, activations, mem block size + print_layer_info(layer, (*layer_count)++); + + // 4. allocate output buffer for each output module. + // check output + if (layer->out == NULL) + return NN_SUCCESS; + + // 5.1 if there is only one layer hooked to the output. we dont use nested call, but continue in this big while loop. + // if the layer is Single Output, continue the loop directly. To reduce nested level + if (layer->out->aux == NULL && layer->out->hook.next == NULL) + { + // single buf layer. + if (layer->in->type == NNOM_TENSOR_BUF_NULL || layer->out->type == NNOM_TENSOR_BUF_NULL) + { + // pass to next layer directly, like we never touch the buffer(dont change life-time) + layer->out->mem = layer->in->mem; + + // print memory before release + print_memory_block_info(block_pool); + // computational buf + release_comp_mem(layer); + } + // not a single buf layer + else + { + // allocate mem block for the output + out_blk = allocate_block(block_pool); + if (out_blk == NULL) + return NN_NO_MEMORY; + // set the life time, only one hooked layer, so the life time is 1 + out_blk->owners = 1; + out_blk->state = NNOM_BUF_FILLED; // marked filled + // record maximum mem size in this block + mem_size = nnom_alignto(tensor_size(layer->out->tensor), NNOM_ALIGN); + out_blk->size = mem_size > out_blk->size ? mem_size : out_blk->size; + // set the blk to the layer IO + layer->out->mem = out_blk; + + // once we allocate for output, we can now release input and comput. 
+ // print memory before release + print_memory_block_info(block_pool); + // release input mem and comp mem + release_input_mem(layer); + release_comp_mem(layer); + } + } + // Multiple output and/or mutiple hooks + else + { + // single buf layer will use the input buf for the first output + if (layer->in->type == NNOM_TENSOR_BUF_NULL || layer->out->type == NNOM_TENSOR_BUF_NULL) + { + // we dont allocate new buf, but use the input + // the ownership will be set to next layer later + layer->out->mem = layer->in->mem; + layer->out->mem->owners += nnom_hook_length(&layer->out->hook); // set the mem lifetime.// test + layer->out->mem->state = NNOM_BUF_FILLED; + + // print memory before release + print_memory_block_info(block_pool); + // release computational buff and input buffer + release_input_mem(layer); + release_comp_mem(layer); + } + // mutiple buf layer. (I/O use different memory blocks) + else + { + // allocate for every output + out = layer->out; + while (out != NULL && out->hook.io != NULL) // the output layer have no output IO + { + // assign new block + out->mem = allocate_block(block_pool); + if (out->mem == NULL) + return NN_NO_MEMORY; + // record maximum mem size in this block + mem_size = nnom_alignto(tensor_size(out->tensor), NNOM_ALIGN); + out->mem->size = mem_size > out->mem->size ? mem_size : out->mem->size; + // keep the block untill the last hooked layer is called. + out->mem->owners = nnom_hook_length(&out->hook); // set lifetime of the buffer = the num of hooked layers + out->mem->state = NNOM_BUF_FILLED; + + out = out->aux; + } + // once we allocate for output, we can now release input and comput (or reduce the lifetime). + // print memory before release + print_memory_block_info(block_pool); + // release input mem and comp mem + release_input_mem(layer); + release_comp_mem(layer); + } + + // 5.12 nested call the hooked output layers (if there are > 1 hooked to the output of this layer) + // while all the out module(s) receive a memory block, it is ready to be sent to other layers. + // iterate all hooked layers in each out module. + out = layer->out; + while (out != NULL) + { + // nested call hooked layer one by one. + hook = &out->hook; + while (hook != NULL && hook->io != NULL) + { + compile_layers(first, hook->io->owner, block_pool, layer_count); + hook = hook->next; + } + + // next io + out = out->aux; + } + + // when all the out is called. this should stop here. + // once enter mutiple output iterating, the function will always return. + // because at least one of the nested called by this function will run till the end of the model. + return NN_SUCCESS; + } + // Multiple output ended. + + // return if this is output layer. + // the output layer's output io is hooked to nothing. + if (layer->out->hook.io == NULL) + return NN_SUCCESS; + + // single output layer, this function continue to analyse next layer. + // switch to next layer. 
+ layer = layer->out->hook.io->owner; + } + + // seems to be redundants + return NN_SUCCESS; +} + +size_t mem_analysis_result(nnom_model_t *m) +{ + uint32_t index; + uint32_t total_mem = 0; + NNOM_LOG("Memory cost by each block:\n "); + // print size of memory blocks + for (index = 0; index < NNOM_BLOCK_NUM; index++) + { + total_mem += m->blocks[index].size; + NNOM_LOG("blk_%d:%d ", index, (uint32_t)(m->blocks[index].size)); + } + // size of total memory cost by networks buffer + NNOM_LOG("\n Memory cost by network buffers: %d bytes\n", total_mem); + return total_mem; +} + +// allocate memory, and set them to each block according to the mem analysis results. +nnom_status_t block_mem_set(nnom_model_t *m, void *buf) +{ + uint32_t index; + uint32_t mem_offset = 0; + + for (index = 0; index < NNOM_BLOCK_NUM; index++) + { + if (m->blocks[index].size == 0) + break; + m->blocks[index].blk = (void *)((uint8_t*)buf + mem_offset); + mem_offset += m->blocks[index].size; + } + return NN_SUCCESS; +} + +// experimental: this function is temporary use to +// assign memory blk which has assigned to input and output to the corresponding tensor +nnom_status_t tensor_mem_set(nnom_model_t *m) +{ + nnom_layer_t *layer = m->head; + nnom_layer_io_t *io; + while (layer) + { + io = layer->in; + while (io) + { + io->tensor->p_data = io->mem->blk; + io = io->aux; + } + + io = layer->out; + while (io) + { + io->tensor->p_data = io->mem->blk; + io = io->aux; + } + + layer = layer->shortcut; + } + + return NN_SUCCESS; +} + +// this function has to be used after memory is assigned to the layers. +// it means it has to be call after compile_model() as well. +// it simply get the output buffer and set the buffer to tailed activation of each layer.. +nnom_status_t set_tailed_activation(nnom_model_t *m) +{ + NNOM_NULL_CHECK(m); + NNOM_NULL_CHECK(m->head); + nnom_layer_t *layer = m->head; + + // if tailed activation is exist, set it to the output. + while (layer) + { + if (layer->actail != NULL) + { + layer->actail->tensor = layer->out->tensor; + } + if (layer->shortcut == NULL) + break; + layer = layer->shortcut; + } + + return NN_SUCCESS; +} + +// get total ops +static uint64_t model_set_ops(nnom_model_t *m) +{ + nnom_layer_t *layer; + uint64_t total_ops = 0; + layer = m->head; + while (layer) + { + total_ops += layer->stat.macc; + if (layer->shortcut == NULL) + break; + layer = layer->shortcut; + } + m->total_ops = total_ops; + return total_ops; +} + +// a compiler can be use for both sequencial / functional model. +// the output layer is optional only when the model is single output model +// in this case, if output = NULL, the compile can find it by its own. 
+nnom_status_t model_compile(nnom_model_t *m, nnom_layer_t *input, nnom_layer_t *output) +{ + size_t buf_size; + uint8_t *buf; + uint32_t layer_num = 1; + uint32_t time = nnom_ms_get(); + + NNOM_NULL_CHECK(m); + NNOM_NULL_CHECK(input); + + m->head = input; + m->tail = output; + if (output == NULL) + m->tail = find_last(input); + + NNOM_LOG("NNoM version %d.%d.%d\n", NNOM_MAJORVERSION, NNOM_SUBVERSION, NNOM_REVISION); + NNOM_LOG("To disable logs, please void the marco 'NNOM_LOG(...)' in 'nnom_port.h'.\n"); + #ifdef NNOM_USING_CHW + NNOM_LOG("Data format: Channel first (CHW)\n"); + #else + NNOM_LOG("Data format: Channel last (HWC)\n"); + #endif + #ifdef NNOM_USING_CMSIS_NN + NNOM_LOG("Backend optimization: CMSIS-NN\n"); + #endif + #ifdef NNOM_USING_STATIC_MEMORY + NNOM_LOG("Static memory size set to: %d\n", (uint32_t)nnom_static_buf_size); + #endif + NNOM_LOG("Start compiling model...\n"); + NNOM_LOG("Layer(#) Activation output shape ops(MAC) mem(in, out, buf) mem blk lifetime\n"); + NNOM_LOG("-------------------------------------------------------------------------------------------------\n"); + + // compile layers, started from list head, nested run till the end of models + compile_layers(m->head, m->head, m->blocks, &layer_num); + + NNOM_LOG("-------------------------------------------------------------------------------------------------\n"); + + // if model's tail is not the last layer which built by user. + if (output->type != NNOM_OUTPUT) + NNOM_LOG("WARNING: the last layer '%s' is not the Output Layer, please check carefully.\n", + default_layer_names[output->type]); + + // get the total (aligned) memory requirement + buf_size = mem_analysis_result(m); + + // allocate one big memory block + buf = nnom_mem(buf_size); + if (buf == NULL) + { + NNOM_LOG("ERROR: No enough memory for network buffer, required %d bytes\n", (uint32_t)buf_size); + return NN_NO_MEMORY; + } + // all memory cost + NNOM_LOG(" Total memory occupied: %d bytes\n", (uint32_t)nnom_memory_taken); + + // split the memory for every memory block + block_mem_set(m, buf); + + // experimental: set memory from io to the io tensor + tensor_mem_set(m); + + // finally set the output buff to tailed activation on each layer + set_tailed_activation(m); + + // calculate the total operations and set it to the model + model_set_ops(m); + + // print the time. + if(nnom_ms_get()) + NNOM_LOG("Compling done in %d ms\n", nnom_ms_get() - time); + + return NN_SUCCESS; +} + +// This is a simplified API for compile models with sequencial model only +// this does not require specified Input / Output layers +nnom_status_t sequencial_compile(nnom_model_t *m) +{ + nnom_layer_t *input, *output; + input = m->head; + output = find_last(input); + return model_compile(m, input, output); +} + +// run that layer +nnom_status_t layer_run(nnom_layer_t *layer) +{ + nnom_status_t result; + uint32_t start; + NNOM_NULL_CHECK(layer); + + // start + start = nnom_us_get(); + // run main layer first + result = layer->run(layer); + // run tailed-activation if it is presented + if (layer->actail != NULL) + { + layer->actail->run(layer->actail); + } + // done + layer->stat.time = nnom_us_get() - start; + return result; +} + +// run the model, until the end_layer. If end_layer == NULL, run all layers. 
+nnom_status_t model_run_to(nnom_model_t *m, nnom_layer_t *end_layer) +{ + uint32_t layer_num = 1; + nnom_status_t result; + nnom_layer_t *layer; + NNOM_NULL_CHECK(m); + NNOM_NULL_CHECK(m->head); + + layer = m->head; + + // using shortcut run + while (layer) + { + // run layer + result = layer_run(layer); + if (result != NN_SUCCESS) + { + NNOM_LOG("Error: #%d %s layer return error code:%d\n", layer_num, default_layer_names[layer->type], result); + return result; + } + // run callback + if(m->layer_callback != NULL) + { + result = m->layer_callback(m, layer); + if (result != NN_SUCCESS) + { + NNOM_LOG("Error: Callback return error code %d at #%d %s layer\n", result, layer_num, default_layer_names[layer->type]); + return result; + } + } + // check if finished + if (layer == end_layer || layer->shortcut == NULL) + break; + layer = layer->shortcut; + layer_num++; + } + + return NN_SUCCESS; +} + +// run all layers. +nnom_status_t model_run(nnom_model_t *m) +{ + return model_run_to(m, NULL); +} + +// callback, called after each layer has finished the calculation. +nnom_status_t model_set_callback(nnom_model_t *m, nnom_status_t (*layer_callback)(nnom_model_t *m, nnom_layer_t *layer)) +{ + if(m->layer_callback != NULL && m->layer_callback != layer_callback) + return NN_LENGTH_ERROR; + + m->layer_callback = layer_callback; + return NN_SUCCESS; +} + +// delete callback. +void model_delete_callback(nnom_model_t *m) +{ + m->layer_callback = NULL; +} + +nnom_status_t check_model_version(unsigned long model_version) +{ + nnom_status_t result = NN_SUCCESS; + int32_t major, sub, rev; + major = model_version/10000; + sub = (model_version/100)%100; + rev = model_version % 100; + if(model_version != NNOM_VERSION) + { + NNOM_LOG("WARNING: model version %d.%d.%d dosen't match nnom version!\n", major, sub, rev); + result = -NN_ARGUMENT_ERROR; + } + else + { + NNOM_LOG("Model version: %d.%d.%d\n", major, sub, rev); + } + return result; +} + + diff --git a/APP_Framework/Framework/knowing/nnom/src/core/nnom_layers.c b/APP_Framework/Framework/knowing/nnom/src/core/nnom_layers.c new file mode 100644 index 000000000..dc059074a --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/core/nnom_layers.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" + +size_t shape_size(nnom_3d_shape_t *s) +{ + if (s == NULL) + return 0; + return s->h * s->w * s->c; +} + +nnom_3d_shape_t shape(size_t h, size_t w, size_t c) +{ + nnom_3d_shape_t s; + s.h = h; + s.w = w; + s.c = c; + return s; +} +nnom_3d_shape_t kernel(size_t h, size_t w) +{ + return shape(h, w, 1); +} +nnom_3d_shape_t stride(size_t h, size_t w) +{ + return shape(h, w, 1); +} +nnom_3d_shape_t dilation(size_t h, size_t w) +{ + return shape(h, w, 1); +} + +nnom_border_t border(size_t top, size_t bottom, size_t left, size_t right) +{ + nnom_border_t b; + b.top = top; + b.bottom = bottom; + b.left = left; + b.right = right; + return b; +} + +// this function has to be used while assign a io for a layer. +// because the io needs to know who is its owner. 
+nnom_layer_io_t *io_init(void *owner_layer, nnom_layer_io_t *io) +{ + io->owner = (nnom_layer_t *)owner_layer; + return io; +} + +// this function is to add a new IO to current inited IO +// input, the targeted IO that the new IO will be added to +// output , the new IO +nnom_layer_io_t *io_add_aux(nnom_layer_io_t *targeted_io) +{ + nnom_layer_io_t *new_io; + // check if the targeted io is inited, and its aux = NULL + if (targeted_io == NULL || targeted_io->owner == NULL || targeted_io->aux != NULL) + return NULL; + // create new io, init it + new_io = nnom_mem(sizeof(nnom_layer_io_t)); + if (new_io == NULL) + return NULL; + // add to aux + targeted_io->aux = new_io; + return io_init(targeted_io->owner, new_io); +} diff --git a/APP_Framework/Framework/knowing/nnom/src/core/nnom_tensor.c b/APP_Framework/Framework/knowing/nnom/src/core/nnom_tensor.c new file mode 100644 index 000000000..55b3984ca --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/core/nnom_tensor.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + * 2019-02-14 Jianjia Ma Add layer.free() method. + */ + +#include +#include +#include +#include +#include "nnom.h" +#include "nnom_tensor.h" + + // tensor size +size_t tensor_size(nnom_tensor_t* t) +{ + size_t size = 0; + if (t != NULL) + { + size = t->dim[0]; + for (int i = 1; i < t->num_dim; i++) + size *= t->dim[i]; + } + return size; +} +size_t tensor_size_byte(nnom_tensor_t* t) +{ + return tensor_size(t)*t->bitwidth/8; +} + + +size_t tensor_get_num_channel(nnom_tensor_t* t) +{ + // this will need to be changed to support batch. +#ifdef NNOM_USING_CHW + // channel first + //return t->dim[0]; + return t->dim[t->num_dim -1]; // we are always using hwc to describe even our data is in CHW +#else + // channel last + return t->dim[t->num_dim -1]; +#endif +} + +// initialise/create new tensor +nnom_tensor_t* new_tensor(nnom_qtype_t type, uint32_t num_dim, uint32_t num_channel) +{ + nnom_tensor_t* t = NULL; + uint32_t q_len; + if(type == NNOM_QTYPE_PER_AXIS) + { + q_len = num_channel; + } + else if (type == NNOM_QTYPE_PER_TENSOR) + { + q_len = 1; + } + else + { + NNOM_LOG("ERROR: tensor type not specified\n"); + return NULL; + } + + t = nnom_mem(nnom_alignto(sizeof(nnom_tensor_t), NNOM_ALIGN) + + nnom_alignto(num_dim*sizeof(nnom_shape_data_t),sizeof(nnom_qformat_param_t)) + + q_len*sizeof(nnom_qformat_param_t)*2); + if(t == NULL) + return t; + t->dim = (nnom_shape_data_t*)((uint8_t*)t + sizeof(nnom_tensor_t)); // should add alignment + t->q_dec = (nnom_qformat_param_t*)((uint8_t*)t->dim + nnom_alignto(num_dim*sizeof(nnom_shape_data_t),sizeof(nnom_qformat_param_t))); + t->q_offset = (nnom_qformat_param_t*)((uint8_t*)t->q_dec + q_len*sizeof(nnom_qformat_param_t)); + t->num_dim = num_dim; + t->qtype = type; + + return t; +} + +void delete_tensor(nnom_tensor_t* t) +{ + if (t) + nnom_free(t); +} + +// set tensor by value +// for tensor with quantized type NNOM_QTYPE_PER_TENSOR +nnom_tensor_t* tensor_set_attr_v(nnom_tensor_t* t, + nnom_qformat_param_t dec_bit, nnom_qformat_param_t offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth) +{ + // copy dim + t->num_dim = num_dim; + nnom_memcpy(t->dim, dim, sizeof(nnom_shape_data_t) * num_dim); + + // bitwidth + t->bitwidth = bitwidth; + // copy the offset and q format + *(t->q_dec) = dec_bit; + *(t->q_offset) = offset; + return t; +} + + +// set tensor by 
pointer +// for tensor with quantized type NNOM_QTYPE_PER_AXIS +nnom_tensor_t* tensor_set_attr(nnom_tensor_t* t, + nnom_qformat_param_t*dec_bit, nnom_qformat_param_t *offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth) +{ + size_t size; + + // copy dim + t->num_dim = num_dim; + nnom_memcpy(t->dim, dim, sizeof(nnom_shape_data_t) * num_dim); + + // get the q format data size + if(t->qtype == NNOM_QTYPE_PER_AXIS) + size = sizeof(nnom_qformat_param_t) * tensor_get_num_channel(t); + else + size = sizeof(nnom_qformat_param_t); + + // bitwidth + t->bitwidth = bitwidth; + // copy the offset and q format + nnom_memcpy(t->q_dec, dec_bit, size); + nnom_memcpy(t->q_offset, offset, size); + return t; +} + +// this method copy the attributes of a tensor to a new tensor +// before that, src and des tensor must already have QTYPE and NUM_OF_DIM set. +// Note, the tensors must have the same lenght. this method wont cpy the memory pointer data (we will assign memory later after building) +nnom_tensor_t* tensor_cpy_attr(nnom_tensor_t* des, nnom_tensor_t* src) +{ + size_t size; + if(src->qtype != des->qtype || src->num_dim != des->num_dim) + return NULL; + + if(src->qtype == NNOM_QTYPE_PER_AXIS) + size = sizeof(nnom_qformat_param_t) * tensor_get_num_channel(src); + else + size = sizeof(nnom_qformat_param_t); + + // bit + des->bitwidth = src->bitwidth; + // copy quantisation parameters + nnom_memcpy(des->q_dec, src->q_dec, size); + nnom_memcpy(des->q_offset, src->q_offset, size); + + // copy number of dimension + des->num_dim = src->num_dim; + nnom_memcpy(des->dim, src->dim, src->num_dim * sizeof(nnom_shape_data_t)); + return des; +} + +// change format from CHW to HWC +// the shape of the data, input data, output data +void tensor_hwc2chw_q7(nnom_tensor_t* des, nnom_tensor_t* src) +{ + q7_t* p_out = des->p_data; + q7_t* p_in = src->p_data; + + for (int c = 0; c < src->dim[2]; c++) + { + for (int h = 0; h < src->dim[0]; h++) + { + for (int w = 0; w < src->dim[1]; w++) + { + *p_out = p_in[(h * src->dim[1] + w) * src->dim[2] + c]; + p_out++; + } + } + } +} + + +// only support 3d tensor +// change format from CHW to HWC +void tensor_chw2hwc_q7(nnom_tensor_t* des, nnom_tensor_t* src) +{ + q7_t* p_out = des->p_data; + q7_t* p_in = src->p_data; + int im_size; + int h_step; + + im_size = src->dim[0] * src->dim[1]; // H*W + + for (int h = 0; h < src->dim[0]; h++) + { + h_step = src->dim[1] * h; + for (int w = 0; w < src->dim[1]; w++) + { + for (int c = 0; c < src->dim[2]; c++) + { + *p_out = p_in[im_size * c + h_step + w]; + p_out++; + } + } + } + +} + +// (deprecated by tensor_hwc2chw version) +// change format from CHW to HWC +// the shape of the data, input data, output data +void hwc2chw_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out) +{ + for (int c = 0; c < shape.c; c++) + { + for (int h = 0; h < shape.h; h++) + { + for (int w = 0; w < shape.w; w++) + { + *p_out = p_in[(h * shape.w + w) * shape.c + c]; + p_out++; + } + } + } +} + +// (deprecated) +// change format from CHW to HWC +// the shape of the data, input data, output data +void chw2hwc_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out) +{ + int im_size = shape.w * shape.h; + int h_step; + + for (int h = 0; h < shape.h; h++) + { + h_step = shape.w * h; + for (int w = 0; w < shape.w; w++) + { + for (int c = 0; c < shape.c; c++) + { + *p_out = p_in[im_size * c + h_step + w]; + p_out++; + } + } + } +} diff --git a/APP_Framework/Framework/knowing/nnom/src/core/nnom_utils.c b/APP_Framework/Framework/knowing/nnom/src/core/nnom_utils.c new 
file mode 100644 index 000000000..3b13c3551 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/core/nnom_utils.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + */ + +#include +#include +#include +#include +#include "nnom.h" +#include "nnom_utils.h" + +static nnom_predict_t *_predict_create_instance(nnom_model_t *m, size_t label_num, size_t top_k_size) +{ + nnom_predict_t *pre; + // allocate memory + pre = (nnom_predict_t *)nnom_malloc(sizeof(nnom_predict_t)); + if(pre == NULL) + return NULL; + pre->top_k = (uint32_t *)nnom_malloc(top_k_size * sizeof(uint32_t)); + pre->confusion_mat = (uint16_t *)nnom_malloc(label_num * label_num * sizeof(uint16_t)); + if(pre->top_k == NULL || pre->confusion_mat == NULL) + { + nnom_free(pre->top_k); nnom_free(pre->confusion_mat); nnom_free(pre); + return NULL; + } + nnom_memset(pre->top_k, 0, top_k_size * sizeof(uint32_t)); + nnom_memset(pre->confusion_mat, 0, label_num * label_num * sizeof(uint16_t)); + + // config + pre->label_num = label_num; + pre->top_k_size = top_k_size; + pre->predict_count = 0; + + // run + pre->model = m; + pre->t_run_total = 0; // model running time in total + pre->t_predict_start = 0; // when it is initial + pre->t_predict_total = 0; // total time of the whole test + + return pre; +} + +static void _predict_delete_instance(nnom_predict_t *pre) +{ + if(pre == NULL) + return; + nnom_free(pre->top_k); + nnom_free(pre->confusion_mat); + nnom_free(pre); +} + +// create a prediction +// input model, the buf pointer to the softwmax output (Temporary, this can be extract from model) +// the size of softmax output (the num of lable) +// the top k that wants to record. +nnom_predict_t *prediction_create(nnom_model_t *m, int8_t *buf_prediction, size_t label_num, size_t top_k_size) +{ + nnom_predict_t *pre = _predict_create_instance(m, label_num, top_k_size); + if (!pre) + return NULL; + if (!m) + { + _predict_delete_instance(pre); + return NULL; + } + + // set the output buffer of model to the prediction instance + pre->buf_prediction = buf_prediction; + + // mark start time. + pre->t_predict_start = nnom_ms_get(); + + return pre; +} + +// after a new data is set in input +// feed data to prediction +// input the current label, (range from 0 to total number of label -1) +// (the current input data should be set by user manully to the input buffer of the model.) +nnom_status_t prediction_run(nnom_predict_t *pre, uint32_t true_label, uint32_t*predict_label, float* prob) +{ + int max_val; + int max_index; + uint32_t true_ranking = 0; + uint32_t start; + uint32_t sum = 0; + + if (!pre) + return NN_ARGUMENT_ERROR; + + // now run model + start = nnom_ms_get(); + model_run(pre->model); + pre->t_run_total += nnom_ms_get() - start; + + // only draw matrix and top k when number of label > 1 + if (pre->label_num > 1) + { + // find how many prediction is bigger than the ground true. + // Raning rules, same as tensorflow. however, predictions in MCU is more frequencly to have equal probability since it is using fixed-point. + // if ranking is 1, 2, =2(true), 4, 5, 6. the result will be top 3. + // if ranking is 1, 2(true), =2, 4, 5, 6. the result will be top 2. + // find the ranking of the prediced label. 
+ for (uint32_t j = 0; j < pre->label_num; j++) + { + if (j == true_label) + continue; + if (pre->buf_prediction[true_label] < pre->buf_prediction[j]) + true_ranking++; + // while value[label] = value[j]. only when label > j, label is the second of j + else if (pre->buf_prediction[true_label] == pre->buf_prediction[j] && j < true_label) + true_ranking++; + } + + if (true_ranking < pre->top_k_size) + pre->top_k[true_ranking]++; + + // Find top 1 and return the current prediction. + // If there are several maximum prediction, return the first one. + max_val = pre->buf_prediction[0]; + max_index = 0; + for (uint32_t j = 1; j < pre->label_num; j++) + { + if (pre->buf_prediction[j] > max_val) + { + max_val = pre->buf_prediction[j]; + max_index = j; + } + sum += pre->buf_prediction[j]; + } + // result + if (max_val != 0) + *prob = (float)max_val / 127.f; + else + *prob = 0; + *predict_label = max_index; + + // fill confusion matrix + pre->confusion_mat[true_label * pre->label_num + max_index] += 1; + } + // only one neural as output. + else + { + *prob = (float)pre->buf_prediction[0] / 127.f; + if (*prob >= 0.5f) + *predict_label = 1; + else + *predict_label = 0; + } + + // prediction count + pre->predict_count++; + + // return the prediction + return NN_SUCCESS; +} + +void prediction_end(nnom_predict_t *pre) +{ + if (!pre) + return; + pre->t_predict_total = nnom_ms_get() - pre->t_predict_start; +} + +void prediction_delete(nnom_predict_t *pre) +{ + _predict_delete_instance(pre); +} + +void prediction_matrix(nnom_predict_t *pre) +{ + if (!pre) + return; + // print titles + NNOM_LOG("\nConfusion matrix:\n"); + NNOM_LOG("predict"); + for (int i = 0; i < pre->label_num; i++) + { + NNOM_LOG("%6d", i); + } + NNOM_LOG("\n"); + NNOM_LOG("actual\n"); + // print the matrix + for (int i = 0; i < pre->label_num; i++) + { + uint32_t row_total = 0; + + NNOM_LOG(" %3d | ", i); + for (int j = 0; j < pre->label_num; j++) + { + row_total += pre->confusion_mat[i * pre->label_num + j]; + NNOM_LOG("%6d", pre->confusion_mat[i * pre->label_num + j]); + } + NNOM_LOG(" |%4d%%\n", pre->confusion_mat[i * pre->label_num + i] * 100 / row_total); + row_total = 0; + } + NNOM_LOG("\n"); +} + +// top-k +void prediction_top_k(nnom_predict_t *pre) +{ + uint32_t top = 0; + if (!pre) + return; + + for (int i = 0; i < pre->top_k_size; i++) + { + top += pre->top_k[i]; + if (top != pre->predict_count) + NNOM_LOG("Top %d Accuracy: %d.%02d%% \n", i + 1, (top * 100) / pre->predict_count, + ((top * 100 * 100) / pre->predict_count)%100); + else + NNOM_LOG("Top %d Accuracy: 100%% \n", i + 1); + } +} + +// this function is to print sumarry +void prediction_summary(nnom_predict_t *pre) +{ + if (!pre) + return; + // sumamry + NNOM_LOG("\nPrediction summary:\n"); + NNOM_LOG("Test frames: %d\n", pre->predict_count); + NNOM_LOG("Test running time: %d sec\n", pre->t_predict_total / 1000); + NNOM_LOG("Model running time: %d ms\n", pre->t_run_total); + if(pre->predict_count !=0) + NNOM_LOG("Average prediction time: %d us\n", (pre->t_run_total * 1000) / pre->predict_count); + if(pre->t_run_total != 0) + NNOM_LOG("Average effeciency: %d.%02d ops/us\n", (int)(((uint64_t)pre->model->total_ops * pre->predict_count) / (pre->t_run_total * 1000)), + (int)(((uint64_t)pre->model->total_ops * pre->predict_count)*100 / (pre->t_run_total * 1000))%100); + if(pre->t_run_total !=0 && pre->predict_count !=0) + NNOM_LOG("Average frame rate: %d.%d Hz\n", 1000 / (pre->t_run_total / pre->predict_count), + (1000*10 / (pre->t_run_total / pre->predict_count))%10); + + // 
only valid for multiple labels + if(pre->label_num > 1) + { + // print top-k + prediction_top_k(pre); + + // print confusion matrix + prediction_matrix(pre); + } +} + +// stand alone prediction API +// this api test one set of data, return the prediction +nnom_status_t nnom_predict(nnom_model_t *m, uint32_t *label, float *prob) +{ + int32_t max_val, max_index, sum; + int8_t *output; + + if (!m) + return NN_ARGUMENT_ERROR; + + model_run(m); + + // get the output memory + output = m->tail->out->tensor->p_data; + + // multiple neural output + if (tensor_size(m->tail->out->tensor) > 1) + { + // Top 1 + max_val = output[0]; + max_index = 0; + sum = max_val; + for (uint32_t i = 1; i < tensor_size(m->tail->out->tensor); i++) + { + if (output[i] > max_val) + { + max_val = output[i]; + max_index = i; + } + sum += output[i]; + } + // send results + *label = max_index; + if(max_val !=0) + *prob = (float)max_val/127.f; + else + *prob = 0; + } + // single neural output + else + { + *prob = (float)output[0] / 127.f; + if (*prob >= 0.5f) + *label = 1; + else + *label = 0; + } + + return NN_SUCCESS; +} + +static void layer_stat(nnom_layer_t *layer) +{ + // layer stat + if(layer->type != NNOM_RNN) + NNOM_LOG("%-10s - ", default_layer_names[layer->type]); + else + { + NNOM_LOG("%-3s/", default_layer_names[layer->type]); + NNOM_LOG("%-6s - ", default_cell_names[((nnom_rnn_layer_t*)layer)->cell->type]); + } + NNOM_LOG(" %8d ", layer->stat.time); + + // MAC operation + if(layer->stat.macc == 0) + NNOM_LOG(" "); + else if (layer->stat.macc < 10000) + NNOM_LOG("%7d ", (uint32_t)layer->stat.macc); + else if (layer->stat.macc < 1000*1000) + NNOM_LOG("%6dk ", (uint32_t)(layer->stat.macc/1000)); + else if (layer->stat.macc < 1000*1000*1000) + NNOM_LOG("%3d.%02dM ", (uint32_t)(layer->stat.macc/(1000*1000)), (uint32_t)(layer->stat.macc%(1000*1000)/(10*1000))); // xxx.xx M + else + NNOM_LOG("%3d.%02dG ", (uint32_t)(layer->stat.macc/(1000*1000*1000)), (uint32_t)(layer->stat.macc%(1000*1000*1000)/(10*1000*1000))); // xxx.xx G + + // layer efficiency + if (layer->stat.macc != 0 && layer->stat.time != 0) + NNOM_LOG("%d.%02d\n", (uint32_t)(layer->stat.macc / layer->stat.time), (uint32_t)((layer->stat.macc * 100) / (layer->stat.time) % 100)); + else + NNOM_LOG("\n"); +} + +void model_stat(nnom_model_t *m) +{ + size_t total_ops = 0; + size_t total_time = 0; + nnom_layer_t *layer; + uint32_t run_num = 0; + + if (!m) + return; + + layer = m->head; + + NNOM_LOG("\nPrint running stat..\n"); + NNOM_LOG("Layer(#) - Time(us) ops(MACs) ops/us \n"); + NNOM_LOG("--------------------------------------------------------\n"); + while (layer) + { + run_num++; + NNOM_LOG("#%-3d", run_num); + total_ops += layer->stat.macc; + total_time += layer->stat.time; + layer_stat(layer); + if (layer->shortcut == NULL) + break; + layer = layer->shortcut; + } + NNOM_LOG("\nSummary:\n"); + NNOM_LOG("Total ops (MAC): %d", (uint32_t)(total_ops)); + NNOM_LOG("(%d.%02dM)\n", (uint32_t) (total_ops/(1000*1000)), (uint32_t)(total_ops%(1000*1000)/(10000))); + NNOM_LOG("Prediction time :%dus\n", (uint32_t)total_time); + if(total_time != 0) + NNOM_LOG("Efficiency %d.%02d ops/us\n", + (uint32_t)(total_ops / total_time), + (uint32_t)((total_ops * 100) / (total_time) % 100)); + + NNOM_LOG("Total memory:%d\n", (uint32_t)nnom_mem_stat()); +} + +void model_io_format(nnom_model_t *m) +{ + nnom_layer_t *layer; + uint32_t run_num = 0; + + if (!m) + return; + + layer = m->head; + + NNOM_LOG("\nPrint layer input/output..\n"); + NNOM_LOG("Layer(#) - Input(Qnm) Output(Qnm) 
Oshape \n"); + NNOM_LOG("----------------------------------------------------------\n"); + while (layer) + { + run_num++; + NNOM_LOG("#%-3d", run_num); + if(layer->type != NNOM_RNN) + NNOM_LOG("%-10s - ", default_layer_names[layer->type]); + else + { + NNOM_LOG("%-3s/", default_layer_names[layer->type]); + NNOM_LOG("%-6s - ", default_cell_names[((nnom_rnn_layer_t*)layer)->cell->type]); + } + NNOM_LOG(" %2d.%2d", 7-layer->in->tensor->q_dec[0], layer->in->tensor->q_dec[0]); + NNOM_LOG(" %2d.%2d", 7-layer->out->tensor->q_dec[0], layer->out->tensor->q_dec[0]); + NNOM_LOG(" ("); + for (int i = 0; i < 3; i++) + { + if (layer->out->tensor->num_dim > i) + NNOM_LOG("%4d,", layer->out->tensor->dim[i]); + else + NNOM_LOG(" "); + } + NNOM_LOG(")\n"); + + if (layer->shortcut == NULL) + break; + layer = layer->shortcut; + } + +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_activation.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_activation.c new file mode 100644 index 000000000..c90171c77 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_activation.c @@ -0,0 +1,369 @@ + + +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_activation.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *Activation(nnom_activation_t *act) +{ + nnom_activation_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_activation_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_activation_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_ACTIVATION; + layer->super.run = activation_run; + layer->super.build = default_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; // when a layer's io is set to NULL, both will point to same mem. + // put in & out on the layer. 
+ layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + // set activation to layer + layer->act = act; + + // set free method + layer->super.free = activation_free; + + return (nnom_layer_t *)layer; +} + +nnom_layer_t *ReLU(void) +{ + nnom_layer_t *layer = Activation(act_relu()); + if (layer == NULL) + return NULL; + + // set type in layer parent + layer->type = NNOM_RELU; + return layer; +} + +nnom_layer_t *LeakyReLU(float alpha) +{ + nnom_layer_t *layer = Activation(act_leaky_relu(alpha)); + if (layer == NULL) + return NULL; + + // set type in layer parent + layer->type = NNOM_LEAKY_RELU; + return layer; +} + +nnom_layer_t *AdvReLU(float alpha, float max, float threshold) +{ + nnom_layer_t *layer = Activation(act_adv_relu(alpha, max, threshold)); + if (layer == NULL) + return NULL; + + // set type in layer parent + layer->type = NNOM_ADV_RELU; + return layer; +} + +nnom_layer_t *Sigmoid(int32_t dec_bit) +{ + nnom_layer_t *layer = Activation(act_sigmoid(dec_bit)); + if (layer == NULL) + return NULL; + + // set type in layer parent + layer->type = NNOM_SIGMOID; + return layer; +} + +nnom_layer_t *TanH(int32_t dec_bit) +{ + nnom_layer_t *layer = Activation(act_tanh(dec_bit)); + if (layer == NULL) + return NULL; + // set type in layer parent + layer->type = NNOM_TANH; + return layer; +} + +void act_delete(nnom_activation_t* act){ + nnom_free(act); +} + +// activation takes act instance which is created. therefore, it must be free when activation is deleted. +// this is the callback in layer->free +nnom_status_t activation_free(nnom_layer_t *layer) +{ + if(layer) + act_delete(((nnom_activation_layer_t *)layer)->act); + return NN_SUCCESS; +} + +nnom_status_t activation_run(nnom_layer_t *layer) +{ + nnom_activation_layer_t *cl = (nnom_activation_layer_t *)layer; + return act_tensor_run(cl->act, layer->in->tensor); +} + +// porting +static nnom_status_t relu_run(nnom_activation_t* act) +{ + if(act->tensor->bitwidth == 16) + { + #ifdef NNOM_USING_CMSIS_NN + arm_relu_q15(act->tensor->p_data, tensor_size(act->tensor)); + #else + local_relu_q15(act->tensor->p_data, tensor_size(act->tensor)); + #endif + } + else + { + #ifdef NNOM_USING_CMSIS_NN + arm_relu_q7(act->tensor->p_data, tensor_size(act->tensor)); + #else + local_relu_q7(act->tensor->p_data, tensor_size(act->tensor)); + #endif + } + return NN_SUCCESS; +} + +// leaky relu +static nnom_status_t leaky_relu_run(nnom_activation_t* act) +{ + nnom_activation_leaky_relu_t* a = (nnom_activation_leaky_relu_t*) act; + if(act->tensor->bitwidth == 16) + local_leaky_relu_q15(act->tensor->p_data, a->alpha, tensor_size(act->tensor)); + else + local_leaky_relu_q7(act->tensor->p_data, a->alpha, tensor_size(act->tensor)); + return NN_SUCCESS; +} + +// advance relu +static nnom_status_t adv_relu_run(nnom_activation_t* act) +{ + nnom_activation_adv_relu_t* a = (nnom_activation_adv_relu_t*) act; + + // we need to convert float to fixpoint in runtime where we can know the tensor's q format + if(act->tensor->bitwidth == 16) + { + q15_t max = 32767; + q15_t threshold = MIN(a->threshold * (1 << (15 - act->tensor->q_dec[0])), 32767); + q7_t max_scale = (1 << (15 - act->tensor->q_dec[0])); + if(a->max != INFINITY && a->max != 0x7fc00000) + if(a->max * max_scale < max) + max = a->max * max_scale; + local_adv_relu_q15(act->tensor->p_data, a->negative_slope, max, threshold, tensor_size(act->tensor)); + } + // 8bit + else + { + q7_t max = 127; + q7_t threshold = MIN(a->threshold * (1 << (7 - act->tensor->q_dec[0])), 127); + q7_t max_scale = (1 
<< (7 - act->tensor->q_dec[0])); + if(a->max != INFINITY && a->max != 0x7fc00000) // QNAN 0x7fc00000 also represent infinity in script 0.4.1 + if(a->max * max_scale < max) + max = a->max * max_scale; + local_adv_relu_q7(act->tensor->p_data, a->negative_slope, max, threshold, tensor_size(act->tensor)); + } + + return NN_SUCCESS; +} + +static nnom_status_t tanh_run(nnom_activation_t* act) +{ + nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act; + // 16 bit + if(act->tensor->bitwidth == 16) + { + uint8_t int_bit = 15 - a->dec_bit; + #ifdef NNOM_USING_CMSIS_NN + arm_nn_activations_direct_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH); + #else + local_tanh_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit); + #endif + } + else // 8bit + { + uint8_t int_bit = 7 - a->dec_bit; + // arm version cannot handle int_bit > 3 + #ifdef NNOM_USING_CMSIS_NN + if(act->tensor->q_dec[0] <= 3) + arm_nn_activations_direct_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH); + else + #endif + local_tanh_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit); + } + return NN_SUCCESS; +} + +static nnom_status_t sigmoid_run( nnom_activation_t* act) +{ + nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act; + // 16 bit + if(act->tensor->bitwidth == 16) + { + uint8_t int_bit = 15 - a->dec_bit; + #ifdef NNOM_USING_CMSIS_NN + arm_nn_activations_direct_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_SIGMOID); + #else + local_sigmoid_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit); + #endif + } + else // 8bit + { + uint8_t int_bit = 7 - a->dec_bit; + // arm version cannot handle int_bit > 3 + #ifdef NNOM_USING_CMSIS_NN + if(act->tensor->q_dec[0] <= 3) + arm_nn_activations_direct_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH); + else + #endif + local_sigmoid_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit); + } + + return NN_SUCCESS; +} + +static nnom_status_t hard_tanh_run( nnom_activation_t* act) +{ + nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act; + if(act->tensor->bitwidth == 16) + local_hard_tanh_q15(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit + 8); // a->dec is based on 8 bit. + else + local_hard_tanh_q7(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit); + return NN_SUCCESS; +} + +static nnom_status_t hard_sigmoid_run( nnom_activation_t* act) +{ + nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act; + if(act->tensor->bitwidth == 16) + local_hard_sigmoid_q15(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit + 8); // a->dec is based on 8 bit. 
+ else + local_hard_sigmoid_q7(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit); + return NN_SUCCESS; +} + +// +nnom_activation_t* act_relu(void) +{ + nnom_activation_t* act = nnom_mem(sizeof(nnom_activation_t)); + act->run = relu_run; + act->type = ACT_RELU; + return act; +} + +nnom_activation_t* act_leaky_relu(float alpha) +{ + nnom_activation_leaky_relu_t* act = nnom_mem(sizeof(nnom_activation_leaky_relu_t)); + act->super.run = leaky_relu_run; + act->super.type = ACT_LEAKY_RELU; + act->alpha = (q7_t)(alpha*128); + return (nnom_activation_t* )act; +} + +nnom_activation_t* act_adv_relu(float negative_slope, float max, float threshold) +{ + nnom_activation_adv_relu_t* act = nnom_mem(sizeof(nnom_activation_adv_relu_t)); + act->super.run = adv_relu_run; + act->super.type = ACT_ADV_RELU; + act->negative_slope = (q7_t)(negative_slope*128); + act->max = max; + act->threshold = threshold; + return (nnom_activation_t* )act; +} + +nnom_activation_t* act_tanh(int32_t dec_bit) +{ + nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t)); + act->super.run = tanh_run; + act->super.type = ACT_TANH; + act->dec_bit = dec_bit; + return (nnom_activation_t*)act; +} + +nnom_activation_t* act_sigmoid(int32_t dec_bit) +{ + nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t)); + + act->super.run = sigmoid_run; + act->super.type = ACT_SIGMOID; + act->dec_bit = dec_bit; + return (nnom_activation_t*)act; +} + +nnom_activation_t* act_hard_tanh(int32_t dec_bit) +{ + nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t)); + + act->super.run = hard_tanh_run; + act->super.type = ACT_HARD_TANH; + act->dec_bit = dec_bit; + return (nnom_activation_t*)act; +} + +nnom_activation_t* act_hard_sigmoid(int32_t dec_bit) +{ + nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t)); + + act->super.run = hard_sigmoid_run; + act->super.type = ACT_HARD_SIGMOID; + act->dec_bit = dec_bit; + return (nnom_activation_t*)act; +} + +// return the decimal bit if the activation will change the q format of the layer. 
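+// For example (a sketch of the rule implemented below): the ReLU family leaves the layer's q format
+// untouched, while the tanh/sigmoid family saturates its output into a fixed range, so for 8-bit data
+// the result is always q0.7, i.e. dec_bit is forced to 7.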
+int32_t act_get_dec_bit(nnom_activation_type_t type, int32_t dec_bit) +{ + switch(type) + { + case ACT_RELU: + case ACT_LEAKY_RELU: + case ACT_ADV_RELU: + break; + case ACT_TANH: + case ACT_HARD_TANH: + case ACT_SIGMOID: + case ACT_HARD_SIGMOID: + dec_bit = 7; + default:break; + } + return dec_bit; +} + +// a direct api to run activate a tensor +nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor) +{ + act->tensor = tensor; + return act->run(act); +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_avgpool.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_avgpool.c new file mode 100644 index 000000000..8ee220f4c --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_avgpool.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_avgpool.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *avgpool_s(const nnom_pool_config_t * config) +{ + nnom_avgpool_layer_t *cl; + + if(config->num_dim == 1) + { + cl = (nnom_avgpool_layer_t *)AvgPool(kernel(1, config->kernel_size[0]), + stride(1, config->stride_size[0]), + config->padding_type); + } + else + { + cl = (nnom_avgpool_layer_t *)AvgPool(kernel(config->kernel_size[0], config->kernel_size[1]), + stride(config->stride_size[0], config->stride_size[1]), + config->padding_type); + } + + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; // no idea if we need it + } + return (nnom_layer_t *)cl; +} + +nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type) +{ + nnom_layer_t *layer = MaxPool(k, s, pad_type); + + if (layer != NULL) + { + layer->type = NNOM_AVGPOOL; + layer->run = avgpool_run; + layer->build = avgpool_build; + } + return (nnom_layer_t *)layer; +} + +nnom_status_t avgpool_build(nnom_layer_t *layer) +{ + uint32_t size; + // avg pooling share the same output shape, stride, padding setting. + maxpool_build(layer); + + #ifdef NNOM_USING_CMSIS_NN + // however, avg pooling require a computational buffer. + // bufferA size: 2*dim_im_out*ch_im_in + size = layer->out->tensor->dim[1] > layer->out->tensor->dim[0] ? + layer->out->tensor->dim[1] : layer->out->tensor->dim[0]; + layer->comp->size = 2 * size * layer->in->tensor->dim[2]; + #endif + + return NN_SUCCESS; +} + +nnom_status_t avgpool_run(nnom_layer_t *layer) +{ + nnom_avgpool_layer_t *cl = (nnom_avgpool_layer_t *)(layer); + uint16_t out_x, out_y; + // if global pooling + if(layer->out->tensor->num_dim == 1) + { + out_x = 1; out_y = 1; + } + else // normal pooling. 
+ { + out_x = layer->out->tensor->dim[1]; //W + out_y = layer->out->tensor->dim[0]; //h + } + + // 16 bit + if(layer->in->tensor->bitwidth == 16) + { +#ifdef NNOM_USING_CHW + local_avepool_q15_CHW(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + cl->output_shift, + NULL, + layer->out->tensor->p_data); +#else + local_avepool_q15_HWC(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + cl->output_shift, + NULL, + layer->out->tensor->p_data); +#endif + } + // 8bit + else{ +#ifdef NNOM_USING_CHW + local_avepool_q7_CHW(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + cl->output_shift, + NULL, + layer->out->tensor->p_data); +#else //end of CHW + #ifdef NNOM_USING_CMSIS_NN + // 2D, square + if (layer->in->tensor->dim[1] == layer->in->tensor->dim[0] && + layer->out->tensor->dim[1] == layer->out->tensor->dim[0] && + cl->output_shift == 0) + { + arm_avepool_q7_HWC( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->kernel.w, cl->pad.w, cl->stride.w, + layer->out->tensor->dim[1], + layer->comp->mem->blk, + layer->out->tensor->p_data); + } + // none square 2D, or 1D + else + #endif + { + // CMSIS-NN does not support none-square pooling, we have to use local implementation + local_avepool_q7_HWC(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + cl->output_shift, + NULL, + layer->out->tensor->p_data); + } +#endif + } + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_baselayer.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_baselayer.c new file mode 100644 index 000000000..0442fb2b0 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_baselayer.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_baselayer.h" + +// this layer copys the input to the output + +nnom_layer_t *baselayer_s(const nnom_layer_config_t * config) +{ + nnom_layer_t *layer = BaseLayer(); + if(layer) + layer->config = (void*) config; + return layer; +} + +nnom_layer_t *BaseLayer() +{ + nnom_io_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. 
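+ // Layout of the single block allocated above (a sketch):
+ //   [ nnom_io_layer_t | in (nnom_layer_io_t) | out (nnom_layer_io_t) ]
+ // both I/O handles are carved out of the same allocation by the pointer arithmetic below.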
+ in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_BASE; + layer->super.run = default_run; + layer->super.build = default_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + return (nnom_layer_t *)layer; +} + +// this is call while output shape is not defined. +// this will set the output shape same as input shape, and it set only the primary IO +// this cannot be used as first layer, of course... +nnom_status_t default_build(nnom_layer_t *layer) +{ + // get the last layer's output as input shape + layer->in->tensor = layer->in->hook.io->tensor; + // output tensor + // 1. allocate a new tensor for output + // 2. set the same dim, qfmt to the new tensor. + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR,layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now this build has passed the input tensors (shapes, formats) to the new tensors. + return NN_SUCCESS; +} + +// simply copy input to output +nnom_status_t default_run(nnom_layer_t *layer) +{ + if(layer->out->type != NNOM_TENSOR_BUF_NULL) + { + nnom_memcpy(layer->out->tensor->p_data, layer->in->tensor->p_data, tensor_size_byte(layer->in->tensor)); + } + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_concat.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_concat.c new file mode 100644 index 000000000..0e1efa7a2 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_concat.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_concat.h" + +nnom_layer_t *concat_s(const nnom_concat_config_t *config) +{ + nnom_layer_t* layer = Concat(config->axis); + if(layer) + layer->config = (void*) config; + return layer; +} + +// concate method +// concate requires more than one input module. aux input will be allocated in model.merge() +nnom_layer_t *Concat(int8_t axis) +{ + nnom_concat_layer_t *layer; + nnom_layer_io_t *in, *out; + size_t mem_size; + + // apply a block memory for all the sub handles. + mem_size = sizeof(nnom_concat_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_concat_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_CONCAT; + layer->super.run = concat_run; + layer->super.build = concat_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. 
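+ // (only the primary input is created here; when branches are merged, the extra inputs are chained
+ // onto in->aux, which concat_build() and concat_run() below iterate over.)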
+ layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + // axis + layer->axis = axis; + + return (nnom_layer_t *)layer; +} + + +nnom_status_t concat_build(nnom_layer_t *layer) +{ + nnom_concat_layer_t *cl = (nnom_concat_layer_t *)layer; + nnom_layer_io_t *in; + uint32_t in_num = 0; + int32_t num_dim; + + // for each input module, copy the shape from the output of last layer + in = layer->in; + while (in != NULL) + { + //get the last layer's output as input shape + in->tensor = in->hook.io->tensor; + in = in->aux; + in_num++; + } + + // allocate new tensor for output, keep the same dimension lenght + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // convert the axis. + if (cl->axis < 0) + cl->axis = (layer->in->tensor->num_dim + cl->axis); + else if (cl->axis >0) + cl->axis = cl->axis -1; // keras use axis start from 1. we are using 0, 1, 2 (check?) + + // find out the concated axis + num_dim = layer->in->tensor->num_dim; + for (uint32_t i = 0; i < num_dim; i ++) + { + // exclue the concat axies + if (i == cl->axis) + { + layer->out->tensor->dim[i] = 0; + + // add the same axis from all input up. + in = layer->in; + while (in != NULL) + { + layer->out->tensor->dim[i] += in->tensor->dim[i]; + in = in->aux; + } + continue; + } + + // check others, all other must be same shape + in = layer->in; + while (in != NULL && in->aux != NULL) + { + if (in->tensor->dim[i] != in->aux->tensor->dim[i]) + return NN_ARGUMENT_ERROR; + in = in->aux; + } + + // now set other axis + layer->out->tensor->dim[i] = layer->in->tensor->dim[i]; + } + + return NN_SUCCESS; +} + + +#ifdef NNOM_USING_CHW +// axis index converter between HWC and CHW +static inline int chw_i(int hwc, int num_dim) +{ + num_dim = num_dim -1; + hwc = hwc + 1; + if(hwc>num_dim) + hwc = 0; + return hwc; +} +static inline int hwc_i(int chw, int num_dim) +{ + num_dim = num_dim -1; + chw = chw - 1; + if(chw=2) input and 1 output. + nnom_concat_layer_t *cl = (nnom_concat_layer_t *)layer; + nnom_layer_io_t *in; + uint32_t dwidth = layer->in->tensor->bitwidth/8; // data width in byte + +#ifdef NNOM_USING_CHW + // Concatenate for HWC + uint8_t *pin; + uint8_t *pout = layer->out->tensor->p_data; + uint32_t block_size; + uint32_t n_block; + uint8_t num_dim = layer->in->tensor->num_dim; + + // calcualte number of block to concat. the other shapes before the concat axis + n_block = 1; + for(int i= 0; i< chw_i(cl->axis, num_dim); i++) + { + n_block *= layer->in->tensor->dim[hwc_i(i, num_dim)]; + } + + // concat all input layers + for(int i=0; iin; + while (in != NULL) + { + // the block size of concat data in this layer + block_size = dwidth; + for(int j= num_dim-1; j >= chw_i(cl->axis, num_dim); j--) + block_size *= in->tensor->dim[hwc_i(j, num_dim)]; + // concat + pin = (uint8_t *)in->tensor->p_data + i * block_size; + nnom_memcpy(pout, pin, block_size); + pout += block_size; + in = in->aux; + } + } + +#else // end of CHW concate + + // Concatenate for HWC + uint8_t* pin; + uint8_t* pout = layer->out->tensor->p_data; + uint32_t block_size; + uint32_t n_block; + uint8_t num_dim = layer->in->tensor->num_dim; + + // calcualte the number of block to concat. 
(the other shapes before the concat axis) + n_block = 1; + for (int i = 0; i < cl->axis; i++) + n_block *= layer->in->tensor->dim[i]; + + // concat all input layers + for (int i = 0; i < n_block; i++) + { + in = layer->in; + while (in != NULL) + { + // the block size of concat data in this layer + block_size = dwidth; + for (int j = cl->axis; j < num_dim; j++) + block_size *= in->tensor->dim[j]; + // concat + pin = (uint8_t*)in->tensor->p_data + i * block_size; + nnom_memcpy(pout, pin, block_size); + pout += block_size; + in = in->aux; + } + } +#endif + return NN_SUCCESS; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d.c new file mode 100644 index 000000000..ea553aedf --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_conv2d.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +// a machine friendly api, with suffix _s for structured configuration. +nnom_layer_t *conv2d_s(const nnom_conv2d_config_t *config) +{ + nnom_conv2d_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + size_t mem_size; + + // allocate a block memory for all the sub handles and shifts. + mem_size = sizeof(nnom_conv2d_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_conv2d_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_CONV_2D; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + #ifdef NNOM_USING_CMSIS_NN + layer->super.comp = comp; + #endif + // set run method & output shape + layer->super.run = conv2d_run; + layer->super.build = conv2d_build; + layer->super.free = conv2d_free; + + // save the config + layer->super.config = (void*) config; + + // get the private parameters + // test: for 1d input, expend h = 1 + if(config->weight->num_dim == 3) + { + layer->kernel = kernel(1, config->kernel_size[0]); + layer->stride = stride(1, config->stride_size[0]); + layer->dilation = dilation(1, config->dilation_size[0]); + } + else + { + layer->kernel = kernel(config->kernel_size[0], config->kernel_size[1]); + layer->stride = stride(config->stride_size[0], config->stride_size[1]); + layer->dilation = dilation(config->dilation_size[0], config->dilation_size[1]); + } + + layer->filter_mult = config->filter_size; // for convs, this means filter number + layer->padding_type = config->padding_type; + + // get bias and weight tensor, this should be created by script. 
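+ // (the config, including weight/bias tensors and shift arrays, is normally emitted by the NNoM
+ // model-conversion script; only references are kept here, so the generated data must outlive the model.)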
+ layer->weight = config->weight; + layer->bias = config->bias; + + // get shifts + layer->output_rshift = (nnom_qformat_param_t *)config->output_shift; + layer->bias_lshift = (nnom_qformat_param_t *)config->bias_shift; + + // padding + if (layer->padding_type == PADDING_SAME) + { + layer->pad.h = layer->dilation.h * (layer->kernel.h - 1) / 2; + layer->pad.w = layer->dilation.w * (layer->kernel.w - 1) / 2; + layer->pad.c = (1 - 1) / 2; + } + + return (nnom_layer_t *)layer; +} + + +// Conv2D +// multiplier of (output/input channel), +// shape of kernal, shape of strides, weight struct, bias struct +nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b) +{ + nnom_conv2d_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_conv2d_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_conv2d_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_CONV_2D; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + #ifdef NNOM_USING_CMSIS_NN + layer->super.comp = comp; + #endif + // set run method & output shape + layer->super.run = conv2d_run; + layer->super.build = conv2d_build; + + // get the private parameters + layer->kernel = k; + layer->stride = s; + layer->dilation = d; + layer->filter_mult = filters; // for convs, this means filter number + layer->padding_type = pad_type; + + // create weight and bias tensor + layer->weight = new_tensor(NNOM_QTYPE_PER_TENSOR, 4, filters); + layer->bias = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, filters); + + // configure weight tensor manually to support new tensor based backends. + // needs to be very careful + { + // config weight + nnom_shape_data_t dim[4] = {k.h, k.w, k.c, filters}; + *(layer->weight->q_offset) = 0; // we have no support of offset here + *(layer->weight->q_dec) = 0; // not using it + layer->weight->p_data = (void*)w->p_value; + layer->weight->bitwidth = 8; + layer->weight->qtype = NNOM_QTYPE_PER_TENSOR; + nnom_memcpy(layer->weight->dim, dim, layer->weight->num_dim * sizeof(nnom_shape_data_t)); + + // config bias + dim[0] = filters; + *(layer->bias->q_offset) = 0; // we have no support of offset here + *(layer->bias->q_dec) = 0; // not using it + layer->bias->p_data = (void*) b->p_value; + layer->bias->bitwidth = 8; + layer->weight->qtype = NNOM_QTYPE_PER_TENSOR; + nnom_memcpy(layer->bias->dim, dim, layer->bias->num_dim * sizeof(nnom_shape_data_t)); + + // output shift and bias shift + layer->output_rshift = (nnom_qformat_param_t *)&w->shift; + layer->bias_lshift = (nnom_qformat_param_t *)&b->shift; + } + + return (nnom_layer_t *)layer; +} + +// keras's implementation. 
+// source: https://github.com/keras-team/keras/blob/7a39b6c62d43c25472b2c2476bd2a8983ae4f682/keras/utils/conv_utils.py#L85 +uint32_t conv_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation) +{ + if (input_length == 0) + return 0; + uint32_t dilated_filter_size = (filter_size - 1) * dilation + 1; + uint32_t output_length; + if(padding == PADDING_SAME) + output_length = input_length; + else + output_length = input_length - dilated_filter_size + 1; + return (output_length + stride - 1) / stride; +} + +nnom_status_t conv2d_build(nnom_layer_t *layer) +{ + nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for the output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, cl->filter_mult); + // copy then change later. + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // calculate the output tensor q format, only support per tensor quantise now + layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; // need some modification for 16bit. + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now we set up the tensor shape, always HWC format + layer->out->tensor->dim[0] = conv_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h); + layer->out->tensor->dim[1] = conv_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w); + layer->out->tensor->dim[2] = cl->filter_mult; // channel stays the same + + // fill padding + if (cl->padding_type == PADDING_SAME) + { + cl->pad.w = cl->dilation.w * (cl->kernel.w - 1) / 2; + cl->pad.h = cl->dilation.h * (cl->kernel.h - 1) / 2; + cl->pad.c = 0; + } + + #ifdef NNOM_USING_CMSIS_NN + // bufferA size: (1D shape) + // 2*ch_im_in*dim_kernel*dim_kernel + layer->comp->size = 2 * 2 * layer->in->tensor->dim[2] * cl->kernel.w * cl->kernel.h; + #endif + // computational cost: K x K x Cin x Hour x Wout x Cout + layer->stat.macc = cl->kernel.w * cl->kernel.h * layer->in->tensor->dim[2] * tensor_size(layer->out->tensor); + return NN_SUCCESS; +} + +nnom_status_t conv2d_free(nnom_layer_t *layer) +{ + // free weight and bias tensor when we are not initialised from structured configuration. 
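+ // (Conv2D() allocates its own weight/bias tensors with new_tensor(), so they are deleted here;
+ // conv2d_s() only references tensors owned by the config, which are left untouched.)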
+ if(!layer->config) + { + nnom_conv2d_layer_t* cl = (nnom_conv2d_layer_t*)layer; + delete_tensor(cl->weight); + delete_tensor(cl->bias); + } + return NN_SUCCESS; +} + + +nnom_status_t conv2d_run(nnom_layer_t *layer) +{ + nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer; + +#ifdef NNOM_USING_CHW + // CHW format + if(layer->in->tensor->bitwidth == 16) + local_convolve_CHW_q15_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + else + local_convolve_CHW_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return NN_SUCCESS; +#else + // HWC format + #ifdef NNOM_USING_CMSIS_NN + // current cmsis nn does not support dilation + if(cl->dilation.w == 1 && cl->dilation.h == 1 && cl->weight->qtype == NNOM_QTYPE_PER_TENSOR) + { + // 8 bit cmsis nn + if(layer->in->tensor->bitwidth == 8) + { + //RGB + // ch_im_in = 3, w = h + if (layer->in->tensor->dim[2] == 3 && layer->in->tensor->dim[0] == layer->in->tensor->dim[1]) + // squared + if((cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h)) + return (nnom_status_t)arm_convolve_HWC_q7_RGB( + layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], + cl->kernel.w, cl->pad.w, cl->stride.w, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, layer->out->tensor->dim[1], + (q15_t *)(layer->comp->mem->blk), NULL); + + // check if can use optimized function + // ch_im_in is multiple of 4 + // ch_im_out is multiple of 2 + if ((layer->in->tensor->dim[2] % 4 == 0) && (layer->out->tensor->dim[2] % 2 == 0)) + { + // squared + if((layer->in->tensor->dim[0] == layer->in->tensor->dim[1]) + && (layer->out->tensor->dim[0] == layer->out->tensor->dim[1]) + && (cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h)) + { + // 1x1 fast + if (cl->kernel.w == 1 && cl->kernel.h == 1 && cl->stride.w == 1 && cl->stride.h == 1 && cl->pad.w == 0 && cl->pad.h == 0) + return (nnom_status_t)arm_convolve_1x1_HWC_q7_fast_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, layer->out->tensor->dim[1], layer->out->tensor->dim[0], + (q15_t *)(layer->comp->mem->blk), NULL); + // opt square shape + else + return (nnom_status_t)arm_convolve_HWC_q7_fast( + layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], cl->kernel.w, 
cl->pad.w, cl->stride.w, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, + layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL); + } + // opt none square shape + else + return (nnom_status_t)arm_convolve_HWC_q7_fast_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, + cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL); + } + // none optimized + else + { + // none opt square shape + if ((layer->in->tensor->dim[0] == layer->in->tensor->dim[1] && + layer->out->tensor->dim[0] == layer->out->tensor->dim[1]) && + (cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h)) + return (nnom_status_t)arm_convolve_HWC_q7_basic( + layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], cl->kernel.w, cl->pad.w, cl->stride.w, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, + layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL); + // none opt none square shape + else + return (nnom_status_t)arm_convolve_HWC_q7_basic_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, + cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL); + } //end of cmsis-nn none-opt + } //end of 8 bit cmsis-nn + else if (layer->in->tensor->bitwidth == 16) + { + // fast opt + if ((layer->in->tensor->dim[2] % 2 == 0) && (layer->out->tensor->dim[2] % 2 == 0)) + { + if((layer->in->tensor->dim[0] == layer->in->tensor->dim[1]) + && (layer->out->tensor->dim[0] == layer->out->tensor->dim[1]) + && (cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h)) + return (nnom_status_t)arm_convolve_HWC_q15_fast( + layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], cl->kernel.w, cl->pad.w, cl->stride.w, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, + layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL); + else + return (nnom_status_t)arm_convolve_HWC_q15_fast_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, + cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL); + } + // none opt basic + else + { + local_convolve_HWC_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, 
cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return NN_SUCCESS; + } + + } // end of 16 bit cmsis-nn + } // end of dilation == 1 + else + #endif // NNOM_USING_CMSIS_NN + { + + if(layer->in->tensor->bitwidth == 16) + local_convolve_HWC_q15_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + else + local_convolve_HWC_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return NN_SUCCESS; + } +#endif // end of CHW/HWC + return NN_SUCCESS; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d_trans.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d_trans.c new file mode 100644 index 000000000..5a99380a2 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d_trans.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-31 Jianjia Ma The first version + */ + + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_conv2d_trans.h" + +nnom_layer_t *conv2d_trans_s(const nnom_conv2d_config_t *config) +{ + nnom_layer_t *layer; + layer = conv2d_s(config); + if (layer) + { + layer->type = NNOM_CONV2D_TRANS; + layer->run = conv2d_trans_run; + layer->build = conv2d_trans_build; + } + return layer; +} + +nnom_layer_t *Conv2DTrans(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b) +{ + nnom_layer_t *layer = Conv2D(multiplier, k, s, d, pad_type, w, b); + if (layer != NULL) + { + layer->type = NNOM_CONV2D_TRANS; + layer->run = conv2d_trans_run; + layer->build = conv2d_trans_build; + } + return layer; +} + +// utils, keras method +// https://github.com/keras-team/keras/blob/7a39b6c62d43c25472b2c2476bd2a8983ae4f682/keras/utils/conv_utils.py#L114 +// https://github.com/tensorflow/tensorflow/blob/2b96f3662bd776e277f86997659e61046b56c315/tensorflow/python/layers/utils.py#L156 +uint32_t conv_trans_output_length(uint32_t input_length, uint32_t kernel_size, nnom_padding_t padding, uint32_t stride_size, uint32_t dilation) +{ + input_length *= stride_size; + if (padding == PADDING_VALID) + input_length += MAX(kernel_size - stride_size, 0); + return input_length; +} + +nnom_status_t conv2d_trans_build(nnom_layer_t *layer) +{ + nnom_conv2d_trans_layer_t *cl = (nnom_conv2d_trans_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = 
layer->in->hook.io->tensor; + + // create new tensor for the output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, cl->filter_mult); + // copy then change later. + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // calculate the output tensor q format, only support per tensor quantise now + layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now we set up the tensor shape, always HWC format + layer->out->tensor->dim[0] = conv_trans_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h); + layer->out->tensor->dim[1] = conv_trans_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w); + layer->out->tensor->dim[2] = cl->filter_mult; // channel stays the same + + // fill the correct padding + if(cl->padding_type == PADDING_SAME) + { + cl->pad.h = (cl->kernel.h - cl->stride.h) / 2; // the padding to the output. + cl->pad.w = (cl->kernel.w - cl->stride.w) / 2; +// cl->pad.h = (cl->kernel.h - 1)/2; // the padding to the output. +// cl->pad.w = (cl->kernel.w - 1)/2; + cl->pad.c = 0; + } + else + { + cl->pad.h = 0; + cl->pad.w = 0; + cl->pad.c = 0; + } + + // bufferA size: (1D shape) + // 2*ch_im_in*dim_kernel*dim_kernel + //layer->comp->size = 2 * 2 * layer->in->tensor->dim[2] * cl->kernel.w * cl->kernel.h; + // computational cost: K x K x Cin x Hour x Wout x Cout + layer->stat.macc = cl->kernel.w * cl->kernel.h * layer->in->tensor->dim[2] * tensor_size(layer->out->tensor); + return NN_SUCCESS; +} + + +nnom_status_t conv2d_trans_run(nnom_layer_t *layer) +{ + nnom_conv2d_trans_layer_t *cl = (nnom_conv2d_trans_layer_t *)layer; + +#ifdef NNOM_USING_CHW + // no support for CHW yet + return NN_ARGUMENT_ERROR; +#else + + //return conv2d_run(layer); + + local_conv_trans_HWC_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return NN_SUCCESS; +#endif +} + + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_cropping.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_cropping.c new file mode 100644 index 000000000..01abe9265 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_cropping.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_cropping.h" + +nnom_layer_t * cropping_s(const nnom_cropping_config_t *config) +{ + nnom_layer_t *layer = Cropping(config->pad); + if(layer) + layer->config = (void*) config; + return layer; +} + +// Cropping layer +nnom_layer_t *Cropping(nnom_border_t pad) +{ + nnom_layer_t *layer; + // most setting are the same as zero padding + layer = ZeroPadding(pad); + + // now 
change to cropping + layer->type = NNOM_CROPPING; + layer->run = cropping_run; + layer->build = cropping_build; + + return layer; +} + +nnom_status_t cropping_build(nnom_layer_t* layer) +{ + nnom_cropping_layer_t *cl = (nnom_cropping_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + // copy then change later. + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // output shape + if(layer->in->tensor->dim[1] <= (cl->pad.left + cl->pad.right) || + layer->in->tensor->dim[0] <= (cl->pad.top + cl->pad.bottom)) + return NN_ARGUMENT_ERROR; + + layer->out->tensor->dim[0] = layer->in->tensor->dim[0] - (cl->pad.top + cl->pad.bottom); + layer->out->tensor->dim[1] = layer->in->tensor->dim[1] - (cl->pad.left + cl->pad.right); + layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; + return NN_SUCCESS; +} + + +nnom_status_t cropping_run(nnom_layer_t * layer) +{ + nnom_cropping_layer_t *cl = (nnom_cropping_layer_t*)layer; + +#ifdef NNOM_USING_CHW + local_cropping_CHW_q7( +#else + local_cropping_HWC_q7( +#endif + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->pad.top, + cl->pad.bottom, + cl->pad.left, + cl->pad.right, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0]); + + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dense.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dense.c new file mode 100644 index 000000000..17c566c76 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dense.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_dense.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *dense_s(const nnom_dense_config_t *config) +{ + nnom_dense_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_dense_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_dense_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_DENSE; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. 
+ layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + layer->super.comp = comp; + // set run and outshape methods + layer->super.run = dense_run; + layer->super.build = dense_build; + layer->super.free = dense_free; + + // set parameters + layer->output_unit = tensor_get_num_channel(config->weight); + layer->bias = config->bias; + layer->weight = config->weight; + // set shifts + layer->output_rshift = (nnom_qformat_param_t *)config->output_shift; + layer->bias_lshift = (nnom_qformat_param_t *)config->bias_shift; + // set config + layer->super.config = (void*) config; + + return (nnom_layer_t *)layer; +} + +nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b) +{ + nnom_dense_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_dense_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_dense_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_DENSE; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + layer->super.comp = comp; + // set run and outshape methods + layer->super.run = dense_run; + layer->super.build = dense_build; + + // set parameters + layer->output_unit = output_unit; // this is no longer needed. the information is contained in the weight tensor. + + layer->weight = new_tensor(NNOM_QTYPE_PER_TENSOR, 2, output_unit); + layer->bias = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, output_unit); + + // configure weight tensor manually to support new tensor-based backends. + // needs to be very careful + { + // config weight + nnom_shape_data_t dim[2] = {0, output_unit}; // the first dim doesnt matter here. will be file in later. 
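+ // (dim[0] would be the input feature length; the q7 fully-connected kernels take the input size
+ // from the input tensor at run time, so it can safely stay 0 here.)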
+ *(layer->weight->q_offset) = 0; // we have no support of offset here + *(layer->weight->q_dec) = 0; // this is not even correct + layer->weight->p_data = (void*)w->p_value; + layer->weight->bitwidth = 8; + layer->weight->qtype = NNOM_QTYPE_PER_TENSOR; + nnom_memcpy(layer->weight->dim, dim, layer->weight->num_dim * sizeof(nnom_shape_data_t)); + + // config bias + dim[0] = output_unit; + *(layer->bias->q_offset) = 0; // we have no support of offset here + *(layer->bias->q_dec) = 0; // this is not even correct + layer->bias->p_data = (void*)b->p_value; + layer->bias->bitwidth = 8; + layer->weight->qtype = NNOM_QTYPE_PER_TENSOR; + nnom_memcpy(layer->bias->dim, dim, layer->bias->num_dim * sizeof(nnom_shape_data_t)); + } + + // set output shifts + layer->output_rshift = (nnom_qformat_param_t *)&w->shift; + layer->bias_lshift = (nnom_qformat_param_t *)&b->shift; + + return (nnom_layer_t *)layer; +} + +nnom_status_t dense_build(nnom_layer_t *layer) +{ + nnom_dense_layer_t *cl = (nnom_dense_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, tensor_get_num_channel(layer->in->tensor)); + // setup new tensor + nnom_shape_data_t dim[1] = {cl->output_unit}; + tensor_set_attr(layer->out->tensor, cl->weight->q_dec, cl->weight->q_offset, dim, 1, 8); // test, this is not correct + + // calculate the output tensor q format, only support per tensor quantise now + layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // vec_buffer size: dim_vec (*2, q7->q15) ? I am not sure this is right + layer->comp->size = tensor_size(layer->in->tensor)*2; + + // computational cost: In * out + layer->stat.macc = tensor_size(layer->in->tensor) * tensor_size(layer->out->tensor); + return NN_SUCCESS; +} + +nnom_status_t dense_free(nnom_layer_t *layer) +{ + // free weight and bias tensor when we are not initialised from structured configuration. + if(!layer->config) + { + nnom_dense_layer_t* cl = (nnom_dense_layer_t*)layer; + delete_tensor(cl->weight); + delete_tensor(cl->bias); + } + + return NN_SUCCESS; +} + +nnom_status_t dense_run(nnom_layer_t *layer) +{ + nnom_status_t result = NN_SUCCESS; + nnom_dense_layer_t *cl = (nnom_dense_layer_t *)(layer); + nnom_qformat_param_t bias_shift = cl->bias_lshift[0]; // this is not correct but a temporary fix solution for backward compatibility. 
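+ // (the q7 fully-connected kernels called below accept only one scalar shift each, so just the
+ // first element of the bias/output shift arrays is used.)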
+ nnom_qformat_param_t output_shift = cl->output_rshift[0]; + + +#if !(DENSE_WEIGHT_OPT) + #ifdef NNOM_USING_CMSIS_NN + result = (nnom_status_t)arm_fully_connected_q7( + #else + local_fully_connected_q7( + #endif +#else + #ifdef NNOM_USING_CMSIS_NN + result = (nnom_status_t)arm_fully_connected_q7_opt( + #else + local_fully_connected_q7_opt( + #endif +#endif + layer->in->tensor->p_data, + cl->weight->p_data, + tensor_size(layer->in->tensor), layer->out->tensor->dim[0], + bias_shift, output_shift, + cl->bias->p_data, + layer->out->tensor->p_data, (q15_t *)(layer->comp->mem->blk)); + return result; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dw_conv2d.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dw_conv2d.c new file mode 100644 index 000000000..72ac7754e --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dw_conv2d.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_dw_conv2d.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *dw_conv2d_s(const nnom_conv2d_config_t *config) +{ + nnom_layer_t *layer; + layer = conv2d_s(config); + if (layer) + { + layer->type = NNOM_DW_CONV_2D; + layer->run = dw_conv2d_run; + layer->build = dw_conv2d_build; + } + return layer; +} + +nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b) +{ + nnom_layer_t *layer = Conv2D(multiplier, k, s, d, pad_type, w, b); // passing multiplier in . + if (layer != NULL) + { + layer->type = NNOM_DW_CONV_2D; + layer->run = dw_conv2d_run; + layer->build = dw_conv2d_build; + } + return layer; +} + +nnom_status_t dw_conv2d_build(nnom_layer_t *layer) +{ + nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor) * cl->filter_mult); + // copy then change later. 
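+ // (copying the input tensor's attributes gives the right bitwidth and q-format defaults; the spatial
+ // dims and the channel count, in_channels * filter_mult for depthwise, are overwritten below.)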
+ tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // calculate the output tensor q format, only support per tensor quantise now + layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now we set up the tensor shape, always HWC format + layer->out->tensor->dim[0] = conv_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h); + layer->out->tensor->dim[1] = conv_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w); + layer->out->tensor->dim[2] = layer->in->tensor->dim[2] * cl->filter_mult; // channel stays the same + + // fill padding + if (cl->padding_type == PADDING_SAME) + { + cl->pad.w = cl->dilation.w * (cl->kernel.w - 1) / 2; + cl->pad.h = cl->dilation.h * (cl->kernel.h - 1) / 2; + cl->pad.c = 0; + } + + // bufferA size: + #ifdef NNOM_USING_CMSIS_NN + layer->comp->size = 2 * 2 * (layer->in->tensor->dim[2] / cl->filter_mult) * cl->kernel.w * cl->kernel.h; + #endif + + // computational cost: K x K x Cin x Hout x Wout x Multiplier + // or : K x K x Cout x Hout x Wout + layer->stat.macc = cl->kernel.w * cl->kernel.h * tensor_size(layer->out->tensor); + return NN_SUCCESS; +} + +nnom_status_t dw_conv2d_run(nnom_layer_t *layer) +{ + nnom_status_t result = NN_SUCCESS; + nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer; + +#ifndef NNOM_USING_CHW + #ifdef NNOM_USING_CMSIS_NN + // Current CMSIS-NN does not support dilation + if(cl->dilation.w ==1 && cl->dilation.h == 1 && cl->weight->qtype == NNOM_QTYPE_PER_TENSOR && cl->filter_mult == 1) + { + // CMSIS-NN only support 1 mulplipier in depthwise conv + if (layer->in->tensor->dim[2] % 2 != 0 || layer->out->tensor->dim[2] % 2) + return NN_ARGUMENT_ERROR; + result = (nnom_status_t)arm_depthwise_separable_conv_HWC_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + cl->bias->p_data, + cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL); + } + else + #endif + local_depthwise_separable_conv_HWC_q7_nonsquare( +#else + local_depthwise_separable_conv_CHW_q7_nonsquare( +#endif + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + cl->dilation.w, cl->dilation.h, + cl->bias->p_data, + cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return result; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_flatten.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_flatten.c new file mode 100644 index 000000000..c976bca9a --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_flatten.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 
2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_flatten.h" + +nnom_layer_t *flatten_s(const nnom_flatten_config_t *config) +{ + nnom_layer_t *layer = Flatten(); + if(layer) + layer->config = (void*) config; + return layer; +} + +nnom_layer_t *Flatten(void) +{ + nnom_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->type = NNOM_FLATTEN; + layer->run = flatten_run; + layer->build = flatten_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + #ifdef NNOM_USING_CHW + out->type = NNOM_TENSOR_BUF_TEMP; // test for CHW format + #else + out->type = NNOM_TENSOR_BUF_NULL; + #endif + // put in & out on the layer. + layer->in = io_init(layer, in); + layer->out = io_init(layer, out); + + return layer; +} + +nnom_status_t flatten_build(nnom_layer_t *layer) +{ + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + // setup new tensor + nnom_shape_data_t dim[1] = {tensor_size(layer->in->tensor)}; + tensor_set_attr(layer->out->tensor, layer->in->tensor->q_dec, layer->in->tensor->q_offset, dim, 1, 8); + + return NN_SUCCESS; +} + +nnom_status_t flatten_run(nnom_layer_t *layer) +{ + #ifdef NNOM_USING_CHW + // CHW format must reorder to HWC for dense layer and all other 1D layer (?) 
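+ // e.g. a 3-channel 2x2 feature map stored as CHW (RRRR GGGG BBBB) is rewritten in HWC order
+ // (RGB RGB RGB RGB), so the flattened vector keeps the element order a HWC-built model expects.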
+ tensor_chw2hwc_q7(layer->out->tensor, layer->in->tensor); + #endif + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_global_pool.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_global_pool.c new file mode 100644 index 000000000..8e0d1ee64 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_global_pool.c @@ -0,0 +1,145 @@ + +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_global_pool.h" + +nnom_layer_t * global_maxpool_s(const nnom_global_pool_config_t *config) +{ + nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalMaxPool(); + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; + } + return (nnom_layer_t *)cl; +} +nnom_layer_t * global_avgpool_s(const nnom_global_pool_config_t *config) +{ + nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalAvgPool(); + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; + } + return (nnom_layer_t *)cl; +} + +nnom_layer_t * global_sumpool_s(const nnom_global_pool_config_t *config) +{ + nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalSumPool(); + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; + } + return (nnom_layer_t *)cl; +} + + +nnom_layer_t *GlobalMaxPool(void) +{ + // create the normal pooling layer, the parameters are left empty to fill in later. + // parameters will be filled in in global_pooling_build() + nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID); + + // change to global max pool + if (layer != NULL) + { + layer->type = NNOM_GLOBAL_MAXPOOL; + layer->build = global_pool_build; + } + + return (nnom_layer_t *)layer; +} + +nnom_layer_t *GlobalAvgPool(void) +{ + // create the normal pooling layer, the parameters are left empty to fill in later. + // parameters will be filled in global_pooling_build() remotely + nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID); + + // change some parameters to be recognised as avg pooling + if (layer != NULL) + { + layer->type = NNOM_GLOBAL_AVGPOOL; + layer->run = avgpool_run; // global and basic pooling share the same runner + layer->build = global_pool_build; + } + + return (nnom_layer_t *)layer; +} + +nnom_layer_t *GlobalSumPool(void) +{ + // create the normal pooling layer, the parameters are left empty to fill in later. 
+ // parameters will be filled in global_pooling_build() remotely + nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID); + + // change some parameters to be recognised as avg pooling + if (layer != NULL) + { + layer->type = NNOM_GLOBAL_SUMPOOL; + layer->run = sumpool_run; // global and basic pooling share the same runner + layer->build = global_pool_build; + } + + return (nnom_layer_t *)layer; +} + +nnom_status_t global_pool_build(nnom_layer_t *layer) +{ + nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, tensor_get_num_channel(layer->in->tensor)); + + nnom_shape_data_t dim[1] = {tensor_get_num_channel(layer->in->tensor)}; // fill the first 2 dim later + tensor_set_attr_v(layer->out->tensor, layer->in->tensor->q_dec[0], 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // different from other *_build(), the kernel..padding left by layer API needs to be set in here + // due to the *_run() methods of global pooling are using the normall pooling's. + // fill in the parameters left by layer APIs (GlobalAvgPool and MaxAvgPool) + cl->kernel = shape(layer->in->tensor->dim[0], layer->in->tensor->dim[1], 1); + cl->stride = shape(1, 1, 1); + cl->pad = shape(0, 0, 0); + cl->padding_type = PADDING_VALID; + + // additionally, avg pooling require computational buffer, which is 2*dim_im_out*ch_im_in + if (layer->type == NNOM_AVGPOOL || layer->type == NNOM_GLOBAL_AVGPOOL) + { + // bufferA size: 2*dim_im_out*ch_im_in + layer->comp->size = 2 * layer->out->tensor->dim[0] * layer->in->tensor->dim[2]; + } + + // additional for sumpool + if (layer->type == NNOM_SUMPOOL || layer->type == NNOM_GLOBAL_SUMPOOL) + layer->comp->size = 4 * tensor_size(layer->out->tensor); + + return NN_SUCCESS; +} + + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_gru_cell.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_gru_cell.c new file mode 100644 index 000000000..7e01e9e2a --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_gru_cell.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-24 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_gru_cell.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_rnn_cell_t *gru_cell_s(const nnom_gru_cell_config_t* config) +{ + nnom_gru_cell_t *cell; + cell = nnom_mem(sizeof(nnom_gru_cell_t)); + if (cell == NULL) + return NULL; + // set methods + cell->super.run = gru_cell_run; + cell->super.build = gru_cell_build; + cell->super.free = gru_cell_free; + cell->super.config = (void*) config; + cell->super.units = config->units; + cell->super.type = NNOM_GRU_CELL; + + // set parameters + cell->bias = config->bias; + cell->weights = config->weights; + cell->recurrent_weights = config->recurrent_weights; + + // q format for intermediate calculation + cell->q_dec_h = config->q_dec_h; + cell->q_dec_z = config->q_dec_z; + + return (nnom_rnn_cell_t *)cell; +} + 
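+// A minimal usage sketch (illustrative only: the tensor names and q-format values below are made up;
+// in practice the whole nnom_gru_cell_config_t comes from the header generated by the NNoM script):
+//
+//   nnom_gru_cell_config_t cfg = {
+//       .units = 16,
+//       .weights = &gru_kernel, .recurrent_weights = &gru_recurrent_kernel, .bias = &gru_bias,
+//       .q_dec_h = 7, .q_dec_z = 7,
+//   };
+//   nnom_rnn_cell_t *cell = gru_cell_s(&cfg); // then hand the cell to the RNN layer constructor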
+nnom_status_t gru_cell_free(nnom_rnn_cell_t* cell) +{ + return NN_SUCCESS; +} + +// the state buffer and computational buffer shape of the cell +nnom_status_t gru_cell_build(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_gru_cell_t *c = (nnom_gru_cell_t *)cell; + + // calculate output shift for the 2 calculation. + // hw = the product of hidden x weight, iw = the product of input x weight + // due to the addition of them, they must have same q format. + // that is -> c->q_dec_z; + + // for the dots in cell: output shift = input_dec + weight_dec - output_dec + c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - c->q_dec_z; + c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->q_dec_z; + + // bias shift = bias_dec - out_dec + c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0]; + + // state size = one timestamp output size. + cell->state_size = cell->units * 2; // Q15 + + // comp buffer size: not required + cell->comp_buf_size = cell->units * (3*3) * 2 + cell->feature_size * 2; //q15 + input q7->q15 buffer. + + // finally, calculate the MAC for info for each timestamp + cell->macc = cell->feature_size * cell->units *3 // input: feature * state * 3 gates + + cell->units * cell->units *8 // recurrent, state * output_unit * (5 gate + 3 mult) + + cell->units * (3 + 3 + 5); // 3 gates, 3 mult, 5 addition + + return NN_SUCCESS; +} + + +// keras implementation as below. +/* + def step(cell_inputs, cell_states): + """Step function that will be used by Keras RNN backend.""" + h_tm1 = cell_states[0] + + # inputs projected by all gate matrices at once + matrix_x = K.dot(cell_inputs, kernel) + matrix_x = K.bias_add(matrix_x, input_bias) + + x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1) + + # hidden state projected by all gate matrices at once + matrix_inner = K.dot(h_tm1, recurrent_kernel) + matrix_inner = K.bias_add(matrix_inner, recurrent_bias) + + recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3, + axis=1) + z = nn.sigmoid(x_z + recurrent_z) + r = nn.sigmoid(x_r + recurrent_r) + hh = nn.tanh(x_h + r * recurrent_h) + + # previous and candidate state mixed by update gate + h = z * h_tm1 + (1 - z) * hh + return h, [h] +*/ + +// +nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_gru_cell_t* c = (nnom_gru_cell_t*) cell; + int act_int_bit = 7 - c->q_dec_z; + // gate data + q15_t* x_z, *x_r, *x_h; + q15_t* recurrent_z, *recurrent_r, *recurrent_h; + q15_t* temp[3]; + + // bias + q7_t* bias = (q7_t*)c->bias->p_data; + q7_t* recurrent_bias = (q7_t*)c->bias->p_data + cell->units*3; + + // state buffer + q15_t* h_tm1 = (q15_t*)cell->in_state; + q15_t* h_t = (q15_t*)cell->out_state; + + // computing buffer + // low |-- buf0 --|-- buf1 --|-- buf2 --|-- input_q15 --| + q15_t *buf[3]; + buf[0] = (q15_t*)layer->comp->mem->blk; + buf[1] = (q15_t*)layer->comp->mem->blk + cell->units*3; + buf[2] = (q15_t*)layer->comp->mem->blk + cell->units*6; + q15_t *in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*9; + + // input q7 cast to q15 + local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size); + + // matrix_x = K.dot(cell_inputs, kernel) + bias --> buf0 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (in_q15_buf, c->weights->p_data, cell->feature_size, + cell->units*3, c->bias_shift + 8, c->oshift_iw, bias, buf[0], NULL); + + // matrix_intter = K.dot(h_tm1, 
recurrent_kernel) + bias -> buf1 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (h_tm1, c->recurrent_weights->p_data, cell->units, + cell->units*3, c->bias_shift + 8, c->oshift_hw, recurrent_bias, buf[1], NULL); + + // split to each gate + x_z = buf[0]; + x_r = buf[0] + cell->units; + x_h = buf[0] + cell->units*2; + recurrent_z = buf[1]; + recurrent_r = buf[1] + cell->units; + recurrent_h = buf[1] + cell->units*2; + // buffers + temp[0] = buf[2]; + temp[1] = buf[2] + cell->units; + temp[2] = buf[2] + cell->units*2; + + /* z = nn.sigmoid(x_z + recurrent_z) */ + // 1. z1 = x_z + recurrent_z ---> temp[0] + local_add_q15(x_z, recurrent_z, temp[0], 0, cell->units); + // 2. z = sigmoid(z1) + local_sigmoid_q15(temp[0], cell->units, act_int_bit); + + /* r = nn.sigmoid(x_r + recurrent_r) */ + // 1. r1 = x_r + recurrent_r ---> temp[1] + local_add_q15(x_r, recurrent_r, temp[1], 0, cell->units); + // 2. r = sigmoid(r1) + local_sigmoid_q15(temp[1], cell->units, act_int_bit); + + /* hh = nn.tanh(x_h + r * recurrent_h) */ + // 1. hh1 = r * recurrent_h ---> temp[2] + local_mult_q15(temp[1], recurrent_h, temp[2], 15, cell->units); + // 2. hh2 = x_h + hh1 ---> temp[1] + local_add_q15(x_h, temp[2], temp[1], 0, cell->units); + // 3. hh = tanh(h2) ---> temp[1] + local_tanh_q15(temp[1], cell->units, act_int_bit); + + /* h = z * h_tm1 + (1 - z) * hh */ + // 1. h1 = z*h_tm1 ---> temp[2] + local_mult_q15(temp[0], h_tm1, temp[2], 15, cell->units); + // 2. h2 = 1 - z ---> h_t state buff + local_1_minor_z_q15(temp[0], h_t, 15, cell->units); + // 3. h3 = h2 * hh ---> temp[0] + local_mult_q15(h_t, temp[1], temp[0], 15, cell->units); + // h = h1 + h3 + local_add_q15(temp[2], temp[0], h_t, 0, cell->units); + + // finally, copy and convert state to output + local_q15_to_q7(h_t, cell->out_data, 8, cell->units); + return NN_SUCCESS; +} + + +// Researve for debugging, printing the intermediate variables/data. 
+#if 0 +// delete after testing completed +static void print_variable_q15(q15_t *data,char*name, int dec_bit, int size) +{ + printf("\n\n"); + printf("%s", name); + for(int i = 0; i < size; i++) + { + if(i%8==0) + printf("\n"); + printf("%f\t", (float) data[i] / (1 << dec_bit)); + } + printf("\n"); +} + +// +nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_gru_cell_t* c = (nnom_gru_cell_t*) cell; + int act_int_bit = 7 - c->q_dec_z; + // gate data + q15_t* x_z, *x_r, *x_h; + q15_t* recurrent_z, *recurrent_r, *recurrent_h; + q15_t* temp[3]; + + // test + //nnom_memset(cell->in_data, 5 * (1<in->tensor->q_dec[0]), cell->feature_size); + + // bias + q7_t* bias = (q7_t*)c->bias->p_data; + q7_t* recurrent_bias = (q7_t*)c->bias->p_data + cell->units*3; + + // state buffer + q15_t* h_tm1 = (q15_t*)cell->in_state; + q15_t* h_t = (q15_t*)cell->out_state; + + // computing buffer + // low |-- buf0 --|-- buf1 --|-- buf2 --|-- input_q15 --| + q15_t *buf[3]; + buf[0] = (q15_t*)layer->comp->mem->blk; + buf[1] = (q15_t*)layer->comp->mem->blk + cell->units*3; + buf[2] = (q15_t*)layer->comp->mem->blk + cell->units*6; + q15_t *in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*9; + + // input q7 cast to q15 + local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size); + + // matrix_x = K.dot(cell_inputs, kernel) + bias --> buf0 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (in_q15_buf, c->weights->p_data, cell->feature_size, + cell->units*3, c->bias_shift + 8, c->oshift_iw, bias, buf[0], NULL); + + // matrix_intter = K.dot(h_tm1, recurrent_kernel) + bias -> buf1 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (h_tm1, c->recurrent_weights->p_data, cell->units, + cell->units*3, c->bias_shift + 8, c->oshift_hw, recurrent_bias, buf[1], NULL); + + print_variable_q15(in_q15_buf, "input", layer->in->tensor->q_dec[0]+8, cell->feature_size); + print_variable_q15(buf[0], "matrix_x", c->q_dec_z+8, cell->units*3); + print_variable_q15(buf[1], "matrix_recurrent", c->q_dec_z+8, cell->units*3); + + // split to each gate + x_z = buf[0]; + x_r = buf[0] + cell->units; + x_h = buf[0] + cell->units*2; + recurrent_z = buf[1]; + recurrent_r = buf[1] + cell->units; + recurrent_h = buf[1] + cell->units*2; + // buffers + temp[0] = buf[2]; + temp[1] = buf[2] + cell->units; + temp[2] = buf[2] + cell->units*2; + + // z = nn.sigmoid(x_z + recurrent_z) + // 1. z1 = x_z + recurrent_z ---> temp[0] + local_add_q15(x_z, recurrent_z, temp[0], 0, cell->units); + // 2. z = sigmoid(z1) + local_sigmoid_q15(temp[0], cell->units, act_int_bit); + print_variable_q15(temp[0], "z", 15, cell->units); + + // r = nn.sigmoid(x_r + recurrent_r) + // 1. r1 = x_r + recurrent_r ---> temp[1] + local_add_q15(x_r, recurrent_r, temp[1], 0, cell->units); + // 2. r = sigmoid(r1) + local_sigmoid_q15(temp[1], cell->units, act_int_bit); + print_variable_q15(temp[1], "r", 15, cell->units); + + // hh = nn.tanh(x_h + r * recurrent_h) + // 1. hh1 = r * recurrent_h ---> temp[2] + local_mult_q15(temp[1], recurrent_h, temp[2], 15, cell->units); + // 2. hh2 = x_h + h1 ---> temp[1] + local_add_q15(x_h, temp[2], temp[1], 0, cell->units); + // 3. hh = tanh(h2) ---> temp[1] + local_tanh_q15(temp[1], cell->units, act_int_bit); + print_variable_q15(temp[1], "hh", 15, cell->units); + + // h = z * h_tm1 + (1 - z) * hh + // 1. 
h1 = z*h_tm1 ---> temp[2] + local_mult_q15(temp[0], h_tm1, temp[2], 15, cell->units); + print_variable_q15( temp[2], "h1", 15, cell->units); + // 2. h2 = 1 - z ---> h_t state buff + local_1_minor_z_q15(temp[0], h_t, 15, cell->units); + print_variable_q15( h_t, "h2", 15, cell->units); + // 3. h3 = h2 * hh ---> temp[0] + local_mult_q15(h_t, temp[1], temp[0], 15, cell->units); + print_variable_q15( temp[0], "h3", 15, cell->units); + // h = h1 + h3 + local_add_q15(temp[2], temp[0], h_t, 0, cell->units); + print_variable_q15(h_t, "h", 15, cell->units); + + // finally, copy and convert state to output + local_q15_to_q7(h_t, cell->out_data, 8, cell->units); + return NN_SUCCESS; +} +#endif diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_input.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_input.c new file mode 100644 index 000000000..f1fc3b9c9 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_input.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_input.h" + +nnom_layer_t *input_s(const nnom_io_config_t* config) +{ + nnom_io_layer_t *layer; + nnom_layer_io_t *in, *out; + // apply a block memory for all the sub handles. + layer = nnom_mem(sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_INPUT; + layer->super.run = input_run; + layer->super.build = input_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + /* + // some other layers (Conv, pooling) are not supporting 12 d input, we still expand the 1,2 dimension to 3 + // test -> native support 1,2,3 D input. + layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, config->tensor->num_dim, tensor_get_num_channel(config->tensor)); + tensor_cpy_attr(layer->super.in->tensor, config->tensor); + layer->buf = config->tensor->p_data; + layer->dec_bit = config->tensor->q_dec[0]; + */ + + // set parameters + if(config->tensor->num_dim == 1) // test for 1d input, expend h = 1 + layer->shape = shape(1, 1, config->tensor->dim[0]); + else if (config->tensor->num_dim == 2) // test for 1d input, expend h = 1 + layer->shape = shape(1, config->tensor->dim[0], config->tensor->dim[1]); + else + layer->shape = shape(config->tensor->dim[0], config->tensor->dim[1], config->tensor->dim[2]); + layer->buf = config->tensor->p_data; + layer->dec_bit = config->tensor->q_dec[0]; + + // experimental: fixed input dim to 3 + // input normally dont have a tensor, so we create one to store the initial data. 
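+ // (e.g. a 1-D input of 128 features is stored as (1, 1, 128) in HWC, and a 2-D 28x28 input as (1, 28, 28), following the shape expansion above)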
+ nnom_shape_data_t dim[3] = {layer->shape.h, layer->shape.w, layer->shape.c}; + layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 3, tensor_get_num_channel(config->tensor)); + tensor_set_attr_v(layer->super.in->tensor, layer->dec_bit, 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8); + return (nnom_layer_t *)layer; +} + +nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf) +{ + nnom_io_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + layer = nnom_mem(sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_INPUT; + layer->super.run = input_run; + layer->super.build = input_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + // set parameters + layer->shape = input_shape; + layer->buf = p_buf; + layer->dec_bit = 7; + + // experimental: fixed input dim to 3 + // input normally dont have a tensor, so we create one to store the initial data. + nnom_shape_data_t dim[3] = { input_shape.h, input_shape.w, input_shape.c }; + layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 3, input_shape.c); + tensor_set_attr_v(layer->super.in->tensor, layer->dec_bit, 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8); + return (nnom_layer_t *)layer; +} + +nnom_status_t input_build(nnom_layer_t* layer) +{ + // the input tensor of inputlayer has assigned previously + + // output tensor + // 1. allocate a new tensor for output + // 2. set the same dim, qfmt to the new tensor. + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // now this build has passed the input tensors (shapes, formats) to the new tensors. 
+ return NN_SUCCESS; +} + + +nnom_status_t input_run(nnom_layer_t *layer) +{ + nnom_io_layer_t *cl = (nnom_io_layer_t *)layer; +#ifdef NNOM_USING_CHW + if(layer->in->tensor->num_dim == 3) + { + nnom_3d_shape_t shape = {layer->in->tensor->dim[0], layer->in->tensor->dim[1], layer->in->tensor->dim[2]}; + hwc2chw_q7(shape, cl->buf, layer->in->tensor->p_data); + } + else if (layer->in->tensor->num_dim == 2) + { + nnom_3d_shape_t shape = {1, layer->in->tensor->dim[0], layer->in->tensor->dim[1]}; + hwc2chw_q7(shape, cl->buf, layer->in->tensor->p_data); + } + else +#endif + nnom_memcpy(layer->in->tensor->p_data, cl->buf, tensor_size(layer->in->tensor)); + + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lambda.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lambda.c new file mode 100644 index 000000000..31e9c7c5e --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lambda.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_lambda.h" + +nnom_layer_t *lambda_s(const nnom_lambda_config_t * config) +{ + nnom_lambda_layer_t *cl = (nnom_lambda_layer_t *)Lambda( + config->run_func_name, + config->build_func_name, + config->free_func_name, + config->parameters); + if(cl) + cl->super.config = (void*) config; + return (nnom_layer_t *)cl; +} + +// TODO: extended to multiple IO layer +nnom_layer_t *Lambda(nnom_status_t (*run)(nnom_layer_t *), + nnom_status_t (*build)(nnom_layer_t *), + nnom_status_t (*free)(nnom_layer_t *), + void *parameters) +{ + nnom_lambda_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_lambda_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set buf type. + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + + // set io modules to the layer + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + // layer type + layer->super.type = NNOM_LAMBDA; + + // user parameters + layer->parameters = parameters; + + // free method + layer->super.free = free; + + // output shape method. pass NULL in will use the default outshape method, which set the output shape same as input shape. + if (build == NULL) + layer->super.build = default_build; + else + layer->super.build = build; + // run method. default_run() will simply copy data from input tensor to output tensor. 
+ if(run == NULL) + layer->super.run = default_run; + else + layer->super.run = run; + + return (nnom_layer_t *)layer; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lstm_cell.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lstm_cell.c new file mode 100644 index 000000000..ed4a120b4 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lstm_cell.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-24 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_lstm_cell.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +// LSTM RNN +// unit = output shape +// type of activation +nnom_rnn_cell_t *lstm_cell_s(const nnom_lstm_cell_config_t* config) +{ + nnom_lstm_cell_t *cell; + cell = nnom_mem(sizeof(nnom_lstm_cell_t)); + if (cell == NULL) + return NULL; + // set methods + cell->super.run = lstm_cell_q7_q15_run; + cell->super.build = lstm_cell_q7_q15_build; + cell->super.free = lstm_cell_free; + cell->super.config = (void*) config; + cell->super.units = config->units; + cell->super.type = NNOM_LSTM_CELL; + + // set parameters + cell->bias = config->bias; + cell->weights = config->weights; + cell->recurrent_weights = config->recurrent_weights; + + // q format for intermediate calculation + cell->q_dec_c = config->q_dec_c; + cell->q_dec_h = config->q_dec_h; + cell->q_dec_z = config->q_dec_z; + + return (nnom_rnn_cell_t *)cell; +} + +nnom_status_t lstm_cell_free(nnom_rnn_cell_t* cell) +{ + return NN_SUCCESS; +} + +// keras implementation as below. +/* + def step(cell_inputs, cell_states): + """Step function that will be used by Keras RNN backend.""" + h_tm1 = cell_states[0] # previous memory state + c_tm1 = cell_states[1] # previous carry state + + z = K.dot(cell_inputs, kernel) -> q_iw + z += K.dot(h_tm1, recurrent_kernel) -> q_hw + z = K.bias_add(z, bias) + + z0, z1, z2, z3 = array_ops.split(z, 4, axis=1) + + i = nn.sigmoid(z0) + f = nn.sigmoid(z1) + c = f * c_tm1 + i * nn.tanh(z2) + o = nn.sigmoid(z3) + + h = o * nn.tanh(c) + return h, [h, c] +*/ + + + +// the state buffer and computational buffer shape of the cell +nnom_status_t lstm_cell_q7_q15_build(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_lstm_cell_t *c = (nnom_lstm_cell_t *)cell; + + // calculate output shift for the 2 calculation. + // hw = the product of hidden x weight, iw = the product of input x weight + // due to the addition of them, they must have same q format. + // that is -> c->q_dec_z; + + // for the dots in cell: output shift = input_dec + weight_dec - output_dec + c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - c->q_dec_z; + c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->q_dec_z; + + // bias shift = bias_dec - out_dec + c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0]; + + // state size = one timestamp output size. + cell->state_size = cell->units * 2 * 2; // Q15 + + // // comp buffer size: not required + cell->comp_buf_size = cell->units * 12 * 2 + cell->feature_size * 2; //q15 + input q7->q15 buffer. 
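+ // (buffer breakdown: 12*units q15 values cover the three scratch areas buf0/buf1/buf2 of 4*units entries each used in lstm_cell_q7_q15_run(), plus feature_size q15 values for the q7-to-q15 converted input)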
+ + // finally, calculate the MAC for info (for each timestamp) + cell->macc = cell->feature_size * cell->units *4 // input: feature * state * 4 gates + + cell->units * cell->units *4 // recurrent, state + + cell->units *10; // output_unit * (5 gate + 3 mult + 2 addition) + + return NN_SUCCESS; +} + +// Q7 input output +// Q7 weights +// Q15 states and intermediate buffer +nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_lstm_cell_t* c = (nnom_lstm_cell_t*) cell; + int act_int_bit = 7 - c->q_dec_z; + + // state buffer + // low |-- hidden --|-- carry --| high + q15_t* h_tm1 = (q15_t*)cell->in_state; + q15_t* c_tm1 = (q15_t*)cell->in_state + cell->units; + q15_t* o_state[2]; + o_state[0] = (q15_t*)cell->out_state; + o_state[1] = (q15_t*)cell->out_state + cell->units; + + // computing buffer + // low |-- buf0 --|-- buf1 --|-- buf2 --|-- input q15 --| + q15_t* z[4]; + q15_t *buf0, *buf1, *buf2, *in_q15_buf; + buf0 = (q15_t*)layer->comp->mem->blk; + buf1 = (q15_t*)layer->comp->mem->blk + cell->units*4; + buf2 = (q15_t*)layer->comp->mem->blk + cell->units*8; + in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*12; + + // input q7 -> q15 + local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size); + + // z1 = K.dot(cell_inputs, kernel) + bias -> buf1 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (in_q15_buf, c->weights->p_data, cell->feature_size, cell->units*4, c->bias_shift + 8, c->oshift_iw, c->bias->p_data, buf1, NULL); + + // z2 = K.dot(h_tm1, recurrent_kernel) -> buf2 + // --- arm version must use bias, so we have to use local implementation + local_fully_connected_mat_q7_vec_q15_opt(h_tm1, c->recurrent_weights->p_data, + cell->units, cell->units*4, 0, c->oshift_hw, NULL, buf2, NULL); + + // z = z1 + z2 -> buf0 + local_add_q15(buf1, buf2, buf0, 0, cell->units*4); + + // split the data to each gate + z[0] = buf0; + z[1] = buf0 + cell->units; + z[2] = buf0 + cell->units*2; + z[3] = buf0 + cell->units*3; + + // i = nn.sigmoid(z0) + local_sigmoid_q15(z[0], cell->units, act_int_bit); + // f = nn.sigmoid(z1) + local_sigmoid_q15(z[1], cell->units, act_int_bit); + // o = nn.sigmoid(z3) + local_sigmoid_q15(z[3], cell->units, act_int_bit); + + /* c = f * c_tm1 + i * nn.tanh(z2) for the step 1-3. */ + // 1. i * tanh(z2) -> buf1 + local_tanh_q15(z[2], cell->units, act_int_bit); + local_mult_q15(z[0], z[2], buf1, 30 - (c->q_dec_c+8), cell->units); + // 2. f * c_tm1 -> o_state[0] + local_mult_q15(z[1], c_tm1, o_state[0], 15, cell->units); + // 3. c = i*tanh + f*c_tm1 -> o_state[1] ** fill the upper state (carry) + local_add_q15(buf1, o_state[0], o_state[1], 0, cell->units); + + /* h = o * nn.tanh(c) -> o_state[0] for the step 1-2 */ + // 1. tanh(c) -> buf2 --- first copy then activate. + nnom_memcpy(buf2, o_state[1], cell->units*2); + local_tanh_q15(buf2, cell->units, 7 - c->q_dec_c); // this int bit is under 8bit + // 2. 
h = o*tanh(c) -> o_state[0] ** fill the lower state (memory, hidden) + local_mult_q15(z[3], buf2, o_state[0], 15, cell->units); + + // copy and shift q15 to q7 ** (copy hidden to output) + local_q15_to_q7(o_state[0], cell->out_data, 8, cell->units); + return NN_SUCCESS; +} + + +// researve for debugging, printing the intermediate products and variables +#if 0 +static void print_variable(q7_t* data,char*name, int dec_bit, int size) +{ + printf("\n"); + printf("%s\n", name); + for(int i = 0; i < size; i++) + { + if(i%8==0) + printf("\n"); + printf("%f\t", (float) data[i] / (1 << dec_bit)); + } + printf("\n"); +} + +static void print_variable_q15(q15_t *data,char*name, int dec_bit, int size) +{ + printf("\n\n"); + printf("%s", name); + for(int i = 0; i < size; i++) + { + if(i%8==0) + printf("\n"); + printf("%f\t", (float) data[i] / (1 << dec_bit)); + } + printf("\n"); +} + + +// Q7 input output +// Q7 weights +// Q15 states and intermediate buffer +nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_rnn_layer_t* cl = (nnom_rnn_layer_t *) layer; + nnom_lstm_cell_t* c = (nnom_lstm_cell_t*) cell; + int act_int_bit = 7 - c->q_dec_z; + + // test + //nnom_memset(cell->in_data, 32, cell->feature_size); + + // state buffer + // low |-- hidden --|-- carry --| high + q15_t* h_tm1 = (q15_t*)cell->in_state; + q15_t* c_tm1 = (q15_t*)cell->in_state + cell->units; + q15_t* o_state[2]; + o_state[0] = (q15_t*)cell->out_state; + o_state[1] = (q15_t*)cell->out_state + cell->units; + + // computing buffer + // low |-- buf0 --|-- buf1 --|-- buf2 --|-- input q15 --| + q15_t* z[4]; + q15_t *buf0, *buf1, *buf2, *in_q15_buf; + buf0 = (q15_t*)layer->comp->mem->blk; + buf1 = (q15_t*)layer->comp->mem->blk + cell->units*4; + buf2 = (q15_t*)layer->comp->mem->blk + cell->units*8; + in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*12; + + // input q7 -> q15 + //local_q7_to_q15_no_shift(cell->in_data, in_q15_buf, cell->feature_size); + local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size); + print_variable_q15(in_q15_buf, "input", layer->in->tensor->q_dec[0] + 8, cell->feature_size); + print_variable_q15(h_tm1, "h_tml", 15, cell->units); + print_variable_q15(c_tm1, "c_tml", c->q_dec_c + 8, cell->units); + + // z1 = K.dot(cell_inputs, kernel) + bias -> buf1 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (in_q15_buf, c->weights->p_data, cell->feature_size, cell->units*4, c->bias_shift + 8, c->oshift_iw, c->bias->p_data, buf1, NULL); + + // z2 = K.dot(h_tm1, recurrent_kernel) -> buf2 + // arm version must use bias, so we have to use local implementation + local_fully_connected_mat_q7_vec_q15_opt(h_tm1, c->recurrent_weights->p_data, + cell->units, cell->units*4, 0, c->oshift_hw, NULL, buf2, NULL); + + // z = z1 + z2 -> buf0 + local_add_q15(buf1, buf2, buf0, 0, cell->units*4); + + print_variable_q15(buf0, "z", c->q_dec_z + 8, cell->units*4); + print_variable_q15(buf1, "z1", c->q_dec_z + 8, cell->units*4); + print_variable_q15(buf2, "z2", c->q_dec_z + 8, cell->units*4); + + // split the data to each gate + z[0] = buf0; + z[1] = buf0 + cell->units; + z[2] = buf0 + cell->units*2; + z[3] = buf0 + cell->units*3; + + // i = nn.sigmoid(z0) + local_sigmoid_q15(z[0], cell->units, act_int_bit); + // f = nn.sigmoid(z1) + local_sigmoid_q15(z[1], cell->units, act_int_bit); + // o = nn.sigmoid(z3) + local_sigmoid_q15(z[3], cell->units, act_int_bit); + + print_variable_q15(z[0], "z[0] - i", 
15, cell->units); + print_variable_q15(z[1], "z[1] - f", 15, cell->units); + print_variable_q15(z[3], "z[3] - o", 15, cell->units); + + /* c = f * c_tm1 + i * nn.tanh(z2) for the step 1-3. */ + // 1. i * tanh(z2) -> buf1 + local_tanh_q15(z[2], cell->units, act_int_bit); + print_variable_q15(z[2], "z[2] - ?", 15, cell->units); + + local_mult_q15(z[0], z[2], buf1, 30 - (c->q_dec_c+8), cell->units); //q0.15 * q0.15 >> (shift) = (q_c + 8) // i am not very sure + print_variable_q15(buf1, "c2: i * tanh(z2) ", c->q_dec_c+8, cell->units); + + // 2. f * c_tm1 -> o_state[0] + local_mult_q15(z[1], c_tm1, o_state[0], 15, cell->units); + print_variable_q15(o_state[0], "c1: f * c_tm1", c->q_dec_c+8, cell->units); + + // 3. c = i*tanh + f*c_tm1 -> o_state[1] ** fill the upper state (carry) + local_add_q15(buf1, o_state[0], o_state[1], 0, cell->units); + print_variable_q15(o_state[1], "c = c1+c2", c->q_dec_c+8, cell->units); + + /* h = o * nn.tanh(c) -> o_state[0] for the step 1-2 */ + // 1. tanh(c) -> buf2 --- first copy then activate. + nnom_memcpy(buf2, o_state[1], cell->units*2); + local_tanh_q15(buf2, cell->units, 7 - c->q_dec_c); // this int bit is under 8bit + print_variable_q15(buf2, "tanh(c)", 15, cell->units); + + // 2. h = o*tanh(c) -> o_state[0] ** fill the lower state (memory, hidden) + local_mult_q15(z[3], buf2, o_state[0], 15, cell->units); + print_variable_q15(o_state[0], "h = o*tanh(c)", 15, cell->units); + + // copy and shift q15 to q7 ** (copy hidden to output) + local_q15_to_q7(o_state[0], cell->out_data, 8, cell->units); + + print_variable(cell->out_data, "q7 output)", 7, cell->units); + + return NN_SUCCESS; +} +#endif diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_matrix.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_matrix.c new file mode 100644 index 000000000..e011ecc0f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_matrix.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_matrix.h" + +// TODO, completely change this file to local version +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_status_t matrix_build(nnom_layer_t *layer); + +nnom_layer_t *add_s(const nnom_matrix_config_t * config) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Add(config->output_shift); + if(cl) + cl->super.config = (void*) config; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *sub_s(const nnom_matrix_config_t * config) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Sub(config->output_shift); + if(cl) + cl->super.config = (void*) config; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *mult_s(const nnom_matrix_config_t * config) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Mult(config->output_shift); + if(cl) + cl->super.config = (void*) config; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *Add(int16_t oshift) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer(); + if (cl == NULL) + return NULL; + // set type in layer parent + cl->super.type = NNOM_ADD; + cl->super.run = add_run; + cl->oshift = oshift; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *Sub(int16_t oshift) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer(); 
+ if (cl == NULL) + return NULL; + // set type in layer parent + cl->super.type = NNOM_SUB; + cl->super.run = sub_run; + cl->oshift = oshift; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *Mult(int16_t oshift) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer(); + if (cl == NULL) + return NULL; + // set type in layer parent + cl->super.type = NNOM_MULT; + cl->super.run = mult_run; + cl->oshift = oshift; + return (nnom_layer_t *)cl; +} + +// init a base layer instance with same shape 1 in 1 out. More IO can be added later +// mainly used by matrix calculation (add, mult, sub) +nnom_layer_t *_same_shape_matrix_layer() +{ + nnom_matrix_layer_t *layer; + nnom_layer_io_t *in, *out; + //nnom_buf_t *comp; + size_t mem_size; + + // apply a block memory for all the sub handles. + mem_size = sizeof(nnom_matrix_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_matrix_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + //comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.build = matrix_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + //comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + //layer->super.comp = comp; + return (nnom_layer_t*)layer; +} + +nnom_status_t matrix_build(nnom_layer_t *layer) +{ + // get the last layer's output as input shape (if more than one) + nnom_layer_io_t *in = layer->in; + while(in) + { + in->tensor = in->hook.io->tensor; + in = in->aux; + } + // output tensor + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR,layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now this build has passed the input tensors (shapes, formats) to the new tensors. 
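+ // (note: all hooked inputs are expected to share the first input's shape and q-format, since add_run/sub_run/mult_run work element-wise over tensor_size(out) values with a single output shift)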
+ return NN_SUCCESS; +} + + +nnom_status_t add_run(nnom_layer_t *layer) +{ + nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer; + nnom_layer_io_t *in = layer->in;; + size_t t_size = tensor_size(layer->out->tensor); + int32_t oshift = cl->oshift; + size_t num_input = nnom_io_length(layer->in); + q7_t *input_mem_blk[MAX_INPUT_LAYER]; + + // if there is only 2 matrix + if(num_input == 2) + { + #ifdef NNOM_USING_CMSIS_NN + if(oshift == 0) + arm_add_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size); + else + #endif + local_add_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size); + } + else + { + for(int i = 0; i < num_input; i++) + { + input_mem_blk[i] = in->tensor->p_data; + in = in->aux; + } + local_multiple_add_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk); + } + + return NN_SUCCESS; +} + +nnom_status_t sub_run(nnom_layer_t *layer) +{ + nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer; + nnom_layer_io_t *in = layer->in; + size_t t_size = tensor_size(layer->out->tensor); + int32_t oshift = cl->oshift; + size_t num_input = nnom_io_length(layer->in); + q7_t *input_mem_blk[MAX_INPUT_LAYER]; + + // if there is only 2 matrix + if(num_input == 2) + { + // the first 2 matrix + #ifdef NNOM_USING_CMSIS_NN + if(oshift == 0) + arm_sub_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size); + else + #endif + local_sub_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size); + } + else + { + for(int i = 0; i < num_input; i++) + { + input_mem_blk[i] = in->tensor->p_data; + in = in->aux; + } + local_multiple_sub_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk); + } + return NN_SUCCESS; +} + +nnom_status_t mult_run(nnom_layer_t *layer) +{ + nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer; + nnom_layer_io_t *in = layer->in; + size_t t_size = tensor_size(layer->out->tensor); + int32_t oshift = cl->oshift; + size_t num_input = nnom_io_length(layer->in); + q7_t *input_mem_blk[MAX_INPUT_LAYER]; + + // if there is only 2 matrix + if(num_input == 2) + { + // the first 2 matrix + #ifdef NNOM_USING_CMSIS_NN + if(oshift == 0) + arm_mult_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size); + else + #endif + local_mult_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size); + } + else + { + for(int i = 0; i < num_input; i++) + { + input_mem_blk[i] = in->tensor->p_data; + in = in->aux; + } + local_multiple_mult_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk); + } + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_maxpool.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_maxpool.c new file mode 100644 index 000000000..fe904bad8 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_maxpool.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_maxpool.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *maxpool_s(const 
nnom_pool_config_t * config) +{ + nnom_layer_t *layer; + + // test, to accomodate 1d and 2d input + if(config->num_dim == 1) + { + layer = MaxPool(kernel(1, config->kernel_size[0]), + stride(1, config->stride_size[0]), + config->padding_type); + } + else + { + layer = MaxPool(kernel(config->kernel_size[0], config->kernel_size[1]), + stride(config->stride_size[0], config->stride_size[1]), + config->padding_type); + } + + if(layer) + layer->config = (void*) config; + return layer; +} + +nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type) +{ + nnom_maxpool_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_maxpool_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_maxpool_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_MAXPOOL; + layer->super.run = maxpool_run; + layer->super.build = maxpool_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + layer->super.comp = comp; + + // set parameters + layer->kernel = k; + layer->stride = s; + layer->padding_type = pad_type; + + // padding + if (layer->padding_type == PADDING_SAME) + { + layer->pad.h = (k.h - 1) / 2; + layer->pad.w = (k.w - 1) / 2; + layer->pad.c = 1; // no meaning + } + else + { + layer->pad.h = 0; + layer->pad.w = 0; + layer->pad.c = 0; + } + return (nnom_layer_t *)layer; +} + +nnom_status_t maxpool_build(nnom_layer_t *layer) +{ + nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + // copy then change later. + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now we set up the tensor shape, always HWC format + if (cl->padding_type == PADDING_SAME) + { + layer->out->tensor->dim[0] = NN_CEILIF(layer->in->tensor->dim[0], cl->stride.h); + layer->out->tensor->dim[1] = NN_CEILIF(layer->in->tensor->dim[1], cl->stride.w); + layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; // channel stays the same + } + else + { + layer->out->tensor->dim[0] = NN_CEILIF(layer->in->tensor->dim[0] - cl->kernel.h + 1, cl->stride.h); + layer->out->tensor->dim[1] = NN_CEILIF(layer->in->tensor->dim[1] - cl->kernel.w + 1, cl->stride.w); + layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; + } + + return NN_SUCCESS; +} + +nnom_status_t maxpool_run(nnom_layer_t *layer) +{ + nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)(layer); + + uint16_t out_x, out_y; + + // if global pooling + if(layer->out->tensor->num_dim == 1) + { + out_x = 1; out_y = 1; + } + else // normal pooling. 
+ { + out_x = layer->out->tensor->dim[1]; //W + out_y = layer->out->tensor->dim[0]; //h + } + +#ifdef NNOM_USING_CHW + local_maxpool_q7_CHW(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + NULL, + layer->out->tensor->p_data); +#else //end of CHW + // HWC + #ifdef NNOM_USING_CMSIS_NN + // 2D, square + if (layer->in->tensor->dim[1] == layer->in->tensor->dim[0] && + layer->out->tensor->dim[1] == layer->out->tensor->dim[0]) + { + arm_maxpool_q7_HWC( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->kernel.w, cl->pad.w, cl->stride.w, + layer->out->tensor->dim[1], + NULL, + layer->out->tensor->p_data); + } + // none square 2D, or 1D + else + #endif + { + // CMSIS-NN does not support none-square pooling, we have to use local implementation + local_maxpool_q7_HWC(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + NULL, + layer->out->tensor->p_data); + } +#endif // CHW/HWC + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_output.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_output.c new file mode 100644 index 000000000..bed1c89cd --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_output.c @@ -0,0 +1,54 @@ + +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_output.h" + +nnom_layer_t *output_s(const nnom_io_config_t* config) +{ + nnom_layer_t *layer = input_s(config); + if(layer) + { + layer->config = (void*) config; + layer->type = NNOM_OUTPUT; + layer->run = output_run; + layer->build = default_build; + } + return layer; +} + +nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf) +{ + // they are acturally the same.. expect the type defined + nnom_layer_t *layer = Input(output_shape, p_buf); + if (layer != NULL) + { + layer->type = NNOM_OUTPUT; + layer->run = output_run; + layer->build = default_build; + } + return layer; +} + +nnom_status_t output_run(nnom_layer_t *layer) +{ + nnom_io_layer_t *cl = (nnom_io_layer_t *)layer; + nnom_memcpy(cl->buf, layer->in->tensor->p_data, tensor_size(layer->out->tensor)); // in->memory -> user memory + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_reshape.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_reshape.c new file mode 100644 index 000000000..1b6ae82f7 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_reshape.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-12-07 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_reshape.h" + + +nnom_layer_t *reshape_s(const nnom_reshape_config_t *config) +{ + nnom_reshape_layer_t *layer; + nnom_layer_io_t *in, *out; + + // allocate a block memory for all the sub handles and shifts. 
+ size_t mem_size = sizeof(nnom_reshape_layer_t) + sizeof(nnom_layer_io_t) * 2 ; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_reshape_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_RESHAPE; + layer->super.run = reshape_run; + layer->super.build = reshape_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; + + // config + //nnom_memcpy(layer->dim, config->dim, config->num_dim * sizeof(nnom_shape_data_t)); + layer->super.config = config; + layer->dim = config->dim; // temporary use the config directly. (not preferable.) + layer->num_dim = config->num_dim; + + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + return (nnom_layer_t *)layer; +} + +nnom_status_t reshape_build(nnom_layer_t *layer) +{ + nnom_reshape_layer_t *cl = (nnom_reshape_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_set_attr(layer->out->tensor, layer->in->tensor->q_dec, layer->in->tensor->q_offset, cl->dim, cl->num_dim, 8); + + return NN_SUCCESS; +} + +nnom_status_t reshape_run(nnom_layer_t *layer) +{ + return NN_SUCCESS; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_rnn.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_rnn.c new file mode 100644 index 000000000..6fe9662e0 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_rnn.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_rnn.h" + +nnom_status_t rnn_build(nnom_layer_t *layer); +nnom_status_t rnn_run(nnom_layer_t *layer); +nnom_status_t rnn_free(nnom_layer_t* layer); + +// RNN +nnom_layer_t *rnn_s(nnom_rnn_cell_t *cell, const nnom_rnn_config_t* config) +{ + nnom_rnn_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_rnn_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_rnn_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_RNN; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + layer->super.comp = comp; + // set run and outshape methods + layer->super.run = rnn_run; + layer->super.build = rnn_build; + layer->super.free = rnn_free; + + // rnn parameters. 
+ layer->return_sequence = config->return_sequence; + layer->stateful = config->stateful; + layer->go_backwards = config->go_backwards; + layer->super.config = (void*)config; + layer->cell = cell; + + // set this layer to the cell + layer->cell->layer = (nnom_layer_t *)layer; + + return (nnom_layer_t *)layer; +} + +nnom_status_t rnn_free(nnom_layer_t* layer) +{ + nnom_rnn_layer_t* cl = (nnom_rnn_layer_t*)layer; + // free cell + if(cl->cell->free) + cl->cell->free(cl->cell); + + // free state buffer + nnom_free(cl->state_buf); + + return NN_SUCCESS; +} + +nnom_status_t rnn_build(nnom_layer_t* layer) +{ + nnom_rnn_layer_t *cl = (nnom_rnn_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // timestamp size + cl->timestamp_size = layer->in->tensor->num_dim > 2 ? layer->in->tensor->dim[1] : layer->in->tensor->dim[0]; + + if(cl->return_sequence) + { + // create new tensor for the output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 2, 0); + // shape: timestamp, units + layer->out->tensor->dim[0] = cl->timestamp_size; + layer->out->tensor->dim[1] = cl->cell->units; + } + else + { + // create new tensor for the output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, 0); + // shape: timestamp, units + layer->out->tensor->dim[0] = cl->cell->units; + } + + // output q format - the output of the available activations are both q0.7. + layer->out->tensor->q_dec[0] = layer->in->tensor->bitwidth==16? 15: 7; + layer->out->tensor->bitwidth = layer->in->tensor->bitwidth; + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // get feature size from input tensor + cl->cell->feature_size = tensor_get_num_channel(layer->in->tensor); // vector (feature) size + + // call cell builder to build the cell + cl->cell->build(cl->cell); + + // get the size of computation buffer? + cl->super.comp->size = cl->cell->comp_buf_size; // size of intermediate buffer required by the cell. + cl->state_buf = nnom_mem(cl->cell->state_size * 2); // allocate state buf for upper/lower state buffer. + if(!cl->state_buf) + return NN_NO_MEMORY; + + // get the computational cost provided by Cell + layer->stat.macc = cl->cell->macc * cl->timestamp_size; + return NN_SUCCESS; +} + +nnom_status_t rnn_run(nnom_layer_t* layer) +{ + nnom_status_t result; + nnom_rnn_layer_t* cl = (nnom_rnn_layer_t*)(layer); + size_t timestamps_size = layer->in->tensor->dim[layer->in->tensor->num_dim-2]; + size_t feature_size = tensor_get_num_channel(layer->in->tensor); // feature size = last dimension. + size_t state_size = cl->cell->state_size; + size_t output_growth; + void* upper_state = (q7_t*)cl->state_buf + state_size; + void* lower_state = (q7_t*)cl->state_buf; + + // reset state buffer if not in stateful + if (!cl->stateful) + nnom_memset(cl->state_buf, 0, state_size * 2); + + // set output data + output_growth = cl->return_sequence ? 
cl->cell->units : 0; + + // run timestamp by timestamp + for (uint32_t round = 0; round < timestamps_size; round++) + { + if(cl->go_backwards) + { + // set input data + cl->cell->in_data = (q7_t*)layer->in->tensor->p_data + feature_size*(timestamps_size - 1 - round); + // set output data + cl->cell->out_data = (q7_t*)layer->out->tensor->p_data + output_growth*(timestamps_size - 1 - round); + } + else + { + // set input data + cl->cell->in_data = (q7_t*)layer->in->tensor->p_data + feature_size*round; + // set output data + cl->cell->out_data = (q7_t*)layer->out->tensor->p_data + output_growth*round; + } + + // switch upper/lower state buffer + if(cl->cell->in_state != lower_state) + { + cl->cell->in_state = lower_state; + cl->cell->out_state = upper_state; + } + else + { + cl->cell->in_state = upper_state; + cl->cell->out_state = lower_state; + } + + // run it + result = cl->cell->run(cl->cell); + if(result != NN_SUCCESS) + return result; + } + + return NN_SUCCESS; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_simple_cell.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_simple_cell.c new file mode 100644 index 000000000..b61acbef3 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_simple_cell.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-21 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_simple_cell.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +// Simple RNN +// unit = output shape +// type of activation +nnom_rnn_cell_t *simple_cell_s(const nnom_simple_cell_config_t* config) +{ + nnom_simple_cell_t *cell; + cell = nnom_mem(sizeof(nnom_simple_cell_t)); + if (cell == NULL) + return NULL; + // set methods + cell->super.run = simple_cell_run; + cell->super.build = simple_cell_build; + cell->super.free = simple_cell_free; + cell->super.config = (void*) config; + cell->super.units = config->units; + cell->super.type = NNOM_SIMPLE_CELL; + + // set parameters + cell->bias = config->bias; + cell->weights = config->weights; + cell->recurrent_weights = config->recurrent_weights; + cell->act_type = config->act_type; + // q format for intermediate products + cell->q_dec_iw = config->q_dec_iw; + cell->q_dec_hw = config->q_dec_hw; + cell->q_dec_h = config->q_dec_h; + + return (nnom_rnn_cell_t *)cell; +} + +nnom_status_t simple_cell_free(nnom_rnn_cell_t* cell) +{ + return NN_SUCCESS; +} + +// the state buffer and computational buffer shape of the cell +nnom_status_t simple_cell_build(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_simple_cell_t *c = (nnom_simple_cell_t *)cell; + nnom_simple_cell_config_t *config = (nnom_simple_cell_config_t *)cell->config; + int q_hw_iw; + + // activation, check if activation is supported + if(config->act_type != ACT_SIGMOID && config->act_type != ACT_TANH) + return NN_ARGUMENT_ERROR; + + // calculate output shift for the 2 calculation. + // hw = the product of hidden x weight, iw = the product of input x weight + // due to the addition of them, they must have same q format. 
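+ // (worked example: with q_dec_iw = 5 and q_dec_hw = 3, q_hw_iw = 3, so both dot products are shifted down to 3 fractional bits before they are summed in simple_cell_run())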
+ q_hw_iw = MIN(c->q_dec_hw, c->q_dec_iw); + + // for the 2 dot in cell: output shift = input_dec + weight_dec - output_dec + c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - q_hw_iw; + c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - q_hw_iw; + + // bias shift = bias_dec - out_dec + c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0]; + + // state size = one timestamp output size. + cell->state_size = cell->units; + + // comp buffer size: not required + cell->comp_buf_size = 0; + + // finally, calculate the MAC for info + cell->macc = cell->feature_size * cell->units // input: feature * state + + cell->units * cell->units; // recurrent, state * output_unit + + return NN_SUCCESS; +} + +// This Simple Cell replicate the Keras's SimpleCell as blow +/* + def call(self, inputs, states, training=None): + prev_output = states[0] if nest.is_sequence(states) else states + + h = K.dot(inputs, self.kernel) + h = K.bias_add(h, self.bias) + + h2 = K.dot(prev_output, self.recurrent_kernel) + output = h + H2 + output = self.activation(output) + + new_state = [output] if nest.is_sequence(states) else output + return output, new_state +*/ + +nnom_status_t simple_cell_run(nnom_rnn_cell_t* cell) +{ + nnom_simple_cell_t* c = (nnom_simple_cell_t*) cell; + int act_int_bit = 7 - MIN(c->q_dec_hw, c->q_dec_iw); + + // in_state x recurrent_weight -> h2 (output buf) + local_dot_q7_opt(cell->in_state, c->recurrent_weights->p_data, cell->units, cell->units, c->oshift_hw, cell->out_data); + // (input x weight) + bias -> h (in_state buf) + local_fully_connected_q7_opt(cell->in_data, c->weights->p_data, + cell->feature_size, cell->units, c->bias_shift, c->oshift_iw, c->bias->p_data, cell->in_state, NULL); + // h + h2 -> (out_state buf) + local_add_q7(cell->in_state, cell->out_data, cell->out_state, 0, cell->units); + + // active(out_state buf) + if(c->act_type == ACT_TANH) + local_tanh_q7(cell->out_state, cell->units, act_int_bit); + //local_hard_tanh_q7(cell->out_state, cell->units, act_int_bit); + else + local_sigmoid_q7(cell->out_state, cell->units, act_int_bit); + //local_hard_sigmoid_q7(cell->out_state, cell->units, act_int_bit); + + // (out_state buf) --copy--> (output buf) + nnom_memcpy(cell->out_data, cell->out_state, cell->units); + + return NN_SUCCESS; +} + + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_softmax.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_softmax.c new file mode 100644 index 000000000..04b009b35 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_softmax.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_softmax.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *softmax_s(const nnom_softmax_config_t * config) +{ + nnom_layer_t * layer = Softmax(); + if(layer) + layer->config = (void*) config; + return layer; +} + +nnom_layer_t *Softmax(void) +{ + nnom_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. 
+ size_t mem_size = sizeof(nnom_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->type = NNOM_SOFTMAX; + layer->run = softmax_run; + layer->build = softmax_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->in = io_init(layer, in); + layer->out = io_init(layer, out); + + return layer; +} + +nnom_status_t softmax_build(nnom_layer_t *layer) +{ + // get the last layer's output as input shape + layer->in->tensor = layer->in->hook.io->tensor; + // output tensor + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + // softmax has fixed output dec bit + layer->out->tensor->q_dec[0] = 7; + return NN_SUCCESS; +} + +nnom_status_t softmax_run(nnom_layer_t *layer) +{ + // looks like the new version cause accuracy drop quite a lot. +// #ifdef NNOM_USING_CMSIS_NN +// // temporary fixed for mutiple dimension input. +// arm_softmax_q7(layer->in->tensor->p_data, tensor_size(layer->out->tensor), layer->out->tensor->p_data); +// #else + local_softmax_q7(layer->in->tensor->p_data, tensor_size(layer->out->tensor), layer->out->tensor->p_data); + //#endif + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_sumpool.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_sumpool.c new file mode 100644 index 000000000..82de147c4 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_sumpool.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_sumpool.h" + +nnom_layer_t *sumpool_s(const nnom_pool_config_t * config) +{ + nnom_sumpool_layer_t *cl; + if(config->num_dim == 1) + { + cl = (nnom_sumpool_layer_t *)SumPool(kernel(1, config->kernel_size[0]), + stride(1, config->stride_size[0]), + config->padding_type); + } + else + { + cl = (nnom_sumpool_layer_t *)SumPool(kernel(config->kernel_size[0], config->kernel_size[1]), + stride(config->stride_size[0], config->stride_size[1]), + config->padding_type); + } + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; // no idea if we need it + } + return (nnom_layer_t *)cl; +} + + +nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type) +{ + nnom_layer_t *layer = MaxPool(k, s, pad_type); + + if (layer != NULL) + { + layer->type = NNOM_SUMPOOL; + layer->run = sumpool_run; + layer->build = sumpool_build; + } + return (nnom_layer_t *)layer; +} + + +nnom_status_t sumpool_build(nnom_layer_t *layer) +{ + // avg pooling share the same output shape, stride, padding setting. + maxpool_build(layer); + + // however, avg pooling require a computational buffer. 
+    layer->comp->size = 4 * tensor_size(layer->out->tensor);
+
+    return NN_SUCCESS;
+}
+
+
+// sum pooling dynamically changes the Q format; in the current version it must be used as the last layer before softmax
+nnom_status_t sumpool_run(nnom_layer_t *layer)
+{
+    nnom_sumpool_layer_t *cl = (nnom_sumpool_layer_t *)(layer);
+    uint16_t out_x, out_y;
+
+    // if global pooling
+    if(layer->out->tensor->num_dim == 1)
+    {
+        out_x = 1; out_y = 1;
+    }
+    else // normal pooling.
+    {
+        out_x = layer->out->tensor->dim[1]; // W
+        out_y = layer->out->tensor->dim[0]; // H
+    }
+
+#ifdef NNOM_USING_CHW
+    local_sumpool_q7_CHW(
+#else
+    local_sumpool_q7_HWC(
+#endif
+            layer->in->tensor->p_data,
+            layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
+            cl->kernel.w, cl->kernel.h,
+            cl->pad.w, cl->pad.h,
+            cl->stride.w, cl->stride.h,
+            out_x, out_y,
+            layer->comp->mem->blk,
+            layer->out->tensor->p_data);
+
+    return NN_SUCCESS;
+}
diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_upsample.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_upsample.c
new file mode 100644
index 000000000..96472a5ab
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_upsample.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-07-23     Jianjia Ma   The first version
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "nnom.h"
+#include "nnom_local.h"
+#include "nnom_layers.h"
+#include "layers/nnom_upsample.h"
+
+nnom_layer_t *upsample_s(const nnom_upsample_config_t *config)
+{
+    nnom_layer_t *layer = UpSample(kernel(config->kernel[0], config->kernel[1]));
+    if(layer)
+        layer->config = (void*) config;
+    return layer;
+}
+
+// up sampling layer
+nnom_layer_t *UpSample(nnom_3d_shape_t kernel)
+{
+    nnom_upsample_layer_t *layer;
+    nnom_layer_io_t *in, *out;
+
+    // allocate a block of memory for all the sub handles.
+    size_t mem_size = sizeof(nnom_upsample_layer_t) + sizeof(nnom_layer_io_t) * 2;
+    layer = nnom_mem(mem_size);
+    if (layer == NULL)
+        return NULL;
+
+    // distribute the memory to sub handles.
+    in = (void *)((uint8_t*)layer + sizeof(nnom_upsample_layer_t));
+    out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
+
+    // set type in layer parent
+    layer->super.type = NNOM_UPSAMPLE;
+    // set buf state
+    in->type = NNOM_TENSOR_BUF_TEMP;
+    out->type = NNOM_TENSOR_BUF_TEMP;
+    // put in & out on the layer.
+    layer->super.in = io_init(layer, in);
+    layer->super.out = io_init(layer, out);
+    // set run and outshape methods
+    layer->super.run = upsample_run;
+    layer->super.build = upsample_build;
+
+    // set parameters
+    layer->kernel = kernel;
+
+    return (nnom_layer_t*)layer;
+}
+
+nnom_status_t upsample_build(nnom_layer_t *layer)
+{
+    nnom_upsample_layer_t* cl = (nnom_upsample_layer_t*)layer;
+
+    // get the last layer's output as input shape
+    layer->in->tensor = layer->in->hook.io->tensor;
+    // output tensor
+    // 1. allocate a new tensor for output
+    // 2. set the same dim, qfmt to the new tensor.
+    layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
+    tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
+
+    // see if the activation will change the q format
+    if(layer->actail)
+        layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
+
+    // enlarge w and h, c stays the same.
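+    // Illustrative example: a 16 x 16 x 8 HWC input with kernel (2, 2) becomes
+    // 32 x 32 x 8; each input pixel is replicated kernel.h x kernel.w times
+    // (nearest-neighbour upsampling).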
+    layer->out->tensor->dim[0] = layer->in->tensor->dim[0] * cl->kernel.h;
+    layer->out->tensor->dim[1] = layer->in->tensor->dim[1] * cl->kernel.w;
+
+    return NN_SUCCESS;
+}
+
+// up sampling, or so-called unpooling
+nnom_status_t upsample_run(nnom_layer_t *layer)
+{
+    nnom_upsample_layer_t *cl = (nnom_upsample_layer_t *)(layer);
+
+#ifdef NNOM_USING_CHW
+    local_up_sampling_q7_CHW(
+#else
+    local_up_sampling_q7_HWC(
+#endif
+            layer->in->tensor->p_data,
+            layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
+            cl->kernel.w, cl->kernel.h,
+            layer->out->tensor->dim[1], layer->out->tensor->dim[0],
+            NULL,
+            layer->out->tensor->p_data);
+    return NN_SUCCESS;
+}
diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_zero_padding.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_zero_padding.c
new file mode 100644
index 000000000..2352e614e
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_zero_padding.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-07-23     Jianjia Ma   The first version
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "nnom.h"
+#include "nnom_local.h"
+#include "nnom_layers.h"
+#include "layers/nnom_zero_padding.h"
+
+nnom_layer_t * zeropadding_s(const nnom_zero_padding_config_t* config)
+{
+    nnom_layer_t *layer = ZeroPadding(config->pad);
+    if(layer)
+        layer->config = (void*) config;
+    return (nnom_layer_t*)layer;
+}
+
+// Zero padding layer
+nnom_layer_t *ZeroPadding(nnom_border_t pad)
+{
+    nnom_zero_padding_layer_t *layer;
+    nnom_layer_io_t *in, *out;
+
+    // allocate a block of memory for all the sub handles.
+    size_t mem_size = sizeof(nnom_zero_padding_layer_t) + sizeof(nnom_layer_io_t) * 2;
+    layer = nnom_mem(mem_size);
+    if (layer == NULL)
+        return NULL;
+
+    // distribute the memory to sub handles.
+    in = (void *)((uint8_t*)layer + sizeof(nnom_zero_padding_layer_t));
+    out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
+
+    // set type in layer parent
+    layer->super.type = NNOM_ZERO_PADDING;
+    // set buf state
+    in->type = NNOM_TENSOR_BUF_TEMP;
+    out->type = NNOM_TENSOR_BUF_TEMP;
+    // put in & out on the layer.
+    layer->super.in = io_init(layer, in);
+    layer->super.out = io_init(layer, out);
+    // set run and outshape methods
+    layer->super.run = zero_padding_run;
+    layer->super.build = zero_padding_build;
+
+    // set parameters
+    layer->pad = pad;
+
+    return (nnom_layer_t*)layer;
+}
+
+nnom_status_t zero_padding_build(nnom_layer_t* layer)
+{
+    nnom_zero_padding_layer_t *cl = (nnom_zero_padding_layer_t *)layer;
+
+    // get the tensor from last layer's output
+    layer->in->tensor = layer->in->hook.io->tensor;
+
+    // create new tensor for output
+    layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
+    // copy then change later.
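+    // tensor_cpy_attr() copies the input tensor's attributes (q format etc.);
+    // only the spatial dims are then overwritten below to add the requested borders.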
+ tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // output shape + layer->out->tensor->dim[1] = layer->in->tensor->dim[1] + cl->pad.left + cl->pad.right; + layer->out->tensor->dim[0] = layer->in->tensor->dim[0] + cl->pad.top + cl->pad.bottom; + layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; + return NN_SUCCESS; +} + +nnom_status_t zero_padding_run(nnom_layer_t * layer) +{ + nnom_zero_padding_layer_t *cl = (nnom_zero_padding_layer_t*)layer; + +#ifdef NNOM_USING_CHW + local_zero_padding_CHW_q7( +#else + local_zero_padding_HWC_q7( +#endif + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->pad.top, + cl->pad.bottom, + cl->pad.left, + cl->pad.right, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0]); + + return NN_SUCCESS; +} +
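+
+/* Usage sketch (illustrative only):
+ *
+ *   nnom_border_t pad = {.top = 1, .bottom = 1, .left = 2, .right = 2};
+ *   nnom_layer_t *pad_layer = ZeroPadding(pad);
+ *
+ * For a 24 x 24 x 3 HWC input, zero_padding_build() above gives a 26 x 28 x 3
+ * output (height + top + bottom, width + left + right), and zero_padding_run()
+ * writes the input into the interior of the zero-filled output buffer.
+ */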