diff --git a/APP_Framework/Framework/knowing/nnom/README.md b/APP_Framework/Framework/knowing/nnom/README.md new file mode 100644 index 000000000..34082b201 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/README.md @@ -0,0 +1,14 @@ +# Neural Network on Microcontroller (NNoM) + +NNoM is a high-level neural network inference library designed specifically for microcontrollers, released under the Apache License 2.0. + +The current version is 0.4.3. More information is available at [NNoM](https://github.com/majianjia/nnom). + +## CMSIS-NN Backend + +[CMSIS-NN/DSP](https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN) is an inference acceleration library for Arm Cortex-M CPUs and can be used as the backend of NNoM for higher performance. + +## Notes + +- The CHW format is incompatible with CMSIS-NN, but it is required when using hardware accelerators such as the KPU in the K210 chip. +- A static memory buffer must be set with `nnom_set_static_buf()` before creating a model; see the usage sketch below. \ No newline at end of file diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_activation.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_activation.h new file mode 100644 index 000000000..7cda07ce3 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_activation.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_ACTIVATION_H__ +#define __NNOM_ACTIVATION_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + + +// activation layer +typedef struct _nnom_activation_layer_t +{ + nnom_layer_t super; + nnom_activation_t *act; +} nnom_activation_layer_t; + + +// activation with fixed q format (tanh and sigmoid) +typedef struct _nnom_activation_fixed_q_t +{ + nnom_activation_t super; + uint8_t dec_bit; +} nnom_activation_fixed_q_t; + +// leaky relu +typedef struct _nnom_activation_leaky_relu_t +{ + nnom_activation_t super; + q7_t alpha; // alpha is represented in q0.7 format (-128 = -1) +} nnom_activation_leaky_relu_t; + +// advanced ReLU (full ReLU) +typedef struct _nnom_activation_adv_relu_t +{ + nnom_activation_t super; + q7_t negative_slope; // negative_slope is represented in q0.7 format (-128 = -1) + float max; // cap of the max value + float threshold; // threshold +} nnom_activation_adv_relu_t; + +// method +nnom_status_t activation_run(nnom_layer_t* layer); +nnom_status_t activation_free(nnom_layer_t *layer); + +// activation delete +void act_delete(nnom_activation_t* act); + +// a direct API on a tensor +nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor); + + +// Layer API +nnom_layer_t *Activation(nnom_activation_t *act); +nnom_layer_t *ReLU(void); +nnom_layer_t *LeakyReLU(float alpha); +nnom_layer_t *AdvReLU(float alpha, float max, float threshold); +nnom_layer_t *Sigmoid(int32_t dec_bit); +nnom_layer_t *TanH(int32_t dec_bit); + +// Activation API.
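As a rough illustration of the static-buffer note in the README above: when `NNOM_USING_STATIC_MEMORY` is enabled, the buffer has to be registered before any model object is created. The sketch below only uses `nnom_set_static_buf()` as declared in `nnom.h` in this patch; the arena size and the `nn_init()` wrapper are illustrative assumptions, not values taken from this patch.

```c
#include "nnom.h"

/* application-owned arena; the required size depends on the model and is a placeholder here */
static uint8_t nnom_arena[16 * 1024];

void nn_init(void)
{
    /* must run before new_model()/model_compile() when NNOM_USING_STATIC_MEMORY is enabled */
    nnom_set_static_buf(nnom_arena, sizeof(nnom_arena));
}
```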
+nnom_activation_t* act_relu(void); +nnom_activation_t* act_leaky_relu(float alpha); +nnom_activation_t* act_adv_relu(float negative_slope, float max, float threshold); +nnom_activation_t* act_tanh(int32_t dec_bit); +nnom_activation_t* act_sigmoid(int32_t dec_bit); +nnom_activation_t* act_hard_tanh(int32_t dec_bit); +nnom_activation_t* act_hard_sigmoid(int32_t dec_bit); + +// utils +int32_t act_get_dec_bit(nnom_activation_type_t type, int32_t dec_bit); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_ACTIVATION_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_avgpool.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_avgpool.h new file mode 100644 index 000000000..6f8354630 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_avgpool.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_AVGPOOL_H__ +#define __NNOM_AVGPOOL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_maxpool.h" + +// Avg Pooling +typedef nnom_maxpool_layer_t nnom_avgpool_layer_t; + +// method +nnom_status_t avgpooling_build(nnom_layer_t *layer); +nnom_status_t avgpool_run(nnom_layer_t *layer); + +// API +nnom_layer_t *avgpool_s(const nnom_pool_config_t * config); +nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_AVGPOOL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_baselayer.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_baselayer.h new file mode 100644 index 000000000..940bce578 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_baselayer.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_BASELAYER_H__ +#define __NNOM_BASELAYER_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_input.h" + +// method +nnom_status_t default_build(nnom_layer_t *layer); +nnom_status_t default_run(nnom_layer_t *layer); + +// API +nnom_layer_t *baselayer_s(const nnom_layer_config_t * config); +nnom_layer_t *BaseLayer(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_BASELAYER_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_concat.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_concat.h new file mode 100644 index 000000000..d47b26365 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_concat.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_CONCAT_H__ +#define __NNOM_CONCAT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// concatenate layer +typedef struct _nnom_concat_layer +{ + 
nnom_layer_t super; + int8_t axis; +} nnom_concat_layer_t; + +typedef struct _nnom_concat_config_t +{ + nnom_layer_config_t super; + int8_t axis; +} nnom_concat_config_t; + +// method +nnom_status_t concat_build(nnom_layer_t *layer); +nnom_status_t concat_run(nnom_layer_t *layer); + +// API +nnom_layer_t *concat_s(const nnom_concat_config_t *config); +nnom_layer_t *Concat(int8_t axis); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_CONCAT_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d.h new file mode 100644 index 000000000..2b6efb198 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_CONV2D_H__ +#define __NNOM_CONV2D_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// child layers parameters +typedef struct _nnom_conv2d_layer_t +{ + nnom_layer_t super; + nnom_3d_shape_t kernel; + nnom_3d_shape_t stride; + nnom_3d_shape_t pad; + nnom_3d_shape_t dilation; + nnom_padding_t padding_type; + uint32_t filter_mult; // filter size (for conv) or multilplier (for depthwise) + + nnom_tensor_t *weight; + nnom_tensor_t *bias; + + // test + nnom_qformat_param_t * output_rshift; + nnom_qformat_param_t * bias_lshift; +} nnom_conv2d_layer_t; + +// a machine interface for configuration +typedef struct _nnom_conv2d_config_t +{ + nnom_layer_config_t super; + nnom_qtype_t qtype; //quantisation type(per channel or per layer) + nnom_tensor_t *weight; + nnom_tensor_t *bias; + nnom_qformat_param_t *output_shift; + nnom_qformat_param_t *bias_shift; + uint32_t filter_size; + int8_t kernel_size[2]; + int8_t stride_size[2]; + int8_t padding_size[2]; + int8_t dilation_size[2]; + nnom_padding_t padding_type; +} nnom_conv2d_config_t; + +// method +nnom_status_t conv2d_run(nnom_layer_t *layer); +nnom_status_t conv2d_build(nnom_layer_t *layer); +nnom_status_t conv2d_free(nnom_layer_t *layer); + +// utils +uint32_t conv_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation); + +// API +nnom_layer_t *conv2d_s(const nnom_conv2d_config_t *config); +nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_CONV2D_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d_trans.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d_trans.h new file mode 100644 index 000000000..26249f3d9 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_conv2d_trans.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-30 Jianjia Ma The first version + */ + +#ifndef __NNOM_DECONV2D_H__ +#define __NNOM_DECONV2D_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" +#include "layers/nnom_conv2d.h" + +// child layers 
parameters +typedef nnom_conv2d_layer_t nnom_conv2d_trans_layer_t; + +typedef nnom_conv2d_config_t nnom_conv2d_trans_config_t; + +// method +nnom_status_t conv2d_trans_run(nnom_layer_t *layer); +nnom_status_t conv2d_trans_build(nnom_layer_t *layer); + +// utils +uint32_t conv_trans_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation); + +// API +nnom_layer_t *conv2d_trans_s(const nnom_conv2d_config_t *config); +nnom_layer_t *Conv2DTrans(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_DECONV2D_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_cropping.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_cropping.h new file mode 100644 index 000000000..252357481 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_cropping.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_CROPPING_H__ +#define __NNOM_CROPPING_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_zero_padding.h" + +// Cropping, same as zeropadding +typedef nnom_zero_padding_layer_t nnom_cropping_layer_t; + +typedef nnom_zero_padding_config_t nnom_cropping_config_t; + +// method +nnom_status_t cropping_build(nnom_layer_t *layer); +nnom_status_t cropping_run(nnom_layer_t *layer); + +// API +nnom_layer_t * cropping_s(const nnom_cropping_config_t *config); +nnom_layer_t *Cropping(nnom_border_t pad); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_CROPPING_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dense.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dense.h new file mode 100644 index 000000000..a0504a317 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dense.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_DENSE_H__ +#define __NNOM_DENSE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +typedef struct _nnom_dense_layer_t +{ + nnom_layer_t super; + size_t output_unit; + nnom_tensor_t *weight; + nnom_tensor_t *bias; + nnom_qformat_param_t *output_rshift; + nnom_qformat_param_t *bias_lshift; +} nnom_dense_layer_t; + +// a machine interface for configuration +typedef struct _nnom_dense_config_t +{ + nnom_layer_config_t super; + nnom_qtype_t qtype; //quantisation type(per channel or per layer) + nnom_tensor_t *weight; + nnom_tensor_t *bias; + nnom_qformat_param_t *output_shift; + nnom_qformat_param_t *bias_shift; +} nnom_dense_config_t; + +// method +nnom_status_t dense_free(nnom_layer_t *layer); +nnom_status_t dense_build(nnom_layer_t *layer); +nnom_status_t dense_run(nnom_layer_t *layer); + +// API +nnom_layer_t *dense_s(const nnom_dense_config_t *config); +nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b); + 
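For context, a minimal sketch of how `Dense()` and the other layer constructors in this patch are typically combined through the sequential model API declared in `nnom.h`. The weight/bias data, shift values, and buffer sizes below are illustrative placeholders, not values from this patch.

```c
#include "nnom.h"

/* quantised parameters would normally come from a generated weights file; zeroed dummies here */
static const int8_t w1_data[10 * 16], b1_data[16];
static const nnom_weight_t w1 = { .p_value = w1_data, .shift = 3 };
static const nnom_bias_t   b1 = { .p_value = b1_data, .shift = 3 };

static int8_t input_buf[10], output_buf[16];

void build_and_run(void)
{
    nnom_model_t *model = new_model(NULL);                  /* create a model instance */
    model->add(model, Input(shape(1, 1, 10), input_buf));   /* raw q7 input buffer */
    model->add(model, Dense(16, &w1, &b1));                 /* fully connected layer */
    model->add(model, ReLU());
    model->add(model, Output(shape(1, 1, 16), output_buf));
    sequencial_compile(model);                              /* compile as a sequential model */
    model_run(model);                                       /* run one prediction */
    model_delete(model);
}
```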
+#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_DENSE_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dw_conv2d.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dw_conv2d.h new file mode 100644 index 000000000..5a9b58b25 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_dw_conv2d.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_DW_CONV2D_H__ +#define __NNOM_DW_CONV2D_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_conv2d.h" + +// method +nnom_status_t dw_conv2d_build(nnom_layer_t *layer); +nnom_status_t dw_conv2d_run(nnom_layer_t *layer); + +//API +nnom_layer_t *dw_conv2d_s(const nnom_conv2d_config_t *config); +nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_DW_CONV2D_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_flatten.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_flatten.h new file mode 100644 index 000000000..c77160fca --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_flatten.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_FLATTEN_H__ +#define __NNOM_FLATTEN_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// no special parameters but we need it. 
+typedef struct _nnom_flatten_config_t{ + nnom_layer_config_t super; +} nnom_flatten_config_t; + +// method +nnom_status_t flatten_build(nnom_layer_t *layer); +nnom_status_t flatten_run(nnom_layer_t *layer); + +// API +nnom_layer_t *flatten_s(const nnom_flatten_config_t *config); +nnom_layer_t *Flatten(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_FLATTEN_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_global_pool.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_global_pool.h new file mode 100644 index 000000000..febccb0e8 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_global_pool.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_GLOBAL_POOL_H__ +#define __NNOM_GLOBAL_POOL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_maxpool.h" + +typedef struct _nnom_global_pool_config_t +{ + nnom_layer_config_t super; + int16_t output_shift; +}nnom_global_pool_config_t; + +// method +nnom_status_t global_pool_build(nnom_layer_t *layer); + +// API +nnom_layer_t * global_maxpool_s(const nnom_global_pool_config_t *config); +nnom_layer_t * global_avgpool_s(const nnom_global_pool_config_t *config); +nnom_layer_t * global_sumpool_s(const nnom_global_pool_config_t *config); + +nnom_layer_t *GlobalMaxPool(void); +nnom_layer_t *GlobalAvgPool(void); +nnom_layer_t *GlobalSumPool(void); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_GLOBAL_POOL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_gru_cell.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_gru_cell.h new file mode 100644 index 000000000..8ba459624 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_gru_cell.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-27 Jianjia Ma The first version + */ + +#ifndef __NNOM_GRU_CELL_H__ +#define __NNOM_GRU_CELL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "nnom_rnn.h" +#include "nnom_activation.h" + +typedef struct _nnom_gru_cell_config_t +{ + nnom_layer_config_t super; + nnom_tensor_t *weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t *bias; + nnom_qformat_param_t q_dec_z, q_dec_h; // z, r, h + uint16_t units; +} nnom_gru_cell_config_t; + + +typedef struct _nnom_gru_cell_t +{ + nnom_rnn_cell_t super; + + nnom_tensor_t* weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t* bias; + + // decide later. 
+ // z, r, h + nnom_qformat_param_t q_dec_z, q_dec_h; + nnom_qformat_param_t oshift_iw, oshift_hw, bias_shift; + +} nnom_gru_cell_t; + +// gru +nnom_rnn_cell_t *gru_cell_s(const nnom_gru_cell_config_t* config); + +nnom_status_t gru_cell_free(nnom_rnn_cell_t* cell); +nnom_status_t gru_cell_build(nnom_rnn_cell_t* cell); +nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_GRU_CELL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_input.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_input.h new file mode 100644 index 000000000..42322a61f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_input.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_INPUT_H__ +#define __NNOM_INPUT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// IO layer +typedef struct _nnom_io_layer +{ + nnom_layer_t super; + nnom_3d_shape_t shape; + nnom_qformat_param_t dec_bit; + void *buf; //input or output +} nnom_io_layer_t; + +typedef struct _nnom_io_config_t +{ + nnom_layer_config_t super; + nnom_tensor_t *tensor; +}nnom_io_config_t; + +// method +nnom_status_t input_build(nnom_layer_t *layer); +nnom_status_t input_run(nnom_layer_t *layer); + +// API +nnom_layer_t *input_s(const nnom_io_config_t* config); +nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_INPUT_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lambda.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lambda.h new file mode 100644 index 000000000..80c5e6915 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lambda.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_LAMBDA_H__ +#define __NNOM_LAMBDA_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_input.h" + +// lambda layer +typedef struct _nnom_lambda_layer_t +{ + nnom_layer_t super; + void *parameters; // parameters for lambda +} nnom_lambda_layer_t; + +// lambda layer +typedef struct _nnom_lambda_config_t +{ + nnom_layer_config_t super; + nnom_status_t (*run_func_name)(nnom_layer_t *layer); // run method. required + nnom_status_t (*build_func_name)(nnom_layer_t *layer);// compute output buffer shape. 
can be left null, will call default_build() + nnom_status_t (*free_func_name)(nnom_layer_t *layer); // a callback to free private resources (comp buf not included) can be left null + void *parameters; // parameters for lambda +} nnom_lambda_config_t; + + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_LAMBDA_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lstm_cell.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lstm_cell.h new file mode 100644 index 000000000..f0563fc91 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_lstm_cell.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-24 Jianjia Ma The first version + */ + +#ifndef __NNOM_LSTM_CELL_H__ +#define __NNOM_LSTM_CELL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "nnom_rnn.h" +#include "nnom_activation.h" + +// a machine interface for configuration +typedef struct _nnom_lstm_cell_config_t +{ + nnom_layer_config_t super; + nnom_tensor_t *weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t *bias; + nnom_qformat_param_t q_dec_z, q_dec_h, q_dec_c; // z = iw + hw, c = cell state; h=output and memory + uint16_t units; +} nnom_lstm_cell_config_t; + + +typedef struct _nnom_lstm_cell_t +{ + nnom_rnn_cell_t super; + + nnom_tensor_t* weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t* bias; + + // experimental, + // iw: input x weight + // hw: hidden state x recurrent weight + // h: hidden state (memor) + // c: cell state + nnom_qformat_param_t q_dec_z, q_dec_h, q_dec_c; + nnom_qformat_param_t oshift_iw, oshift_hw, oshift_zc, bias_shift; + +} nnom_lstm_cell_t; + +// LSTM +nnom_rnn_cell_t *lstm_cell_s(const nnom_lstm_cell_config_t* config); + +nnom_status_t lstm_cell_free(nnom_rnn_cell_t* cell); +nnom_status_t lstm_cell_q7_q15_build(nnom_rnn_cell_t* cell); +nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_LSTM_CELL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_matrix.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_matrix.h new file mode 100644 index 000000000..11b775bbe --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_matrix.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_MATRIX_H__ +#define __NNOM_MATRIX_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// the maximum input layer hooked to this layer +#define MAX_INPUT_LAYER 8 + +// matrix layer +typedef struct _nnom_matrix_layer_t +{ + nnom_layer_t super; + int16_t oshift; // output right shift +} nnom_matrix_layer_t; + +typedef struct _nnom_matrix_config_t +{ + nnom_layer_config_t super; + int16_t output_shift; // output right shift +} nnom_matrix_config_t; + +// methods +nnom_layer_t* _same_shape_matrix_layer(void); +nnom_status_t add_run(nnom_layer_t *layer); +nnom_status_t sub_run(nnom_layer_t *layer); +nnom_status_t mult_run(nnom_layer_t *layer); + +// API +nnom_layer_t *add_s(const nnom_matrix_config_t * config); +nnom_layer_t *sub_s(const nnom_matrix_config_t * config); +nnom_layer_t *mult_s(const nnom_matrix_config_t * 
config); +nnom_layer_t *Add(int16_t oshift); +nnom_layer_t *Sub(int16_t oshift); +nnom_layer_t *Mult(int16_t oshift); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_MATRIX_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_maxpool.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_maxpool.h new file mode 100644 index 000000000..690a02d2f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_maxpool.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_MAXPOOL_H__ +#define __NNOM_MAXPOOL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// Max Pooling +typedef struct _nnom_maxpool_layer_t +{ + nnom_layer_t super; + nnom_3d_shape_t kernel; + nnom_3d_shape_t stride; + nnom_3d_shape_t pad; + nnom_padding_t padding_type; + int16_t output_shift; // reserve +} nnom_maxpool_layer_t; + +// a machine interface for configuration +typedef struct _nnom_pool_config_t +{ + nnom_layer_config_t super; + nnom_padding_t padding_type; + int16_t output_shift; + int8_t kernel_size[2]; + int8_t stride_size[2]; + int8_t num_dim; +} nnom_pool_config_t; + +// method +nnom_status_t maxpool_build(nnom_layer_t *layer); +nnom_status_t maxpool_run(nnom_layer_t *layer); + +// API +nnom_layer_t *maxpool_s(const nnom_pool_config_t * config); +nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_MATRIX_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_output.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_output.h new file mode 100644 index 000000000..8e62e22f2 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_output.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_OUTPUT_H__ +#define __NNOM_OUTPUT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_input.h" + +// method +nnom_status_t output_build(nnom_layer_t *layer); +nnom_status_t output_run(nnom_layer_t *layer); + +// API +nnom_layer_t *output_s(const nnom_io_config_t* config); +nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_OUTPUT_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_reshape.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_reshape.h new file mode 100644 index 000000000..fc68c45d1 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_reshape.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-12-07 Jianjia Ma The first version + */ + +#ifndef __NNOM_RESHAPE_H__ +#define __NNOM_RESHAPE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + 
+typedef struct _nnom_reshape_layer_t +{ + nnom_layer_t super; + nnom_shape_data_t* dim; + uint8_t num_dim; + +} nnom_reshape_layer_t; + +typedef struct nnom_reshape_config_t +{ + nnom_layer_config_t super; + nnom_shape_data_t* dim; + uint8_t num_dim; +} nnom_reshape_config_t; + +// method +nnom_status_t reshape_run(nnom_layer_t *layer); +nnom_status_t reshape_build(nnom_layer_t *layer); +nnom_status_t reshape_free(nnom_layer_t *layer); + +// API +nnom_layer_t *reshape_s(const nnom_reshape_config_t *config); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_CONV2D_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_rnn.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_rnn.h new file mode 100644 index 000000000..6a9d6efb6 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_rnn.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_RNN_H__ +#define __NNOM_RNN_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// a machine interface for configuration +typedef struct _nnom_rnn_config_t +{ + nnom_layer_config_t super; + bool return_sequence; + bool stateful; + bool go_backwards; +} nnom_rnn_config_t; + +// RNN cell base type +typedef struct _nnom_rnn_cell_t +{ + nnom_status_t (*run)(struct _nnom_rnn_cell_t* cell); // cell runner + nnom_status_t (*build)(struct _nnom_rnn_cell_t* cell); // cell builder, calculate buffer size, output data size + nnom_status_t (*free)(struct _nnom_rnn_cell_t* cell); // + nnom_layer_t *layer; // pointer to its layer holder + nnom_layer_config_t *config; // config for the cell event it is a layer type + nnom_rnn_cell_type_t type; + + void *in_data; // input data + void *out_data; // output data + void *in_state; // input state data (or hidden state) + void *out_state; // output state data + + size_t comp_buf_size; // the size of temporary buffer. + size_t state_size; // the size of hidden state + uint16_t units; // the output units + uint16_t feature_size; // the input feature size (vector size) + + size_t macc; // stat of MAC count. +} nnom_rnn_cell_t; + +typedef struct _nnom_rnn_layer_t +{ + nnom_layer_t super; + nnom_rnn_cell_t *cell; + void *state_buf; // memory allocated to store state, size = 2 x size of state required by cell. 
+ + uint16_t timestamp_size;// size of timestamp + bool return_sequence; // whether to return the output for each unit (sequence) + bool stateful; // whether the states are kept after one inteference + bool go_backwards; // whether go backwards timestamping +} nnom_rnn_layer_t; + + +// rnn layer +nnom_layer_t *rnn_s(nnom_rnn_cell_t *cell, const nnom_rnn_config_t* config); + +nnom_status_t rnn_run(nnom_layer_t* layer); +nnom_status_t rnn_build(nnom_layer_t* layer); +nnom_status_t rnn_free(nnom_layer_t* layer); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_RNN_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_simple_cell.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_simple_cell.h new file mode 100644 index 000000000..87977ed8f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_simple_cell.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-20 Jianjia Ma The first version + */ + +#ifndef __NNOM_SIMPLE_CELL_H__ +#define __NNOM_SIMPLE_CELL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "nnom_rnn.h" +#include "nnom_activation.h" + + +// This Simple Cell replicate the Keras's SimpleCell as blow +/* + def call(self, inputs, states, training=None): + prev_output = states[0] if nest.is_sequence(states) else states + + h = K.dot(inputs, self.kernel) + h = K.bias_add(h, self.bias) + + output = h + K.dot(prev_output, self.recurrent_kernel) + output = self.activation(output) + + new_state = [output] if nest.is_sequence(states) else output + return output, new_state +*/ + +// a machine interface for configuration +typedef struct _nnom_simple_cell_config_t +{ + nnom_layer_config_t super; + nnom_tensor_t *weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t *bias; + nnom_qformat_param_t q_dec_iw, q_dec_hw, q_dec_h; + nnom_activation_type_t act_type; // type of the activation + uint16_t units; +} nnom_simple_cell_config_t; + + +typedef struct _nnom_simple_cell_t +{ + nnom_rnn_cell_t super; + nnom_activation_type_t act_type; + + nnom_tensor_t* weights; + nnom_tensor_t* recurrent_weights; + nnom_tensor_t* bias; + + // experimental, + // iw: input x weight + // hw: hidden state x recurrent weight + // h: hidden state + nnom_qformat_param_t q_dec_iw, q_dec_hw, q_dec_h; + nnom_qformat_param_t oshift_iw, oshift_hw, bias_shift; + +} nnom_simple_cell_t; + + +// RNN cells +// The shape for RNN input is (batch, timestamp, feature), where batch is always 1. 
+// +// SimpleCell +nnom_rnn_cell_t *simple_cell_s(const nnom_simple_cell_config_t* config); + +nnom_status_t simple_cell_free(nnom_rnn_cell_t* cell); +nnom_status_t simple_cell_build(nnom_rnn_cell_t* cell); +nnom_status_t simple_cell_run(nnom_rnn_cell_t* cell); + + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_SIMPLE_CELL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_softmax.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_softmax.h new file mode 100644 index 000000000..230be3277 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_softmax.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_SOFTMAX_H__ +#define __NNOM_SOFTMAX_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +typedef struct _nnom_softmax_config_t +{ + nnom_layer_config_t super; +} nnom_softmax_config_t; + + +// method +nnom_status_t softmax_run(nnom_layer_t *layer); +nnom_status_t softmax_build(nnom_layer_t *layer); + +// API +nnom_layer_t *softmax_s(const nnom_softmax_config_t * config); +nnom_layer_t *Softmax(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_SOFTMAX_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_sumpool.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_sumpool.h new file mode 100644 index 000000000..927615e82 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_sumpool.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_SUMPOOL_H__ +#define __NNOM_SUMPOOL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +#include "layers/nnom_maxpool.h" + +// Sum Pooling +typedef nnom_maxpool_layer_t nnom_sumpool_layer_t; + +// method +nnom_status_t sumpool_build(nnom_layer_t *layer); +nnom_status_t sumpool_run(nnom_layer_t *layer); + +// API +nnom_layer_t *sumpool_s(const nnom_pool_config_t * config); +nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_SUMPOOL_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_upsample.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_upsample.h new file mode 100644 index 000000000..5db7c9708 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_upsample.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_UPSAMPLE_H__ +#define __NNOM_UPSAMPLE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +// Up Sampling layer (UnPooling) +typedef struct _nnom_upsample_layer_t +{ + nnom_layer_t super; + nnom_3d_shape_t kernel; +} nnom_upsample_layer_t; + +typedef struct _nnom_upsample_config_t +{ + 
nnom_layer_config_t super; + nnom_shape_data_t kernel[2]; +} nnom_upsample_config_t; + +// API +nnom_layer_t *upsample_s(const nnom_upsample_config_t *config); +nnom_layer_t *UpSample(nnom_3d_shape_t kernel); + +// Methods +nnom_status_t upsample_build(nnom_layer_t *layer); +nnom_status_t upsample_run(nnom_layer_t *layer); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_UPSAMPLE_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_zero_padding.h b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_zero_padding.h new file mode 100644 index 000000000..9aefd6d03 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/layers/nnom_zero_padding.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-03 Jianjia Ma The first version + */ + +#ifndef __NNOM_ZERO_PADDING_H__ +#define __NNOM_ZERO_PADDING_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "nnom.h" +#include "nnom_layers.h" +#include "nnom_local.h" +#include "nnom_tensor.h" + +typedef struct _nnom_zero_padding_config_t +{ + nnom_layer_config_t super; + nnom_border_t pad; +} nnom_zero_padding_config_t; + +// zero padding +typedef struct _nnom_zero_padding_layer_t +{ + nnom_layer_t super; + nnom_border_t pad; +} nnom_zero_padding_layer_t; + +// API +nnom_layer_t *zeropadding_s(const nnom_zero_padding_config_t* config); +nnom_layer_t *ZeroPadding(nnom_border_t pad); + +// method +nnom_status_t zero_padding_build(nnom_layer_t *layer); +nnom_status_t zero_padding_run(nnom_layer_t *layer); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_ZERO_PADDING_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom.h b/APP_Framework/Framework/knowing/nnom/inc/nnom.h new file mode 100644 index 000000000..ba802f0e5 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/nnom.h @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + * 2019-02-10 Jianjia Ma Compiler supports dense net connection + */ + +#ifndef __NNOM_H__ +#define __NNOM_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include +#include +#include + +#include "nnom_port.h" + +#define NNOM_ALIGN (sizeof(char*)) // alignment when doing memory ops. Equal to size of pointer in byte. +#define q7_t int8_t +#define q15_t int16_t +#define q31_t int32_t +#define q63_t int64_t + +/* version */ +#define NNOM_MAJORVERSION 0 /**< major version number */ +#define NNOM_SUBVERSION 4 /**< minor version number */ +#define NNOM_REVISION 3 /**< revise version number */ +#define NNOM_VERSION ((NNOM_MAJORVERSION * 10000) + (NNOM_SUBVERSION * 100) + NNOM_REVISION) + +#ifdef ARM_NN_TRUNCATE +#define NNOM_TRUNCATE +#endif + +#ifndef NNOM_TRUNCATE + #define NNOM_ROUND(out_shift) ((0x1 << out_shift) >> 1 ) +#else + #define NNOM_ROUND(out_shift) 0 +#endif + +typedef enum +{ + NN_SUCCESS = 0, /**< No error */ + NN_ARGUMENT_ERROR = -1, /**< One or more arguments are incorrect */ + NN_LENGTH_ERROR = -2, /**< Length of data buffer is incorrect */ + NN_SIZE_MISMATCH = -3, /**< Size of matrices is not compatible with the operation. */ + NN_NANINF = -4, /**< Not-a-number (NaN) or infinity is generated */ + NN_SINGULAR = -5, /**< Generated by matrix inversion if the input matrix is singular and cannot be inverted. 
*/ + NN_TEST_FAILURE = -6, /**< Test Failed */ + NN_NO_MEMORY = -7, + NN_MORE_TODO = -8 +} nnom_status_t; + +typedef enum +{ + NNOM_INVALID = 0, + NNOM_BASE, + NNOM_INPUT, + NNOM_OUTPUT, + NNOM_CONV_2D, + NNOM_DW_CONV_2D, + NNOM_CONV2D_TRANS, + NNOM_BATCHNORM, + NNOM_DENSE, + NNOM_ZERO_PADDING, + NNOM_CROPPING, + NNOM_RNN, + NNOM_ACTIVATION, + NNOM_RELU, + NNOM_LEAKY_RELU, + NNOM_ADV_RELU, + NNOM_SIGMOID, + NNOM_TANH, + NNOM_SOFTMAX, + NNOM_MAXPOOL, + NNOM_GLOBAL_MAXPOOL, + NNOM_AVGPOOL, + NNOM_GLOBAL_AVGPOOL, + NNOM_SUMPOOL, + NNOM_GLOBAL_SUMPOOL, + NNOM_UPSAMPLE, + NNOM_FLATTEN, + NNOM_RESHAPE, + NNOM_LAMBDA, + NNOM_CONCAT, + NNOM_ADD, + NNOM_SUB, + NNOM_MULT, + NNOM_TYPE_MAX + +} nnom_layer_type_t; + +#define DEFUALT_LAYER_NAMES \ + { \ + "Unknown", \ + "Base", \ + "Input", \ + "Output", \ + "Conv2D", \ + "DW_Conv2D", \ + "Conv2DTrsp", \ + "BatchNorm", \ + "Dense", \ + "ZeroPad", \ + "Cropping", \ + "RNN", \ + "Activation", \ + "ReLU", \ + "Leaky_ReLU", \ + "Adv_ReLU", \ + "Sigmoid", \ + "Tanh", \ + "Softmax", \ + "MaxPool", \ + "GL_MaxPool", \ + "AvgPool", \ + "GL_AvgPool", \ + "SumPool", \ + "GL_SumPool", \ + "UpSample", \ + "Flatten", \ + "Reshape", \ + "Lambda", \ + "Concat", \ + "Add", \ + "Sub", \ + "Mult", \ + } +extern const char default_layer_names[][12]; + +// We dont count softmax an activation here, softmax is instanced as a layer +typedef enum +{ + ACT_UNKNOWN = 0, + ACT_RELU, + ACT_LEAKY_RELU, + ACT_ADV_RELU, + ACT_TANH, + ACT_SIGMOID, + ACT_HARD_TANH, + ACT_HARD_SIGMOID +} nnom_activation_type_t; + +#define ACTIVATION_NAMES \ + { \ + "Unknown", \ + "ReLU", \ + "LkyReLU", \ + "AdvReLU", \ + "TanH", \ + "Sigmoid", \ + "HrdTanH", \ + "HrdSigd", \ + } +extern const char default_activation_names[][8]; + +// RNN cell type +typedef enum +{ + NNOM_UNKOWN_CELL = 0, + NNOM_SIMPLE_CELL, + NNOM_GRU_CELL, + NNOM_LSTM_CELL, + NNOM_CELL_TYPE_MAX +} nnom_rnn_cell_type_t; + +#define DEFUALT_CELL_NAMES \ + { \ + "Unknown", \ + "Simple", \ + "GRU", \ + "LSTM", \ + } +extern const char default_cell_names[][8]; + + +// parameters +typedef enum +{ + PADDING_VALID = 0, + PADDING_SAME +} nnom_padding_t; + +#define NNOM_TENSOR_BUF_NULL (0) // This buffer is not in used +#define NNOM_TENSOR_BUF_TEMP (1) // The memory in IO is temporary occupided, can be reused by other layer once the computation is done. +#define NNOM_TENSOR_BUF_RESERVED (2) // the mem is reserve for this layer only (not to be reused by other layer. + +// currently used in compiling. +#define NNOM_BUF_EMPTY (0) +#define NNOM_BUF_FILLED (1) + +// basic types +#define nnom_qformat_param_t int32_t // this should match the backend, need a better way to do it. 
+#define nnom_shape_data_t uint16_t + +typedef struct _nnom_3d_shape_t +{ + nnom_shape_data_t h, w, c; +} nnom_3d_shape_t; + +typedef struct _nnom_border_t +{ + nnom_shape_data_t top, bottom, left, right; +} nnom_border_t; + +// nnom_3d_shape_axis_t type provide the axis[] format access to nnom_3d_shape_t +typedef union { + nnom_3d_shape_t s; + nnom_shape_data_t axis[sizeof(nnom_3d_shape_t) / sizeof(nnom_shape_data_t)]; +} nnom_3d_shape_axis_t; + +// tensor quantisation types +typedef enum +{ + NNOM_QTYPE_PER_TENSOR = 0, + NNOM_QTYPE_PER_AXIS = 1 +} nnom_qtype_t; + +typedef struct _nnom_weights +{ + const void *p_value; + nnom_qformat_param_t shift; +} nnom_weight_t; + +typedef struct _nnom_bias +{ + const void *p_value; + nnom_qformat_param_t shift; +} nnom_bias_t; + +// experimental +typedef struct _nnom_tensor_t +{ + void* p_data; // value + nnom_shape_data_t *dim; // dimension of this tensor + nnom_qformat_param_t *q_dec; // number of decimal bit for Q format (scale) + nnom_qformat_param_t *q_offset; // offset for each channel + nnom_qtype_t qtype; // the quantisation type + uint8_t num_dim; // the number of dimension + uint8_t bitwidth; // the data bit width, only support 8bit now +} nnom_tensor_t; + +// nn wrappers +typedef struct _nnom_layer_t nnom_layer_t; +typedef struct _nnom_layer_io_t nnom_layer_io_t; +typedef struct _nnom_layer_hook_t nnom_layer_hook_t; +typedef struct _nnom_mem_block_t nnom_mem_block_t; + +// activation wrapper +typedef struct _nnom_activation_t nnom_activation_t; + +typedef struct _nnom_buf +{ + nnom_mem_block_t *mem; + size_t size; + uint8_t type; +} nnom_buf_t; + +// a memory block to store pre-assign memories during compiling. then assigned to each tensor after. +struct _nnom_mem_block_t +{ + void *blk; // data block location + size_t size; // the maximum size for this block + uint8_t owners; // how many layers own this block + uint8_t state; // empty? filled? for static nn, currently only used in compiling +}; + +typedef struct _nnom_stat_t +{ + size_t macc; //num. of mac operation + uint32_t time; +} nnom_layer_stat_t; + +struct _nnom_layer_hook_t +{ + nnom_layer_io_t *io; // hooked io + nnom_layer_hook_t *next; // next hook include secondary hooked layer +}; + +struct _nnom_layer_io_t +{ + nnom_layer_hook_t hook; // for example: (layer->out)--hook--(layer->in) + nnom_layer_io_t *aux; // point to auxilary I/O (multiple I/O layer) + nnom_tensor_t *tensor; // experimental + nnom_mem_block_t *mem; // memory blocks handles for compiling only. The memory are now pass by tensor. trying to remove it. + nnom_layer_t *owner; // which layer owns this io. + uint8_t type; +}; + +// structured configuration base type +typedef struct _nnom_layer_config_t +{ + char* name; // the name of the layer prequantiesd model (the model trained by user before converted to nnom) +} nnom_layer_config_t; + +// layers base +struct _nnom_layer_t +{ + nnom_layer_t *shortcut; // shortcut points to the next layer, applied on compiling + + nnom_status_t (*run)(nnom_layer_t *layer); // run method. required + nnom_status_t (*build)(nnom_layer_t *layer); // compute output buffer shape. can be left null, will call default_build() + nnom_status_t (*free)(nnom_layer_t *layer); // a callback to free private resources (comp buf not included) can be left null + nnom_buf_t *comp; // computational buf + nnom_activation_t *actail; // I have an activation, I have a tail, wooo haaaa, act-tail!!! + + nnom_layer_config_t *config; // point to the configuration of the layers. for machine api only. 
+ nnom_layer_type_t type; // layer types + nnom_layer_io_t *in; // IO buff, last*layer, states + nnom_layer_io_t *out; // IO buff, next*layer, states + nnom_layer_stat_t stat; // stats, timing, ops +}; + +// activation base +struct _nnom_activation_t +{ + nnom_status_t (*run)(struct _nnom_activation_t *act); + nnom_tensor_t *tensor; + nnom_activation_type_t type; +}; + +// local static functions when libc is not available +#ifdef NNOM_USING_STATIC_MEMORY + void nnom_set_static_buf(void* buf, size_t size); + void *nnom_malloc(size_t size); + void nnom_free(void* p); +#endif //NNOM_USING_STATIC_BUF + +typedef struct _nnom_model nnom_model_t; + +#include "nnom_tensor.h" +#include "nnom_layers.h" +#include "nnom_utils.h" + +// models, I dont want to make model class as a child of layer class yet +struct _nnom_model +{ + nnom_layer_t *head; + nnom_layer_t *tail; + + // model constructor + nnom_status_t (*add)(struct _nnom_model *m, nnom_layer_t *layer); // has too pass a raw value + nnom_layer_t *(*hook)(nnom_layer_t *curr, nnom_layer_t *last); // create hook between 2 layers' primary IO. + nnom_layer_t *(*merge)(nnom_layer_t *method, nnom_layer_t *in1, nnom_layer_t *in2); // an older interface of merge 2 inputs. + nnom_layer_t *(*mergex)(nnom_layer_t *method, int num, ...); // merge a few layers using mutiple input method (concate, add, ...) + nnom_layer_t *(*active)(nnom_activation_t *act, nnom_layer_t *target_layer); // add the activation to the existing layer's tail + + // callback + nnom_status_t (*layer_callback)(nnom_model_t *m, nnom_layer_t *layer); // layer callback will be called after each layer(after actail). + + // block memory for layers + nnom_mem_block_t blocks[NNOM_BLOCK_NUM]; + + size_t total_ops; + + bool is_inited; // is this structure initialized + bool is_allocated; // is this structure allocated by nnom (not by user) +}; + +#define NNOM_NULL_CHECK(p) \ + if ((p) == NULL) \ + { \ + NNOM_LOG("Error: NULL object.\n"); \ + return NN_ARGUMENT_ERROR; \ + } + + +// utils +size_t nnom_alignto(size_t value, uint32_t alignment); +size_t nnom_io_length(nnom_layer_io_t *io); +size_t nnom_hook_length(nnom_layer_hook_t *hook); + +// memory (malloc + memeset 0) +void *nnom_mem(size_t size); + +// get how much memory has been taken +size_t nnom_mem_stat(void); + +// Model APIs +// create or init a model +nnom_model_t *new_model(nnom_model_t *m); +// compile as sequencial model +nnom_status_t sequencial_compile(nnom_model_t *m); +// compile as functional model +nnom_status_t model_compile(nnom_model_t *m, nnom_layer_t *input, nnom_layer_t *output); +// run a prediction +nnom_status_t model_run(nnom_model_t *m); +// delete model. +void model_delete(nnom_model_t *m); +// check version +nnom_status_t check_model_version(unsigned long model_version); + +// callback, called after each layer has finished the calculation. +// this callback must return NN_SUCCESS for continually run the model. otherwise, model will be returned with the ERROR code. +// this function return NN_LENGTH_ERROR if the callback is already set to other. +nnom_status_t model_set_callback(nnom_model_t *m, nnom_status_t (*layer_callback)(nnom_model_t *m, nnom_layer_t *layer)); +// delete callback. 
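A small hedged example of the per-layer callback described above. The callback name and log text are illustrative; `NNOM_LOG` and `default_layer_names` are the symbols already referenced elsewhere in this patch, and `NNOM_LOG` is assumed here to be printf-compatible (it is provided by `nnom_port.h`, which is not part of this hunk).

```c
/* report each layer after it has run; returning anything other than
 * NN_SUCCESS makes model_run() stop and propagate the error code */
static nnom_status_t layer_trace_cb(nnom_model_t *m, nnom_layer_t *layer)
{
    NNOM_LOG("layer %s done\n", default_layer_names[layer->type]);
    return NN_SUCCESS;
}

/* registered once, e.g. after the model has been compiled:
 *   model_set_callback(model, layer_trace_cb);
 */
```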
+void model_delete_callback(nnom_model_t *m); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom_layers.h b/APP_Framework/Framework/knowing/nnom/inc/nnom_layers.h new file mode 100644 index 000000000..cba44874f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/nnom_layers.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + */ + +#ifndef __NNOM_LAYERS_H__ +#define __NNOM_LAYERS_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include + +#include "nnom.h" + +// properties +nnom_3d_shape_t shape(size_t h, size_t w, size_t c); +nnom_3d_shape_t kernel(size_t h, size_t w); +nnom_3d_shape_t stride(size_t h, size_t w); +nnom_3d_shape_t dilation(size_t h, size_t w); +nnom_border_t border(size_t top, size_t bottom, size_t left, size_t right); +//nnom_qformat_t qformat(int8_t m, int8_t n); +size_t shape_size(nnom_3d_shape_t* s); + +// this function is to add a new IO to current inited IO +// input, the targeted IO that the new IO will be added to +// output , the new IO +nnom_layer_io_t* io_add_aux(nnom_layer_io_t* targeted_io); +nnom_layer_io_t *io_init(void *owner_layer, nnom_layer_io_t *io); + +#define NN_CEILIF(x,y) ((x+y-1)/y) + +#include "layers/nnom_activation.h" +#include "layers/nnom_concat.h" +#include "layers/nnom_conv2d.h" +#include "layers/nnom_cropping.h" +#include "layers/nnom_conv2d_trans.h" +#include "layers/nnom_dense.h" +#include "layers/nnom_dw_conv2d.h" +#include "layers/nnom_flatten.h" +#include "layers/nnom_reshape.h" +#include "layers/nnom_global_pool.h" +#include "layers/nnom_input.h" +#include "layers/nnom_lambda.h" +#include "layers/nnom_matrix.h" +#include "layers/nnom_maxpool.h" +#include "layers/nnom_avgpool.h" +#include "layers/nnom_output.h" +#include "layers/nnom_rnn.h" +#include "layers/nnom_softmax.h" +#include "layers/nnom_sumpool.h" +#include "layers/nnom_upsample.h" +#include "layers/nnom_zero_padding.h" +#include "layers/nnom_rnn.h" +#include "layers/nnom_simple_cell.h" +#include "layers/nnom_lstm_cell.h" +#include "layers/nnom_gru_cell.h" + +// Layer APIs ****** +// (a summary for each individual layer's files) + +// input/output +nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf); +nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf); + +// Pooling +nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad); +nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad); +nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad); +nnom_layer_t *GlobalMaxPool(void); +nnom_layer_t *GlobalAvgPool(void); +nnom_layer_t *GlobalSumPool(void); + +// padding, cropping, upsample +nnom_layer_t *UpSample(nnom_3d_shape_t kernel); +nnom_layer_t *ZeroPadding(nnom_border_t pad); +nnom_layer_t *Cropping(nnom_border_t pad); + +// Activation +nnom_layer_t *Activation(nnom_activation_t *act); +nnom_layer_t *ReLU(void); +nnom_layer_t *LeakyReLU(float alpha); +nnom_layer_t *Softmax(void); +nnom_layer_t *Sigmoid(int32_t dec_bit); // input dec bit +nnom_layer_t *TanH(int32_t dec_bit); // input dec bit + +// Matrix +nnom_layer_t *Add(int16_t oshift); // output shift +nnom_layer_t *Sub(int16_t oshift); // output shift +nnom_layer_t *Mult(int16_t oshift); // output shift + +nnom_layer_t *Flatten(void); +nnom_layer_t *Concat(int8_t 
axis); +// -- NN Constructers -- +// conv2d +nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad, + const nnom_weight_t *w, const nnom_bias_t *b); + +// deconv2d +nnom_layer_t *Conv2DTrans(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad, + const nnom_weight_t *w, const nnom_bias_t *b); + +// depthwise_convolution +nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad, + const nnom_weight_t *w, const nnom_bias_t *b); + +// fully connected, dense +nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b); + + +// Lambda Layers +nnom_layer_t *Lambda(nnom_status_t (*run)(nnom_layer_t *), // run method, required + nnom_status_t (*build)(nnom_layer_t *), // optional, call default_build() if left null + nnom_status_t (*free)(nnom_layer_t *), // not required if no resources needs to be deleted, can be left null. + void *parameters); // user private parameters for run method, left null if not needed. + +// building methods +nnom_status_t default_build(nnom_layer_t* layer); +nnom_status_t input_build(nnom_layer_t* layer); + +nnom_status_t conv2d_build(nnom_layer_t* layer); +nnom_status_t dw_conv2d_build(nnom_layer_t* layer); +nnom_status_t conv2d_trans_build(nnom_layer_t* layer); +nnom_status_t dense_build(nnom_layer_t* layer); +nnom_status_t rnn_build(nnom_layer_t* layer); + +nnom_status_t upsample_build(nnom_layer_t* layer); +nnom_status_t zero_padding_build(nnom_layer_t* layer); +nnom_status_t cropping_build(nnom_layer_t* layer); + +nnom_status_t maxpool_build(nnom_layer_t* layer); +nnom_status_t avgpool_build(nnom_layer_t* layer); +nnom_status_t sumpool_build(nnom_layer_t* layer); +nnom_status_t global_pool_build(nnom_layer_t* layer); + +nnom_status_t flatten_build(nnom_layer_t* layer); +nnom_status_t reshape_build(nnom_layer_t* layer); +nnom_status_t concat_build(nnom_layer_t* layer); + +// run +nnom_status_t input_run(nnom_layer_t* layer); +nnom_status_t output_run(nnom_layer_t* layer); +nnom_status_t flatten_run(nnom_layer_t* layer); +nnom_status_t reshape_run(nnom_layer_t* layer); +nnom_status_t default_run(nnom_layer_t* layer); // simply copy data from input to output + +nnom_status_t dw_conv2d_run(nnom_layer_t* layer); +nnom_status_t conv2d_run(nnom_layer_t* layer); +nnom_status_t conv2d_trans_run(nnom_layer_t* layer); +nnom_status_t dense_run(nnom_layer_t* layer); +nnom_status_t rnn_run(nnom_layer_t* layer); + +nnom_status_t upsample_run(nnom_layer_t* layer); +nnom_status_t zero_padding_run(nnom_layer_t* layer); +nnom_status_t cropping_run(nnom_layer_t* layer); + +nnom_status_t activation_run(nnom_layer_t* layer); +nnom_status_t softmax_run(nnom_layer_t* layer); + +nnom_status_t maxpool_run(nnom_layer_t* layer); +nnom_status_t avgpool_run(nnom_layer_t* layer); +nnom_status_t sumpool_run(nnom_layer_t* layer); + +nnom_status_t concat_run(nnom_layer_t* layer); +nnom_status_t add_run(nnom_layer_t* layer); +nnom_status_t sub_run(nnom_layer_t* layer); +nnom_status_t mult_run(nnom_layer_t* layer); + +// Activation APIs +// Softmax is not considered as activation in NNoM, Softmax is in layer API. 
+nnom_activation_t* act_relu(void); +nnom_activation_t* act_leaky_relu(float alpha); +nnom_activation_t* act_sigmoid(int32_t dec_bit); +nnom_activation_t* act_tanh(int32_t dec_bit); + +// direct API +nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor); + +#ifdef __cplusplus +} +#endif + +#endif /* __NNOM_LAYERS_H__ */ diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom_local.h b/APP_Framework/Framework/knowing/nnom/inc/nnom_local.h new file mode 100644 index 000000000..35845a564 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/inc/nnom_local.h @@ -0,0 +1,974 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Notice: + * Code in this file inlcudes derivative works from CMSIS, which is released under alternative license. + * Please check the LICENSE file for detial. + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + * 2019-03-19 Jianjia Ma Local C implementation partly from CMSIS-NN + */ + +#ifndef __NNOM_LOCAL_H__ +#define __NNOM_LOCAL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include "stdint.h" +#include "nnom_port.h" + +#ifdef ARM_NN_TRUNCATE +#define NNOM_TRUNCATE +#endif + +// SSAT implementation with C code +#ifndef __NNOM_SSAT +static inline int __NNOM_SSAT(int32_t value, int32_t bit) { + int32_t min = -(1<<(bit-1)); + int32_t max = (1<<(bit-1)) - 1; + if (value < min) + return min; + else if (value > max) + return max; + else + return value; +} +#endif + +// USAT implementation with C code +#ifndef __NNOM_USAT +static inline int __NNOM_USAT(int32_t value, int32_t bit) { + int32_t max = (1<<(bit-1)) - 1; + if (value < 0) + return 0; + else if (value > max) + return max; + else + return value; +} +#endif + +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MIN(A, B) ((A) < (B) ? 
(A) : (B)) + + +// Those functions/tables below are partially modifed from CMSIS-NN lib +// https://github.com/ARM-software/CMSIS_5 +// +void local_avepool_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out); + +void local_avepool_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out); + +// modified from CMSIS-NN test_ref +void local_maxpool_q7_HWC(const q7_t * Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t * bufferA, // a buffer for local storage, NULL by now + q7_t * Im_out); + +void local_maxpool_q7_CHW(const q7_t * Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t * bufferA, // a buffer for local storage, NULL by now + q7_t * Im_out); + +void local_sumpool_q7_HWC(const q7_t * Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t 
padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t * bufferA, // a buffer for local storage, size = 4*output_size + q7_t * Im_out); + +void local_sumpool_q7_CHW(const q7_t * Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t * bufferA, // a buffer for local storage, size = 4*output_size + q7_t * Im_out); + +// customised up sample pooling +void local_up_sampling_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // NULL + q7_t *Im_out); + +void local_up_sampling_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // NULL + q7_t *Im_out); + +void local_convolve_HWC_q7_nonsquare(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_convolve_CHW_q7_nonsquare(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const 
uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_conv_trans_HWC_q7_nonsquare(const int8_t * Im_in, + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_depthwise_separable_conv_CHW_q7_nonsquare(const q7_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, 
i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_zero_padding_HWC_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_zero_padding_CHW_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_cropping_HWC_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_cropping_CHW_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_fully_connected_q7_opt(const q7_t * pV, // pointer to vector + const q7_t * pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const 
q7_t * bias, q7_t * pOut, // output operand + q15_t * vec_buffer); + + +void local_fully_connected_q7(const q7_t * pV, // pointer to vector + const q7_t * pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const q7_t * bias, q7_t * pOut, // output operand + q15_t * vec_buffer); + +// matrix dot, +// it takes reorderd weight as input, (see dense layer for detail. this is basiclly a dense opt without bias) +void local_dot_q7_opt(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t out_shift, // amount of right-shift for output + q7_t *pOut); // result buffer + +void local_dot_q7(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t out_shift, // amount of right-shift for output + q7_t *pOut); // output operand) + + + +// softmax +void local_softmax_q7(const q7_t * vec_in, const uint32_t dim_vec, q7_t * p_out); + +// sigmoid +void local_sigmoid_q7(q7_t * data, uint32_t size, int16_t int_width); + +// tanh +void local_tanh_q7(q7_t * data, uint32_t size, int16_t int_width); + +// relu +void local_relu_q7(q7_t * data, uint32_t size); + +// leaky relu +void local_leaky_relu_q7(q7_t *data, q7_t alpha, uint32_t size); + +// alpha in q7 format with dec_bit=7 +// max and threshold has the same Q format with the activation +void local_adv_relu_q7(q7_t *data, q7_t alpha, q7_t max, q7_t threshold, uint32_t size); + +// hard sigmoid, +// y=-1 if x < -2.5 +// y=1 if x > 2.5 +// otherwise y = 0.2 * x + 0.5 (y=0.20315 * x + 0.5) +void local_hard_sigmoid_q7(q7_t *data, uint32_t size, int16_t dec_bit); + +// hard tanh +// y=-1 if x < -1 +// y=1 if x > 1 +// otherwise y = x +void local_hard_tanh_q7(q7_t *data, uint32_t size, int16_t dec_bit); + +// matrix ops +void local_mult_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// add +void local_add_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// sub +void local_sub_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// take multiple blocks (>2) as input +void local_multiple_add_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src); + +void local_multiple_mult_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src); + +void local_multiple_sub_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src); + + +// Below tables credit to CMSIS +// For more info. 
check CMSIS-NN lib +// https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c +static const q7_t nnom_sigmoid_table_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, + 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, + 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, + 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, + 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, + 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, + 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, + 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, + 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, + 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, + 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, + 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + + +static const q7_t nnom_tanh_table_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, + 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, + 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, + 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, + 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, + 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, + 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, + 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, + 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, + 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + + +// ------------ 16bit ops -------------------- + +void local_avepool_q15_HWC(const q15_t *Im_in, // input image + 
const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_avepool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_maxpool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_maxpool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_sumpool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t 
dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q15_t *Im_out); + +void local_sumpool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q15_t *Im_out); + +void local_up_sampling_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + + void local_up_sampling_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out); + +void local_convolve_HWC_q15_nonsquare(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); +void local_convolve_CHW_q15_nonsquare(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // 
kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_conv_trans_HWC_q15_nonsquare(const int8_t * Im_in, + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_depthwise_separable_conv_HWC_q15_nonsquare(const q15_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_depthwise_separable_conv_CHW_q15_nonsquare(const q15_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter 
kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +); + +void local_zero_padding_HWC_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_zero_padding_CHW_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_cropping_HWC_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + +void local_cropping_CHW_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y); // output image dimension y + + +void local_dot_q15(const q15_t *pV, // pointer to vector + const q15_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t out_shift, // amount of right-shift for output + q15_t *pOut); // output operand) + +void local_dot_q15_opt(const q15_t * pV, + const q15_t * pM, + const uint16_t dim_vec, + const 
uint16_t num_of_rows, + const uint16_t out_shift, + q15_t * pOut); + +// original implementation +// this support none bias, the it will perform like a dot. +// set the `bias=NULL` to work +void local_fully_connected_mat_q7_vec_q15(const q15_t * pV, // pointer to vector + const q7_t * pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const q7_t * bias, // bias + q15_t * pOut, // output + q15_t * vec_buffer); // not used but to keep the interface same as the ARM's version + +// work on recorder matrix +// this support none bias, set the bias=NULL to work +void local_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q15_t * pOut, + q15_t * vec_buffer); + +// matrix operation Q15 +void local_multiple_add_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src); + +void local_multiple_mult_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src); + +void local_multiple_sub_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src); + +void local_mult_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// add +void local_add_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// sub +void local_sub_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize); + +// Convert Q7 to Q15 +void local_q7_to_q15_no_shift(const q7_t *src, q15_t *des, uint32_t size); +void local_q7_to_q15(const q7_t *src, q15_t *des, uint32_t size); + +// q15 shift to q7 +void local_q15_to_q7(const q15_t *src, q7_t *des, uint32_t shift, uint32_t size); + +// y = 1 - x +void local_1_minor_z_q15(q15_t *src, q15_t *des, uint16_t dec_bit, uint32_t size); + +void local_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out); +void local_hard_sigmoid_q15(q15_t *data, uint32_t size, int16_t dec_bit); +void local_hard_tanh_q15(q15_t *data, uint32_t size, int16_t dec_bit); +void local_relu_q15(q15_t *data, uint32_t size); +void local_leaky_relu_q15(q15_t *data, q7_t alpha, uint32_t size); +void local_adv_relu_q15(q15_t *data, q7_t negative_slope, q15_t max, q15_t threshold, uint32_t size); +void local_sigmoid_q15(q15_t * data, uint32_t size, uint16_t int_width); +void local_tanh_q15(q15_t * data, uint32_t size, uint16_t int_width); + + +static const q15_t nnom_sigmoid_table_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, + 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb, + 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, + 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, + 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, + 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, + 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, + 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, + 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, + 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, + 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 
0x7f72, + 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, + 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, + 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, + 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, + 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, + 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, + 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, + 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, + 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, + 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, + 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, + 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, + 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, + 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, + 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, + 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, + 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, + 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, + 0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + + +static const q15_t nnom_tanh_table_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, + 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6, + 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, + 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, + 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, + 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, + 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, + 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, + 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, + 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, + 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, + 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, + 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, + 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, + 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, + 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, + 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, + 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, + 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, + 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, + 0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 
0xf803,
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __NNOM_LOCAL_H__ */
diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom_tensor.h b/APP_Framework/Framework/knowing/nnom/inc/nnom_tensor.h
new file mode 100644
index 000000000..6853da868
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/inc/nnom_tensor.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-02-05     Jianjia Ma   The first version
+ * 2019-02-10     Jianjia Ma   Compiler supports dense net connection
+ */
+
+#ifndef __NNOM_TENSOR_H__
+#define __NNOM_TENSOR_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "nnom.h"
+
+
+void delete_tensor(nnom_tensor_t* t);
+nnom_tensor_t* new_tensor(nnom_qtype_t type, uint32_t num_dim, uint32_t num_channel);
+// set tensor attributes by value
+// for tensors quantized with NNOM_QTYPE_PER_TENSOR
+nnom_tensor_t* tensor_set_attr_v(nnom_tensor_t* t,
+        nnom_qformat_param_t dec_bit, nnom_qformat_param_t offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth);
+nnom_tensor_t* tensor_set_attr(nnom_tensor_t* t,
+        nnom_qformat_param_t *dec_bit, nnom_qformat_param_t *offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth);
+nnom_tensor_t* tensor_cpy_attr(nnom_tensor_t* des, nnom_tensor_t* src);
+size_t tensor_get_num_channel(nnom_tensor_t* t);
+size_t tensor_size(nnom_tensor_t* t);
+size_t tensor_size_byte(nnom_tensor_t* t);
+
+// only supports 3D tensors
+// converts the data format from HWC to CHW
+// des: destination (output) tensor, src: source (input) tensor
+void tensor_hwc2chw_q7(nnom_tensor_t* des, nnom_tensor_t* src);
+
+// converts the data format from CHW to HWC
+// des: destination (output) tensor, src: source (input) tensor
+void tensor_chw2hwc_q7(nnom_tensor_t* des, nnom_tensor_t* src);
+
+// deprecated.
+void hwc2chw_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out);
+void chw2hwc_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*__NNOM_TENSOR_H__ */
diff --git a/APP_Framework/Framework/knowing/nnom/inc/nnom_utils.h b/APP_Framework/Framework/knowing/nnom/inc/nnom_utils.h
new file mode 100644
index 000000000..88c5067d3
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/inc/nnom_utils.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-02-05     Jianjia Ma   The first version
+ */
+
+#ifndef __NNOM_UTILS_H__
+#define __NNOM_UTILS_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#include
+#include
+#include
+
+#include "nnom.h"
+
+typedef struct _nnom_predict_t
+{
+    uint16_t *confusion_mat;    // confusion matrix
+    uint32_t *top_k;            // stores the number of predictions at each rank k, e.g. Top-2 = top_k[0]+top_k[1]
+    nnom_model_t *model;        // the model to run
+    int8_t *buf_prediction;     // pointer to the output of the softmax layer (normally the end of the classifier).
+
+    // setting
+    uint32_t label_num;         // number of classes in the classification
+    uint32_t top_k_size;        // number of top-k ranks to record.
+
+    // running
+    uint32_t predict_count;     // how many predictions have been run
+
+    // timing
+    uint32_t t_run_total;       // total running time
+    uint32_t t_predict_start;   // timestamp when the prediction test was started
+    uint32_t t_predict_total;   // total time of the whole test
+} nnom_predict_t;
+
+// create a prediction instance
+// inputs: the model, the buffer pointer to the softmax output (temporary, this can be extracted from the model),
+// the size of the softmax output (the number of labels),
+// and the number of top-k ranks to record.
+nnom_predict_t *prediction_create(nnom_model_t *m, int8_t *buf_prediction, size_t label_num, size_t top_k_size); // currently int8_t
+
+// after new data has been set in the input,
+// feed the data to the prediction
+// input the current true label (range from 0 to the total number of labels - 1)
+// (the current input data should be set by the user manually to the input buffer of the model.)
+// return NN_ARGUMENT_ERROR if a parameter is invalid
+nnom_status_t prediction_run(nnom_predict_t *pre, uint32_t true_label, uint32_t* predict_label, float* prob);
+
+// mark the prediction test as finished
+void prediction_end(nnom_predict_t *pre);
+
+// free all resources
+void prediction_delete(nnom_predict_t *pre);
+
+// print the confusion matrix
+void prediction_matrix(nnom_predict_t *pre);
+
+// print the top-k results
+void prediction_top_k(nnom_predict_t *pre);
+
+// print a summary of the prediction test
+void prediction_summary(nnom_predict_t *pre);
+
+// -------------------------------
+
+// stand-alone prediction API
+// this API tests one set of data and returns the prediction
+// returns the predicted label
+// returns NN_ARGUMENT_ERROR if a parameter is invalid
+nnom_status_t nnom_predict(nnom_model_t *m, uint32_t *label, float *prob);
+
+void model_stat(nnom_model_t *m);
+
+void model_io_format(nnom_model_t *m);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*__NNOM_UTILS_H__ */
diff --git a/APP_Framework/Framework/knowing/nnom/port/nnom_port.h b/APP_Framework/Framework/knowing/nnom/port/nnom_port.h
new file mode 100644
index 000000000..c9105431f
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/port/nnom_port.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-02-05     Jianjia Ma   The first version
+ */
+
+#ifndef __NNOM_PORT_H__
+#define __NNOM_PORT_H__
+
+#include
+#include
+#include
+
+/* use static memory */
+// the buffer must be set using "nnom_set_static_buf()" before creating a model.
+
+/* dynamic memory interfaces */
+/* when libc is not available, implement the memory interfaces below (libc equivalents). */
+#ifndef NNOM_USING_STATIC_MEMORY
+    #define nnom_malloc(n)      malloc(n)
+    #define nnom_free(p)        free(p)
+#endif
+
+/* memory interface */
+/* when libc is not available, implement your equivalent functions here */
+#define nnom_memset(p,v,s)      memset(p,v,s)
+#define nnom_memcpy(dst,src,len) memcpy(dst,src,len)
+
+/* runtime & debug */
+#define nnom_us_get()       0   // return a microsecond timestamp
+#define nnom_ms_get()       0   // return a millisecond timestamp
+#define NNOM_LOG(...)       printf(__VA_ARGS__)
+
+/* NNoM configuration */
+#define NNOM_BLOCK_NUM      (8)     // maximum number of memory blocks; increase it when the compilation log requests more.
+#define DENSE_WEIGHT_OPT    (1)     // set to 1 if the fully-connected layers use the optimized (reordered) weight format.
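+/*
+ * Usage sketch (illustrative only, not part of this port header): when NNOM_USING_STATIC_MEMORY
+ * is defined, a memory pool has to be handed to NNoM before any model is created, as noted above.
+ * The pool name and size below are placeholders, and nnom_model_create()/model_run() stand in for
+ * the usual NNoM model API (the constructor is typically produced by the NNoM code-generation scripts).
+ *
+ *     static uint8_t nnom_pool[16 * 1024];                    // application-chosen pool size
+ *     nnom_set_static_buf(nnom_pool, sizeof(nnom_pool));      // must be called before model creation
+ *     nnom_model_t *model = nnom_model_create();              // build and compile the generated model
+ *     model_run(model);                                       // run one inference
+ */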
+ +#endif + + + diff --git a/APP_Framework/Framework/knowing/nnom/scripts/README.MD b/APP_Framework/Framework/knowing/nnom/scripts/README.MD new file mode 100644 index 000000000..54a62afa7 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/README.MD @@ -0,0 +1,4 @@ +fully_connected_opt_weight_generation.py - is from https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN/Scripts/NNFunctions witch is not a part of NNoM + +Please refer to NNoM documents for its usages. + diff --git a/APP_Framework/Framework/knowing/nnom/scripts/__init__.py b/APP_Framework/Framework/knowing/nnom/scripts/__init__.py new file mode 100644 index 000000000..5bb534f79 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/__init__.py @@ -0,0 +1 @@ +# package diff --git a/APP_Framework/Framework/knowing/nnom/scripts/fully_connected_opt_weight_generation.py b/APP_Framework/Framework/knowing/nnom/scripts/fully_connected_opt_weight_generation.py new file mode 100644 index 000000000..f68382b1f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/fully_connected_opt_weight_generation.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +''' + This file is apart of CMSIS-NN release + https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN/Scripts/NNFunctions +''' + +import numpy as np + +def convert_to_x4_q7_weights(weights): + [r, h, w, c] = weights.shape + weights = np.reshape(weights, (r, h*w*c)) + num_of_rows = r + num_of_cols = h*w*c + new_weights = np.copy(weights) + new_weights = np.reshape(new_weights, (r*h*w*c)) + counter = 0 + for i in range(int(num_of_rows/4)): + # we only need to do the re-ordering for every 4 rows + row_base = 4*i + for j in range(int(num_of_cols/4)): + # for each 4 entries + column_base = 4*j + new_weights[counter] = weights[row_base ][column_base ] + new_weights[counter+1] = weights[row_base+1][column_base ] + new_weights[counter+2] = weights[row_base ][column_base+2] + new_weights[counter+3] = weights[row_base+1][column_base+2] + new_weights[counter+4] = weights[row_base+2][column_base ] + new_weights[counter+5] = weights[row_base+3][column_base ] + new_weights[counter+6] = weights[row_base+2][column_base+2] + new_weights[counter+7] = weights[row_base+3][column_base+2] + + new_weights[counter+8] = weights[row_base ][column_base+1] + new_weights[counter+9] = weights[row_base+1][column_base+1] + new_weights[counter+10] = weights[row_base ][column_base+3] + new_weights[counter+11] = weights[row_base+1][column_base+3] + new_weights[counter+12] = weights[row_base+2][column_base+1] + new_weights[counter+13] = weights[row_base+3][column_base+1] + new_weights[counter+14] = weights[row_base+2][column_base+3] + new_weights[counter+15] = weights[row_base+3][column_base+3] + counter = counter + 16 + # the remaining ones are in order + for j in range((int)(num_of_cols-num_of_cols%4), int(num_of_cols)): + new_weights[counter] = weights[row_base][j] + new_weights[counter+1] = weights[row_base+1][j] + new_weights[counter+2] = weights[row_base+2][j] + new_weights[counter+3] = weights[row_base+3][j] + counter = counter + 4 + return new_weights + +def convert_to_x4_q15_weights(weights): + [r, h, w, c] = weights.shape + weights = np.reshape(weights, (r, h*w*c)) + num_of_rows = r + num_of_cols = h*w*c + new_weights = np.copy(weights) + new_weights = np.reshape(new_weights, (r*h*w*c)) + counter = 0 + for i in range(int(num_of_rows/4)): + # we only need to do the re-ordering for every 4 rows + row_base = 4*i + for j in range(int(num_of_cols/2)): + # for each 2 entries + 
column_base = 2*j + new_weights[counter] = weights[row_base ][column_base ] + new_weights[counter+1] = weights[row_base ][column_base+1] + new_weights[counter+2] = weights[row_base+1][column_base ] + new_weights[counter+3] = weights[row_base+1][column_base+1] + new_weights[counter+4] = weights[row_base+2][column_base ] + new_weights[counter+5] = weights[row_base+2][column_base+1] + new_weights[counter+6] = weights[row_base+3][column_base ] + new_weights[counter+7] = weights[row_base+3][column_base+1] + + counter = counter + 8 + # the remaining ones are in order + for j in range((int)(num_of_cols-num_of_cols%2), int(num_of_cols)): + new_weights[counter] = weights[row_base][j] + new_weights[counter+1] = weights[row_base+1][j] + new_weights[counter+2] = weights[row_base+2][j] + new_weights[counter+3] = weights[row_base+3][j] + counter = counter + 4 + return new_weights + +def convert_q7_q15_weights(weights): + [r, h, w, c] = weights.shape + weights = np.reshape(weights, (r, h*w*c)) + num_of_rows = r + num_of_cols = h*w*c + new_weights = np.copy(weights) + new_weights = np.reshape(new_weights, (r*h*w*c)) + counter = 0 + for i in range(int(num_of_rows/4)): + # we only need to do the re-ordering for every 4 rows + row_base = 4*i + for j in range(int(num_of_cols/2)): + # for each 2 entries + column_base = 2*j + new_weights[counter] = weights[row_base ][column_base ] + new_weights[counter+1] = weights[row_base+1][column_base ] + new_weights[counter+2] = weights[row_base ][column_base+1] + new_weights[counter+3] = weights[row_base+1][column_base+1] + new_weights[counter+4] = weights[row_base+2][column_base ] + new_weights[counter+5] = weights[row_base+3][column_base ] + new_weights[counter+6] = weights[row_base+2][column_base+1] + new_weights[counter+7] = weights[row_base+3][column_base+1] + + counter = counter + 8 + # the remaining ones are in order + for j in range((int)(num_of_cols-num_of_cols%2), int(num_of_cols)): + new_weights[counter] = weights[row_base][j] + new_weights[counter+1] = weights[row_base+1][j] + new_weights[counter+2] = weights[row_base+2][j] + new_weights[counter+3] = weights[row_base+3][j] + counter = counter + 4 + return new_weights + + +if __name__ == "__main__": + # input dimensions + vec_dim = 127 + row_dim = 127 + + weight = np.zeros((row_dim,vec_dim), dtype=int) + + # generate random inputs + for i in range(row_dim): + for j in range(vec_dim): + weight[i][j] = np.random.randint(256)-128 + + weight = np.reshape(weight, (row_dim, vec_dim, 1, 1)) + + outfile = open("../Ref_Implementations/fully_connected_testing_weights.h", "w") + outfile.write("#define IP2_WEIGHT {") + weight.tofile(outfile,sep=",",format="%d") + outfile.write("}\n\n") + + new_weight = convert_to_x4_q7_weights(weight) + outfile.write("#define IP4_WEIGHT {") + new_weight.tofile(outfile,sep=",",format="%d") + outfile.write("}\n\n") + + new_weight = convert_q7_q15_weights(weight) + outfile.write("#define IP4_q7_q15_WEIGHT {") + new_weight.tofile(outfile,sep=",",format="%d") + outfile.write("}\n\n") + + new_weight = convert_to_x4_q15_weights(weight) + outfile.write("#define IP4_WEIGHT_Q15 {") + new_weight.tofile(outfile,sep=",",format="%d") + outfile.write("}\n\n") + + + outfile.close() diff --git a/APP_Framework/Framework/knowing/nnom/scripts/gen_config.py b/APP_Framework/Framework/knowing/nnom/scripts/gen_config.py new file mode 100644 index 000000000..d1b787abd --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/gen_config.py @@ -0,0 +1,561 @@ +''' + Copyright (c) 2018-2020 + Jianjia Ma + 
majianjia@live.com + SPDX-License-Identifier: Apache-2.0 + Change Logs: + Date Author Notes + 2020-05-22 Jianjia Ma The first version +''' +from tensorflow.keras.layers import * +import numpy as np + +def convert_tensor_name(t): + return 'tensor_'+t.name.replace('/', '_').replace(':', '_') + +def to_cstyle(data, integer=True): + #Convert an array to C style basket, not to be used for very large array. size > options['threshold'] will lead to ... + if(integer): + data = np.array(data, dtype=np.int).flatten() + else: + data = np.array(data).flatten() + s = np.array2string(data, separator=',') + s = s.replace("\n","").replace("\r","").replace(' ','') + s = s.replace(',', ', ') + s = s.replace('(', '[').replace(')', ']') + return s.replace('[', '{').replace(']', '}') + +def tensor_shape(tensor, is_io_tensor=False): + # inconsistance of TF1 and TF2 + # get tensor shape without None or ? + try: + shape = tensor.shape.as_list() # tf1 + except: + shape = tensor.get_shape().as_list() # tf2 + if(shape[0] == None or is_io_tensor): + shape = shape[1:] + else: + shape = shape + # for rnn input with timestamp = None, need a better implementation + for i in range(len(shape)): + shape[i] = shape[i] if shape[i] is not None else 1 + return shape + +def gen_base_config(layer): + config = '{.name = "%s"}' % (layer.name) + return config + +def gen_values(var_name, var, size='', dtype='const int8_t'): + s = ' [] = ;\n' + s = s.replace('', var_name).replace('', var).replace('', size).replace('', dtype) + return s + +# generate tensor by the tensor config +def gen_tensor(tensor, dec_bits, tensor_value='NULL', per_axis=False, is_io_tensor=False): + config = ''' +const nnom_shape_data_t _dim[] = ; +const nnom_qformat_param_t _dec[] = ; +const nnom_qformat_param_t _offset[] = ; +const nnom_tensor_t = { + .p_data = (void*), + .dim = (nnom_shape_data_t*)_dim, + .q_dec = (nnom_qformat_param_t*)_dec, + .q_offset = (nnom_qformat_param_t*)_offset, + .qtype = , + .num_dim = , + .bitwidth = +}; +''' + # inconsistance of TF1 and TF2 + shape = tensor_shape(tensor, is_io_tensor) + config = config.replace('', convert_tensor_name(tensor))#.name.replace('/','_').split(':')[0]) #conv2d/kernel:0 + config = config.replace('', '8') + config = config.replace('', tensor_value) + config = config.replace('', to_cstyle(shape)) + config = config.replace('', str(len(shape))) + if(type(dec_bits) == str): + config = config.replace('', dec_bits) + config = config.replace('', to_cstyle([0])) + else: + config = config.replace('', to_cstyle(dec_bits)) + config = config.replace('', to_cstyle([0])) + if(per_axis): + config = config.replace('', 'NNOM_QTYPE_PER_AXIS') + else: + config = config.replace('', 'NNOM_QTYPE_PER_TENSOR') + return config + +# create tensor by directly setting up the value +def gen_create_tensor(tensor_name, shape, dec_bits, tensor_value='NULL', per_axis=False): + config = ''' +const nnom_shape_data_t _dim[] = ; +const nnom_qformat_param_t _dec[] = ; +const nnom_qformat_param_t _offset[] = ; +const nnom_tensor_t = { + .p_data = (void*), + .dim = (nnom_shape_data_t*)_dim, + .q_dec = (nnom_qformat_param_t*)_dec, + .q_offset = (nnom_qformat_param_t*)_offset, + .qtype = , + .num_dim = , + .bitwidth = +}; +''' + config = config.replace('', tensor_name) + config = config.replace('', '8') + config = config.replace('', tensor_value) + config = config.replace('', to_cstyle(shape)) + config = config.replace('', str(len(shape))) + if(type(dec_bits) == str): + config = config.replace('', dec_bits) + config = config.replace('', 
to_cstyle([0])) + else: + config = config.replace('', to_cstyle(dec_bits)) + config = config.replace('', to_cstyle([0])) + if(per_axis): + config = config.replace('', 'NNOM_QTYPE_PER_AXIS') + else: + config = config.replace('', 'NNOM_QTYPE_PER_TENSOR') + return config + +def gen_conv2d_config(layer, output_shifts, bias_shifts): + c = ''' +const nnom_qformat_param_t _output_shift[] = ; +const nnom_qformat_param_t _bias_shift[] = ; +const nnom_conv2d_config_t _config = { + .super = , + .qtype = , + .weight = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .output_shift = (nnom_qformat_param_t *)&_output_shift, + .bias_shift = (nnom_qformat_param_t *)&_bias_shift, + .filter_size = , + .kernel_size = , + .stride_size = , + .padding_size = , + .dilation_size = , + .padding_type = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', "NNOM_QTYPE_PER_TENSOR") + c = c.replace('',convert_tensor_name(layer.weights[0])) + c = c.replace('',convert_tensor_name(layer.weights[1])) + c = c.replace('', output_shifts) + c = c.replace('', bias_shifts) + c = c.replace('', str(layer.filters) if layer.filters is not None else str(layer.depth_multiplier)) # output channel + c = c.replace('', to_cstyle(layer.kernel_size)) + c = c.replace('', to_cstyle(layer.strides)) + c = c.replace('', '{0, 0}') # not using it with keras, defined by padding type instead + c = c.replace('', to_cstyle(layer.dilation_rate)) + c = c.replace('', 'PADDING_'+layer.padding.upper()) + return c + +def gen_conv2d_trans_config(layer, output_shifts, bias_shifts): + c = ''' +const nnom_qformat_param_t _output_shift[] = ; +const nnom_qformat_param_t _bias_shift[] = ; +const nnom_conv2d_trans_config_t _config = { + .super = , + .qtype = , + .weight = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .output_shift = (nnom_qformat_param_t *)&_output_shift, + .bias_shift = (nnom_qformat_param_t *)&_bias_shift, + .filter_size = , + .kernel_size = , + .stride_size = , + .padding_size = , + .dilation_size = , + .padding_type = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', "NNOM_QTYPE_PER_TENSOR") + c = c.replace('',convert_tensor_name(layer.weights[0])) + c = c.replace('',convert_tensor_name(layer.weights[1])) + c = c.replace('', output_shifts) + c = c.replace('', bias_shifts) + c = c.replace('', str(layer.filters)) # output channel + c = c.replace('', to_cstyle(layer.kernel_size)) + c = c.replace('', to_cstyle(layer.strides)) + c = c.replace('', '{0, 0}') # not using it with keras, defined by padding type instead + c = c.replace('', to_cstyle(layer.dilation_rate)) + c = c.replace('', 'PADDING_'+layer.padding.upper()) + return c + +def gen_dense_config(layer, output_shifts, bias_shift): + c = ''' +const nnom_qformat_param_t _output_shift[] = ; +const nnom_qformat_param_t _bias_shift[] = ; +const nnom_dense_config_t _config = { + .super = , + .qtype = , + .weight = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .output_shift = (nnom_qformat_param_t *)&_output_shift, + .bias_shift = (nnom_qformat_param_t *)&_bias_shift +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', "NNOM_QTYPE_PER_TENSOR") + c = c.replace('', convert_tensor_name(layer.weights[0])) + c = c.replace('', convert_tensor_name(layer.weights[1])) + c = c.replace('', output_shifts) + c = c.replace('', bias_shift) + return c + +def gen_io_config(layer, tensor_name): + c = ''' +const nnom_io_config_t _config = { + .super = , + 
.tensor = (nnom_tensor_t*)& +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', tensor_name) + return c + +def gen_output_config(previous_layer, dec_bits, output_num, value_name='nnom_output_data'): #cheat at the moments + c = ''' +const nnom_shape_data_t _dim[] = ; +const nnom_qformat_param_t _dec[] = ; +const nnom_qformat_param_t _offset[] = ; +const nnom_tensor_t = { + .p_data = (void*), + .dim = (nnom_shape_data_t*)_dim, + .q_dec = (nnom_qformat_param_t*)_dec, + .q_offset = (nnom_qformat_param_t*)_offset, + .qtype = , + .num_dim = , + .bitwidth = 8 +}; + +const nnom_io_config_t _config = { + .super = , + .tensor = (nnom_tensor_t*)& +}; +''' + shape = tensor_shape(previous_layer.output, is_io_tensor=True) + + c = c.replace('', 'tensor_output'+str(output_num)) + c = c.replace('', 'output'+str(output_num)) + c = c.replace('', '{.name = "output'+str(output_num)+'"}') # cheating at the moment. + c = c.replace('', value_name) + c = c.replace('', 'NNOM_QTYPE_PER_TENSOR') + c = c.replace('', str(len(shape))) + c = c.replace('', to_cstyle(shape)) + c = c.replace('', '{'+dec_bits+'}') + c = c.replace('', to_cstyle([0])) + return c + + +def gen_pooling_config(layer, output_shifts='0'): + c = ''' +const nnom_pool_config_t _config = { + .super = , + .padding_type = , + .output_shift = , + .kernel_size = , + .stride_size = , + .num_dim = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', 'PADDING_'+layer.padding.upper()) + c = c.replace('', to_cstyle(layer.pool_size)) + c = c.replace('', to_cstyle(layer.strides)) + c = c.replace('', str(len(layer.pool_size))) + c = c.replace('', output_shifts) # not used at the moment + return c + +def gen_gl_pooling_config(layer, output_shifts='0'): + c = ''' +const nnom_global_pool_config_t _config = { + .super = , + .output_shift = , +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', output_shifts) + return c + + + +def gen_matrix_config(layer, output_shift_name='0'): + c = ''' +const nnom_matrix_config_t _config = { + .super = , + .output_shift = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', output_shift_name) # not used at the moment + return c + +def gen_zero_padding_config(layer): + c = ''' +const nnom_zero_padding_config_t _config = { + .super = , + .pad = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + try: + c = c.replace('', to_cstyle(sum(layer.padding, ()))) + except: + pad = ((0, 0), layer.padding) + c = c.replace('', to_cstyle(sum(pad, ()))) + return c + +def gen_cropping_config(layer): + c = ''' +const nnom_cropping_config_t _config = { + .super = , + .pad = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + try: + c = c.replace('', to_cstyle(sum(layer.cropping, ()))) #((top_crop, bottom_crop), (left_crop, right_crop)) + except: + pad = ((0, 0), layer.cropping) + c = c.replace('', to_cstyle(sum(pad, ()))) + return c + +def gen_upsampling_config(layer): + c = ''' +const nnom_upsample_config_t _config = { + .super = , + .kernel = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', to_cstyle(layer.size)) + return c + +def gen_softmax_config(layer): + c = ''' +const nnom_softmax_config_t _config = { + .super = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + return c + +def 
gen_flatten_config(layer): + c = ''' +const nnom_flatten_config_t _config = { + .super = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + return c + +def gen_reshape_config(layer): + c = ''' +const nnom_shape_data_t _targeted_shape[] = ; +const nnom_reshape_config_t _config = { + .super = , + .dim = (nnom_shape_data_t*)_targeted_shape, + .num_dim = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', to_cstyle(layer.output_shape[1:])) + c = c.replace('', str(len(layer.output_shape[1:]))) + return c + +def gen_concat_config(layer): + c = ''' +const nnom_concat_config_t _config = { + .super = , + .axis = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', str(layer.axis)) + return c + +def gen_lambda_config(layer, run_func_name='NULL', build_func_name='NULL', free_func_name='NULL', parameters_name='NULL'): + c = ''' +const nnom_lambda_config_t _config = { + .super = , + .run_func_name = , + .build_func_name = , + .free_func_name = , + .parameters = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', run_func_name) + c = c.replace('', build_func_name) + c = c.replace('', free_func_name) + c = c.replace('', parameters_name) + return c + +def gen_rnn_config(layer): + c = ''' +const nnom_rnn_config_t _config = { + .super = , + .return_sequence = , + .stateful = , + .go_backwards = +}; +''' + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', 'true' if layer.stateful else 'false') + c = c.replace('', 'true' if layer.go_backwards else 'false') + c = c.replace('', 'true' if layer.return_sequences else 'false') + return c + +def gen_simple_cell_config(layer, q_list): + c = ''' +const nnom_simple_cell_config_t _simple_cell_config = { + .super = , + .weights = (nnom_tensor_t*)&, + .recurrent_weights = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .q_dec_iw = , + .q_dec_hw = , + .q_dec_h = , + .act_type = , + .units = +}; +''' + try: + cell_cfg = layer.get_config()['cell']['config'] + except: + cell_cfg = layer.get_config() + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', convert_tensor_name(layer.weights[0])) + c = c.replace('', convert_tensor_name(layer.weights[1])) + c = c.replace('', convert_tensor_name(layer.weights[2])) + c = c.replace('', str(q_list[1])) # the qfmt of input x weight + c = c.replace('', str(q_list[2])) # q of hidden x recurrent weight + c = c.replace('', str(q_list[0])) # output, if act != relu, should be 7 (consider delete it.) + c = c.replace('', 'ACT_' + cell_cfg['activation'].upper()) + c = c.replace('', str(cell_cfg['units'])) + return c + +def gen_lstm_cell_config(layer, q_list): + c = ''' +const nnom_lstm_cell_config_t _lstm_cell_config = { + .super = , + .weights = (nnom_tensor_t*)&, + .recurrent_weights = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .q_dec_z = , + .q_dec_h = , + .q_dec_c = , + .units = +}; +''' + try: + cell_cfg = layer.get_config()['cell']['config'] + except: + cell_cfg = layer.get_config() + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', convert_tensor_name(layer.weights[0])) + c = c.replace('', convert_tensor_name(layer.weights[1])) + c = c.replace('', convert_tensor_name(layer.weights[2])) + c = c.replace('', str(q_list[0])) # output and memory state, (should be q0.7. 
consider delete it) + c = c.replace('', str(q_list[1])) # cell state + c = c.replace('', str(q_list[2])) # input*weight + hidden*weight + bias + c = c.replace('', str(cell_cfg['units'])) + return c + + + +def gen_gru_cell_config(layer, q_list): + c = ''' +const nnom_gru_cell_config_t _gru_cell_config = { + .super = , + .weights = (nnom_tensor_t*)&, + .recurrent_weights = (nnom_tensor_t*)&, + .bias = (nnom_tensor_t*)&, + .q_dec_z = , + .q_dec_h = , + .units = +}; +''' + try: + cell_cfg = layer.get_config()['cell']['config'] + except: + cell_cfg = layer.get_config() + c = c.replace('', layer.name) + c = c.replace('', gen_base_config(layer)) + c = c.replace('', convert_tensor_name(layer.weights[0])) + c = c.replace('', convert_tensor_name(layer.weights[1])) + c = c.replace('', convert_tensor_name(layer.weights[2])) + c = c.replace('', str(q_list[0])) # + c = c.replace('', str(q_list[1])) # + c = c.replace('', str(cell_cfg['units'])) + return c + + +if __name__ == "__main__": + # test only + from tensorflow.keras.models import load_model + model = load_model("../model.h5") + print(gen_tensor(model.layers[1].weights[0], dec_bits=(1, 2, 3, 4, 5))) + print(gen_tensor(model.layers[1].weights[1], dec_bits=(1, 2, 3, 4, 5))) + print(gen_conv2d_config(model.layers[1], (1,2,3), 3)) + + with open("test.h", 'w') as fp: + # fp.write(gen_tensor(model.layers[1].weights[0], dec_bits=(1, 2, 3, 4, 5))) + # fp.write(gen_tensor(model.layers[1].weights[1], dec_bits=(1, 2, 3, 4, 5))) + # fp.write(gen_conv2d_config(model.layers[1], (1,2,3,))) + + fp.write('#include "nnom.h"\n') + + # test all + for layer in model.layers: + if(type(layer) in [Conv2D, Conv1D]): + for w in layer.weights: + fp.write(gen_tensor(w, [3])) + fp.write(gen_conv2d_config(layer, {0}, 2)) + elif(type(layer) in [Dense]): + for w in layer.weights: + fp.write(gen_tensor(w, [3])) + fp.write(gen_dense_config(layer, 2, 2)) + elif(type(layer) in [Input]): + fp.write(gen_io_config(layer, [9,1,1])) + elif(type(layer) in [MaxPooling2D, GlobalMaxPooling2D, AveragePooling2D, GlobalAveragePooling2D]): + fp.write(gen_pooling_config(layer)) + elif(type(layer) in [Multiply, Add, Subtract]): + fp.write(gen_matrix_config(layer)) + elif(type(layer) in [ZeroPadding2D, ZeroPadding1D]): + fp.write(gen_zero_padding_config(layer)) + elif(type(layer) in [Cropping2D, Cropping1D]): + fp.write(gen_cropping_config(layer)) + elif(type(layer) in [Softmax]): + fp.write(gen_softmax_config(layer)) + elif(type(layer) in [Flatten]): + fp.write(gen_flatten_config(layer)) + elif(type(layer) in [Concatenate]): + fp.write(gen_concat_config(layer)) + elif(type(layer) in [Lambda]): + fp.write(gen_lambda_config(layer)) + elif(type(layer) in [UpSampling2D, UpSampling1D]): + fp.write(gen_upsampling_config(layer)) + + diff --git a/APP_Framework/Framework/knowing/nnom/scripts/nnom.py b/APP_Framework/Framework/knowing/nnom/scripts/nnom.py new file mode 100644 index 000000000..45e6b30a7 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/nnom.py @@ -0,0 +1,1198 @@ +''' + Copyright (c) 2018-2020 + Jianjia Ma + majianjia@live.com + + SPDX-License-Identifier: Apache-2.0 + + Change Logs: + Date Author Notes + 2019-02-05 Jianjia Ma The first version +''' + +import sklearn.metrics as skmetrics +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorflow.keras.backend as K +from tensorflow.keras import * +from tensorflow.keras.layers import * +from fully_connected_opt_weight_generation import * +from gen_config import * +import scipy.stats +import time +import warnings + 
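Most of the `gen_*_config()` generators above format their C initialisers through `to_cstyle()`; a quick sketch of its output follows (assumptions: TensorFlow must be installed because importing `gen_config` pulls in `tensorflow.keras`, and an older NumPy release is needed since `to_cstyle()` still uses the removed `np.int` alias):

```python
# Sketch of the C-literal helper from gen_config.py used by every layer config generator.
from gen_config import to_cstyle

print(to_cstyle([28, 28, 1]))    # -> "{28, 28, 1}"  brace initialiser for the generated C arrays
print(to_cstyle((3, 3)))         # -> "{3, 3}"       tuples (e.g. kernel_size/strides) work the same way
# convert_tensor_name() in the same file maps a Keras name such as
# "conv2d/kernel:0" to the C identifier "tensor_conv2d_kernel_0".
```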
+model_major_version = 0 +model_sub_version = 4 +model_reversion = 3 + +#define NNOM_MAJORVERSION 0L /**< major version number */ +#define NNOM_SUBVERSION 4L /**< minor version number */ +#define NNOM_REVISION 3L /**< revise version number */ +#define NNOM_VERSION (NNOM_MAJORVERSION * 10000) + (NNOM_SUBVERSION * 100) + NNOM_REVISION) + +def fuse_bn_to_conv(layer): + # try to fuse BN layer to convolutional + if ('conv' in layer.name) and \ + ('batch_normalization' in layer.outbound_nodes[0].outbound_layer.name): + print("fusing batch normalization to", layer.name) + bn_layer = layer._outbound_nodes[0].outbound_layer + c_w = layer.get_weights()[0] + c_b = layer.get_weights()[1] + print('original weight max', c_w.max(), 'min', c_w.min()) + print('original bias max', c_b.max(), 'min', c_b.min()) + bn_gamma = bn_layer.get_weights()[0] + bn_beta = bn_layer.get_weights()[1] + bn_mean = bn_layer.get_weights()[2] + bn_variance = bn_layer.get_weights()[3] + epsilon = 1e-3 # default epsilon for tf.slim.batch_norm + if ('conv2d' in layer.name): + if "depthwise" in layer.name: # depthwise batchnorm params are ordered differently + for l in range(c_w.shape[3]): + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + c_w[i][j][k][l] *= bn_gamma[k*c_w.shape[3]+l] / np.sqrt(bn_variance[k*c_w.shape[3]+l] + epsilon) + depth_dim = c_w.shape[2] * c_w.shape[3] # test needed + # normal conv + else: + for l in range(c_w.shape[3]): + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + c_w[i][j][k][l] *= bn_gamma[l] / np.sqrt(bn_variance[l] + epsilon) + depth_dim = c_w.shape[3] + for l in range(depth_dim): + c_b[l] = (bn_gamma[l] * (c_b[l] - bn_mean[l]) / np.sqrt(bn_variance[l] + epsilon)) + bn_beta[l] + # conv1d + else: + epsilon = 1e-3 # default epsilon for tf.slim.batch_norm + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + if "depthwise" in layer.name: # depthwise batchnorm params are ordered differently + c_w[i][j][k] *= bn_gamma[j] / np.sqrt(bn_variance[j] + epsilon) + else: + c_w[i][j][k] *= bn_gamma[k] / np.sqrt(bn_variance[k] + epsilon) + + if "depthwise" in layer.name: + depth_dim = c_w.shape[1]*c_w.shape[2] # need to be tested + else: + depth_dim = c_w.shape[2] + for l in range(depth_dim): + c_b[l] = (bn_gamma[l] * (c_b[l] - bn_mean[l]) / np.sqrt(bn_variance[l] + epsilon)) + bn_beta[l] + + print('fused weight max', c_w.max(), 'min', c_w.min()) + print('fused bias max', c_b.max(), 'min', c_b.min()) + # write the weights back to the layer + # after that, the model will be destroyed.. need a better way to pass the new weight + layer.set_weights([c_w, c_b]) + +def generate_test_bin(x, y, name='test_data_with_label.bin'): + ''' + this method generate the + :param x: input x data size + :param y: input label (one hot label) + :return: + ''' + # quantize input x + dec_bits = find_dec_bits_max_min(x, bit_width=8) + x = np.round(x*2**dec_bits).clip(-128, 127).astype(np.int8) + # get label + if(len(y.shape) >1): + test_label = np.argwhere(y == 1).astype(np.int8) # test data + test_label = test_label[:, 1] + else: + test_label = y + + # get data + dat = x.astype(dtype="byte") # test data + batch_size = dat.shape[0] # total pices of data + dat = dat.flatten() # flatten to get the total size. + block_size = int(dat.size / batch_size) # this must be integer but... 
just to confirm + + # write (label x 128) (data_block x 128) + label_batch = 128 # the Y-modem example uses 128 batch + with open(name, 'wb') as f: + start = 0 + while start <= (test_label.size - label_batch): + test_label[start: start + label_batch].tofile(f) + dat[block_size * start: block_size * (start + label_batch)].tofile(f) + start += label_batch + + # the rest data + if (start < test_label.size): + rest_len = test_label.size - start + new_labls = test_label[start:] + new_labls = np.pad(new_labls, (0, label_batch - rest_len), mode='constant') + new_labls.tofile(f) + dat[block_size * start:].tofile(f) + + print("binary test file generated:", name) + print("test data length:", test_label.size) + return + +def is_shift_layer(layer): + ''' layer which can change the output encoding''' + #FIXME: add more which will change the output shift + if('input' in layer.name or + 'conv2d' in layer.name or + 'conv1d' in layer.name or + 'dense' in layer.name or + 'softmax' in layer.name or + 'sigmoid' in layer.name or + 'tanh' in layer.name or + ('add' in layer.name and 'zero' not in layer.name) or # the name, zero_padding contains 'add' + 'subtract' in layer.name or + 'multiply' in layer.name or + ('activation' in layer.name and layer.get_config()['activation'] == 'softmax')or + ('activation' in layer.name and layer.get_config()['activation'] == 'hard_sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'tanh') or + ('activation' in layer.name and layer.get_config()['activation'] == 'hard_tanh') or + is_rnn_layer(layer) + ): + return True + return False + +def is_shift_fixed(layer): + ''' layer which shift to a fixed value''' + #FIXME: add more which will change the output shift + if('softmax' in layer.name or + 'sigmoid' in layer.name or + 'tanh' in layer.name or + ('activation' in layer.name and layer.get_config()['activation'] == 'softmax') or + ('activation' in layer.name and layer.get_config()['activation'] == 'sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'hard_sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'tanh') or + ('activation' in layer.name and layer.get_config()['activation'] == 'hard_tanh') or + is_rnn_layer(layer) + ): + return True + return False + +def is_lstm_layer(layer): + if type(layer) is LSTM or 'lstm' in layer.name: + return True + if(type(layer) is RNN or 'rnn' in layer.name): + if(type(layer.cell) is LSTMCell or 'lstm' in layer.cell.name): + return True + return False + +def is_gru_layer(layer): + if type(layer) is GRU or 'gru' in layer.name: + return True + if(type(layer) is RNN or 'rnn' in layer.name): + if(type(layer.cell) is GRUCell or 'gru' in layer.cell.name): + return True + return False + +def is_rnn_layer(layer): + if( 'rnn' in layer.name or + is_lstm_layer(layer) or + is_gru_layer(layer) + ): + return True + return False + +def find_offset(data): + """ + Offset of the original data before quantisation + :param data: + :return: offset of the data block + """ + return np.average(data) + + +def find_dec_bits_max_min(data, bit_width=8, maximum_bit=32): + """ + A ragular non-saturated shift-based quantisation mathod. Using max/min values + :param data: + :param bit_width: + :param maximum_bit: maximum decimal bit. Incase sometime bias is too small lead to very large size dec bit + :return: + """ + max_val = abs(data.max()) - abs(data.max()/pow(2, bit_width)) # allow very small saturation. 
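As a quick orientation, the dec-bit bookkeeping that `find_dec_bits_max_min()` here (and `quantize_data()` further below) implements reduces to the following toy int8 example; the small saturation allowance used in the real function is omitted:

```python
# Toy q7 example of shift-based quantisation: pick dec_bits from the data range,
# scale by 2**dec_bits, round and clip to int8, then divide back to recover values.
import numpy as np

data = np.array([-2.7, 0.3, 1.9])
int_bits = int(np.ceil(np.log2(max(abs(data.max()), abs(data.min())))))  # -> 2
dec_bits = 7 - int_bits                                                  # -> 5, i.e. Q2.5 for 8-bit data
q = np.clip(np.round(data * 2 ** dec_bits), -128, 127).astype(np.int8)
print(dec_bits, q)            # 5 [-86  10  61]
print(q / 2.0 ** dec_bits)    # [-2.6875  0.3125  1.90625] -- recovered from the int8 values
```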
+ min_val = abs(data.min()) - abs(data.min()/pow(2, bit_width)) + int_bits = int(np.ceil(np.log2(max(max_val, min_val)))) + dec_bits = (bit_width-1) - int_bits + return min(dec_bits, maximum_bit) + +def find_dec_bits_max_min_axis(data, axis=-1,bit_width=8, maximum_bit=32): + """ + A ragular non-saturated shift-based quantisation mathod. Using max/min values + :param data: + :param axis: + :param bit_width: + :return: + """ + dec_bits = [] + # if(len(data.shape) < np.abs(axis)): # for depthwise with axis = -2 while len(shape) =1 + # size = data.shape[0] + # axis = 0 # + # else: + # size = data.shape[axis] + for i in np.arange(0, data.shape[axis]): + d = np.take(data, indices=i, axis=axis) + max_val = abs(d.max()) - abs(d.max() / pow(2, bit_width)) # allow very small saturation. + min_val = abs(d.min()) - abs(d.min() / pow(2, bit_width)) + int_bit = int(np.ceil(np.log2(max(abs(max_val), abs(min_val))))) + dec_bit = (bit_width-1) - int_bit + dec_bits.append(min(dec_bit, maximum_bit)) + return dec_bits + +def find_dec_bits_kld(data, bit_width=8, scan_times=4, maximum_bit=16): + """ + # saturation shift, using KLD method (Kullback-Leibler divergence) + # Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + :param data: The data for looking for quantisation + :param bit_width: the bitwidth of the data + :param scan_times: the times to try the best kld (normally the second is the best.) + :return: dec bit width for this data + """ + # do a regular non-saturated quantisation + max_val = data.max() + min_val = data.min() + abs_max = max(abs(max_val), abs(min_val)) + int_bits = int(np.ceil(np.log2(max(abs(max_val), abs(min_val))))) + dec_bits = (bit_width-1) - int_bits + + # now looking for the best quantisation using KLD method + small_var = 1e-5 + bins = np.arange(-abs_max, abs_max, abs_max / 2048 * 2) + q_bins = np.arange(-abs_max, abs_max, abs_max / 256 * 2) + flat_hist = np.histogram(data.flatten(), bins=bins)[0] + kl_loss = [] + kl_shifts = [] + for shift in range(scan_times): + t = 2 ** (dec_bits + shift) # 2-based threshold + act = np.round(data.flatten() * t) + act = act / t + act = np.clip(act, -128 / t, 127 / t) + act = np.histogram(act, bins=q_bins)[0] + act_hist = np.zeros(2047) + chunk = int(2048 / 256) + for i in range(int(255)): + none_zero = np.count_nonzero(flat_hist[i * chunk:(i + 1) * chunk]) + if none_zero == 0: + continue + for j in range(chunk): + act_hist[i * chunk + j] = act[i] / none_zero if flat_hist[i * chunk + j] != 0 else 0 + flat_hist[flat_hist == 0] = small_var + act_hist[act_hist == 0] = small_var + kl = scipy.stats.entropy(flat_hist, act_hist) + kl_loss.append(kl) + kl_shifts.append(dec_bits + shift) + + # now get the least loss from the scaned kld shift + dec_bits = kl_shifts[np.argmin(kl_loss)] # set the dec_bit to the KLD results + return min(dec_bits, maximum_bit) + +# convert to [-128,128) or int8 +def quantize_data(data, dec_bits, axis=-1, per_axis=False, bitwith=8): + if (per_axis): + out = [] + for i in np.arange(0, data.shape[axis]): + d = np.take(data, indices=i, axis=axis) + d = np.round(d * 2 ** dec_bits[i]) + d = np.clip(d, -2**(bitwith-1), 2**(bitwith-1)-1) + d = np.expand_dims(d, axis=axis) + out.append(d) + out = np.concatenate(out, axis=axis) + return out + else: + return np.clip(np.round(data * 2 ** dec_bits), -2**(bitwith-1), 2**(bitwith-1) -1) + +def quantize_rnn_intermediate_output(layer, features): + def nnom_sigmoid(data): + return 1 / (1 + np.exp(-data)) + def nnom_tanh(data): + return 
np.tanh(data) + def split_array(d, num): + l = len(d) + if(num==4): + return d[:int(l/4)], d[int(l/4): int(l/2)], d[int(l/2):-int(l/4)], d[-int(l/4):] + elif(num==3): + return d[:int(l/3)], d[int(l/3): -int(l/3)], d[-int(l/3):] + lcfg = layer.get_config() + if(lcfg['go_backwards']): + features = features[:,::-1,:] # reverse timestamp + + if(type(layer.cell) is SimpleRNNCell): + cfg = layer.cell.get_config() + state = np.zeros(cfg['units']) + kernel = layer.get_weights()[0] + recurrent_kernel = layer.get_weights()[1] + bias = layer.get_weights()[2] + # replicate keras's implementation + def simple_cell_step(inputs, state, kernel, recurrent_kernel, bias, activation): + h = np.dot(inputs, kernel) + h = np.add(h, bias) + h2 = np.dot(state, recurrent_kernel) + output = h + h2 + output = activation(output) + return output, h, h2 + output_arrary = [] + h_array = [] + h2_array = [] + activation = nnom_tanh if cfg['activation'] is 'tanh' else nnom_sigmoid + state = np.zeros(cfg['units']) + for feature in features: + if(not layer.stateful): + state = np.zeros(cfg['units']) + for fe in feature: + output, h, h2 = simple_cell_step(fe, state, kernel, recurrent_kernel, bias, activation) + state = output + output_arrary.append(output) + h_array.append(h) + h2_array.append(h2) + output_arrary = np.array(output_arrary) + h_array = np.array(h_array) + h2_array = np.array(h2_array) + # qout = find_dec_bits_kld(output_arrary) + # qh = find_dec_bits_kld(h_array) + # qh2 = find_dec_bits_kld(h2_array) + qout = find_dec_bits_max_min(output_arrary) + qh = find_dec_bits_max_min(h_array) + qh2 = find_dec_bits_max_min(h2_array) + return [qout, qh, qh2] + + elif (type(layer.cell) is LSTMCell or 'lstm' in layer.cell.name): + cfg = layer.cell.get_config() + state = np.zeros(cfg['units']*2) + kernel = layer.get_weights()[0] + recurrent_kernel = layer.get_weights()[1] + bias = layer.get_weights()[2] + def lstm_cell_step(cell_inputs, cell_states, kernel, recurrent_kernel, bias): + h_tm1 = cell_states[0] # previous memory state + c_tm1 = cell_states[1] # previous carry state + z1 = np.dot(cell_inputs, kernel) + z1 = np.add(z1, bias) + z2 = np.dot(h_tm1, recurrent_kernel) + z = z1+z2 # -----> q_z + z0, z1, z2, z3 = split_array(z, 4) + i = nnom_sigmoid(z0) # q0.7 + f = nnom_sigmoid(z1) # q0.7 + c1 = f*c_tm1 + c2 = i*nnom_tanh(z2) # q0.7 + c = c1 + c2 # -----> q_c + o = nnom_sigmoid(z3) # q0.7 + tc = nnom_tanh(c) + h = o * tc # q0.7 + return h, [h, c], z ,z0, z1, z2, z3 + h_array = [] + c_array = [] + z_array = [] + z0_array = [] + z1_array = [] + z2_array = [] + z3_array = [] + state = [np.zeros(cfg['units']), np.zeros(cfg['units'])] + for feature in features: + if(not layer.stateful): + state = [np.zeros(cfg['units']), np.zeros(cfg['units']) ] + for fe in feature: + output, state, z, z0, z1, z2, z3 = lstm_cell_step(fe, state, kernel, recurrent_kernel, bias) + h_array.append(output) + c_array.append(state[1]) + z_array.append(z) + z0_array.append(z0) + z1_array.append(z1) + z2_array.append(z2) + z3_array.append(z3) + h_array = np.array(h_array) + c_array = np.array(c_array) + z_array = np.array(z_array) + z0_array = np.array(z0_array) + z1_array = np.array(z1_array) + z2_array = np.array(z2_array) + z3_array = np.array(z3_array) + # q_h = find_dec_bits_kld(h_array) + # q_c = find_dec_bits_kld(c_array) + # q_z = find_dec_bits_kld(z_array) + # q_z0 = find_dec_bits_kld(z0_array) + # q_z1 = find_dec_bits_kld(z1_array) + # q_z2 = find_dec_bits_kld(z2_array) + # q_z3 = find_dec_bits_kld(z3_array) + q_h = 
find_dec_bits_max_min(h_array) + q_c = find_dec_bits_max_min(c_array) + q_z = find_dec_bits_max_min(z_array) + q_z0 = find_dec_bits_max_min(z0_array) # not needed. + q_z1 = find_dec_bits_max_min(z1_array) + q_z2 = find_dec_bits_max_min(z2_array) + q_z3 = find_dec_bits_max_min(z3_array) + return [q_h, q_c, q_z] + + elif (type(layer.cell) is GRUCell or 'gru' in layer.cell.name): + cfg = layer.cell.get_config() + state = np.zeros(cfg['units']) + k = layer.get_weights()[0] + rk = layer.get_weights()[1] + bias = layer.get_weights()[2] + + def gru_cell_step(cell_inputs, cell_states, kernel, recurrent_kernel, input_bias, recurrent_bias): + h_tm1 = cell_states[0] + # inputs projected by all gate matrices at once + matrix_x = np.dot(cell_inputs, kernel) + input_bias + x_z, x_r, x_h = split_array(matrix_x, 3) + # hidden state projected by all gate matrices at once + matrix_inner = np.dot(h_tm1, recurrent_kernel) + recurrent_bias + recurrent_z, recurrent_r, recurrent_h = split_array(matrix_inner, 3) + z = nnom_sigmoid(x_z + recurrent_z) + r = nnom_sigmoid(x_r + recurrent_r) + hh = nnom_tanh(x_h + r * recurrent_h) + # previous and candidate state mixed by update gate + # h = z * h_tm1 + (1 - z) * hh + h1 = z*h_tm1 + h2 = 1-z + h3 = h2 * hh + h = h1 + h3 + return h, [h], matrix_x, matrix_inner + h_array = [] + z_array = [] + i_array=[] + state = [np.zeros(cfg['units'])] + for feature in features: + if (not layer.stateful): + state = [np.zeros(cfg['units'])] + for fe in feature: + output, state, z, i = gru_cell_step(fe, state, k, rk, bias[0], bias[1]) + h_array.append(output) + z_array.append(z) + i_array.append(i) + h_array = np.array(h_array) + i_array = np.array(i_array) + z_array = np.array(z_array) + # q_h = find_dec_bits_kld(h_array) + # q_i = find_dec_bits_kld(i_array) + # q_z = find_dec_bits_kld(z_array) + q_h = find_dec_bits_max_min(h_array) + q_i = find_dec_bits_max_min(i_array) + q_z = find_dec_bits_max_min(z_array) + q_z = min(q_i, q_z) + return [q_h, q_z] + return [] + +def quantize_output(model, x_test, quantize_method='max_min', layer_offset=False, calibrate_size=None): + # limit the test data size + if(calibrate_size is not None): + if (x_test.shape[0] > calibrate_size): + x_test = x_test[:calibrate_size] + # test, show the output ranges + layer_q_list = {} + # FIXME: only support one input + if (type(model.layers[0]) != InputLayer): + L = [model.input] + model.layers + else: + L = model.layers + + for layer in L: # layer loop + if ("input" in layer.name): + features = x_test + else: + # rnn need a further step to determine the intermediate q format + if (is_rnn_layer(layer)): + in_layer = layer.inbound_nodes[0].inbound_layers + layer_model = Model(inputs=model.input, outputs=in_layer.output) + bs = model.input.shape[0] + features = layer_model.predict(x_test, batch_size=bs) + intermediate_dec = quantize_rnn_intermediate_output(layer, features) + print(layer.name, 'dec bit', intermediate_dec) + layer_q_list['intermediate_' + layer.name] = intermediate_dec + + # batch_normalization will need to be handled differently, since we are fusing the weight to its previosu conv. 
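The batch-normalization handling mentioned here relies on the folding done by `fuse_bn_to_conv()` earlier in this file; below is a small NumPy check of that identity, using synthetic per-channel values and the normal-conv ordering only (not the depthwise case):

```python
# Check the BN-into-conv folding:
#   w' = w * gamma / sqrt(var + eps)                    (per output channel)
#   b' = gamma * (b - mean) / sqrt(var + eps) + beta
# so BatchNorm(conv(x)) == conv'(x) with the fused kernel and bias.
import numpy as np

ch = 4
z = np.random.randn(ch)                              # stand-in for the pre-bias conv response per channel
b = np.random.randn(ch)                              # conv bias
gamma, beta = np.random.randn(ch), np.random.randn(ch)
mean, var = np.random.randn(ch), np.abs(np.random.randn(ch)) + 0.1
eps = 1e-3                                           # the epsilon the script assumes

scale = gamma / np.sqrt(var + eps)
b_fused = scale * (b - mean) + beta
assert np.allclose(gamma * ((z + b) - mean) / np.sqrt(var + eps) + beta,   # BN applied after the conv
                   z * scale + b_fused)                                    # fused conv (kernel scaling shows up as z*scale)
```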
+ # sigmoid and tanh are different, their shift is fixed to 7 + if (is_shift_layer(layer) or + ('batch_normalization' in layer.name)): + layer_model = Model(inputs=model.input, outputs=layer.output) + bs = model.input.shape[0] + features = layer_model.predict(x_test, batch_size=bs) + else: + # leave the features not changed, so this layer shift will be the same as its inputs + pass + + # we currently only support one offset for a layer output. + if(layer_offset): + offset = find_offset(features) + features = features - offset + else: + offset = 0 + # saturated shift using KLD method OR non saturated shift using max-min + if ("kld" in quantize_method + and not is_shift_fixed(layer) + and "input" not in layer.name + and "dense" not in layer.name): # test, also do not use kld in input layer + dec_bits = find_dec_bits_kld(features, bit_width=8, scan_times=4) + print(layer.name,"Quantized method:", "KLD", "Values max:", np.max(features), "min:", np.min(features), "dec bit", dec_bits) + else: + dec_bits = find_dec_bits_max_min(features, bit_width=8) + print(layer.name,"Quantized method:","max-min"," Values max:", np.max(features), "min:", np.min(features), "dec bit", dec_bits) + # quantise offset + offset = int(np.round(offset * 2 ** dec_bits)) + # record the shift + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + layer_q_list[layer.name.split(':')[0]] = [dec_bits, offset] + else: + layer_q_list[layer.name] = [dec_bits, offset] + if ('batch_normalization' in layer.name): + layer_q_list[layer.inbound_nodes[0].inbound_layers.name] = [dec_bits, offset] # use the bn layer shift to update the last layer. + + # scan the layers backward, try to unify the dec bit in multiple input layers, (add, mult... concat...etc.) + LM = {} + for layer in model.layers: + LM[layer.name] = layer + L = [l for l in model.layers[1:]] + L.reverse() + def update_previous_layer_shift(layer, dec_bit): + if(type(layer.input) == list): + for inp in layer.input: + iname = inp.name.split('/')[0] + if('input' in iname): + continue + layer_q_list[iname][0] = dec_min + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], dec_bit) + else: + iname = layer.input.name.split('/')[0] + if('input' in iname): + return + layer_q_list[iname][0] = dec_min + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], dec_bit) + for layer in L: + if(type(layer.input) == list): + iname = layer.input[0].name.split('/')[0].split(':')[0] + dec_min = layer_q_list[iname][0] + # find min dec bit in these input + for inp in layer.input: + iname = inp.name.split('/')[0].split(':')[0] + if(layer_q_list[iname][0] < dec_min): + dec_min = layer_q_list[iname][0] + if(layer_q_list[iname][0] != dec_min): + bFlag = True + for inp in layer.input: + iname = inp.name.split('/')[0].split(':')[0] + layer_q_list[iname][0] = dec_min + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], dec_min) + print('set dec bit', dec_min, 'for the input of', layer.name, ':', [inp.name.split('/')[0] for inp in layer.input]) + if(not is_shift_layer(layer) or dec_min < layer_q_list[layer.name][0]): # update current layer's shift only when we cannot change the shift + layer_q_list[layer.name][0] = dec_min + # quantise offset + print("quantisation list", layer_q_list) + return layer_q_list + + +def layer_name_from_tensor(t): + return t.name.replace(':','/').split('/')[0] + + +def quantize_weights(model, name='weights.h', format='hwc', per_channel_quant=True, layer_q_list=None): + # Quantize 
weights to 8-bits using (min,max) and write to file + f = open(name, 'w') + f.write('#include "nnom.h"\n\n') + f.write('/* Weights, bias and Q format */\n') + f.close() + for curr_idx, layer in enumerate(model.layers): + if (not layer.weights): + continue + # before merging bn layer, check if the bn is "legally" after Conv + if('batch_normalization' in layer.name) and \ + ('conv' not in layer.inbound_nodes[0].inbound_layers.name): + raise Exception('Only support batch_normalization placed after conv', layer.name, + layer.inbound_nodes[0].inbound_layers.name) + # try to fuse BN layer to convolutional + if ('conv' in layer.name) and \ + ('batch_normalization' in layer.outbound_nodes[0].outbound_layer.name): + fuse_bn_to_conv(layer) + # generate weights and bias now + weight_dec_shift = 0 + print('quantizing weights for layer', layer.name) + layer_weights = layer.get_weights() + for idx, var in enumerate(layer_weights): + var_name = convert_tensor_name(layer.weights[idx]) + var_values = var + if("kernel" not in var_name and 'bias' not in var_name): # ignore batchnormalisation's parameters + continue + + if (per_channel_quant and type(layer) in [Conv2D, Conv1D, DepthwiseConv2D, Conv2DTranspose]): + if(type(layer) in [DepthwiseConv2D] and "kernel" in var_name): #depthwise kernel quantised by + shape = var_values.shape[:2] + (-1,) # need to combine the mult and channel first + var = var_values.reshape(shape) + dec_bits = find_dec_bits_max_min_axis(var, axis=-1, bit_width=8) + elif(type(layer) in [Conv2DTranspose]): + dec_bits = find_dec_bits_max_min_axis(var_values, axis=-2, bit_width=8) + else: + dec_bits = find_dec_bits_max_min_axis(var_values, bit_width=8) + else: + dec_bits = find_dec_bits_max_min(var_values, bit_width=8) + print(' ', var_name, "dec bit", dec_bits) + + # kernel dec, bias dec, bias shift, output shift + if(is_shift_layer(layer) and not is_rnn_layer(layer)): + inp = layer.input.name.replace(':','/').split('/')[0] + layer_input_dec = layer_q_list[inp][0] + layer_output_dec = layer_q_list[layer.name][0] + if ("kernel" in var_name): + weight_dec_shift = dec_bits + else: + # channel wise + if hasattr(dec_bits, '__len__'): + bias_shift = np.full(len(dec_bits), layer_input_dec)+weight_dec_shift-dec_bits + layer_output_shift = np.full(len(weight_dec_shift), layer_input_dec) + weight_dec_shift \ + - np.full(len(weight_dec_shift), layer_output_dec) + if (np.min(bias_shift) < 0): + for i, w_dec in enumerate(weight_dec_shift): + if (bias_shift[i] < 0): + dec_bits[i] = w_dec + bias_shift[i] = 0 + # layer wise + else: + bias_shift = layer_input_dec + weight_dec_shift - dec_bits + layer_output_shift = layer_input_dec + weight_dec_shift - layer_output_dec + if (bias_shift < 0): + dec_bits = weight_dec_shift + bias_shift = 0 + # RNN layer's kernel dec, bias dec, bias shift, output shift + if(is_rnn_layer(layer)): + inp = layer.input.name.replace(':','/').split('/')[0] + layer_input_dec = layer_q_list[inp][0] + layer_output_dec = layer_q_list[layer.name][0] + #if (type(layer.cell) is SimpleRNNCell): + if ("kernel" in var_name and 'recurrent' not in var_name): + weight_dec_shift = dec_bits + elif ('bias' in var_name): + bias_shift = layer_input_dec + weight_dec_shift - dec_bits + layer_output_shift = layer_input_dec + weight_dec_shift - layer_output_dec # this is not valid + if (bias_shift < 0): + dec_bits = weight_dec_shift + bias_shift = 0 + + # now quantise them + if(type(layer) in [Conv2D, Conv1D, DepthwiseConv2D, Conv2DTranspose]): + if(type(layer) in [DepthwiseConv2D] and "kernel" in 
var_name): + old_shape = var_values.shape + var_values = quantize_data(var_values.reshape(var_values.shape[:2] + (-1,)), + dec_bits, axis=-1, per_axis=per_channel_quant) # convert to [h, w, out x mult] + var_values = var_values.reshape(old_shape) # convert the shape back to [h, w, out, mult] + elif(type(layer) in [Conv2DTranspose] and "kernel" in var_name): + var_values = quantize_data(var_values, dec_bits, axis=-2, per_axis=per_channel_quant) # [h, w, out, in] + else: + var_values = quantize_data(var_values, dec_bits, per_axis=per_channel_quant) # [h, w, in, out] + else: + var_values = quantize_data(var_values, dec_bits, per_axis=False) + + # CHW format + if ('chw' in format): + if (is_lstm_layer(layer) or is_gru_layer(layer)): # currently we use 16 bit intermediate, use reorder optimation + transposed_wts = np.transpose(var_values) + if('kernel' in var_name): + transposed_wts = convert_q7_q15_weights(np.reshape(transposed_wts ,(transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + # dense and rnn still working under HWC format + elif ("dense" in var_name or is_rnn_layer(layer)) and "kernel" in var_name: + transposed_wts = np.transpose(var_values) + transposed_wts = convert_to_x4_q7_weights(np.reshape(transposed_wts, (transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + # all other kernels, bias stay the same + else: + transposed_wts = var_values + # HWC format (NNOM/CMSIS-NN use [out_ch, h, w, in_ch], in C order) + else: + if (len(var_values.shape) == 3): # 1D convolution layer weights + transposed_wts = np.transpose(var_values, (2, 0, 1)) + elif (len(var_values.shape) == 4): # 2D convolution layer weights + if(type(layer) == Conv2DTranspose): # test + transposed_wts = np.transpose(var_values, (2, 0, 1, 3)) + elif type(layer) == DepthwiseConv2D: + transposed_wts = var_values#np.transpose(var_values, (0, 1, 3, 2)) # [h, w, out, mult] test for multiplier + else: + transposed_wts = np.transpose(var_values, (3, 0, 1, 2)) + elif(is_lstm_layer(layer) or is_gru_layer(layer)): # currently we use 16 bit intermediate, use reorder optimation + if('kernel' in var_name): + transposed_wts = np.transpose(var_values) + transposed_wts = convert_q7_q15_weights(np.reshape(transposed_wts ,(transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + else: # bias will not need to be transposed (for GRU which has 2d bias) + transposed_wts = var_values + else: # fully connected layer weights or biases of any layer + # test, use opt weight reorder + transposed_wts = np.transpose(var_values) + if ("dense" in var_name or is_rnn_layer(layer)) and "kernel" in var_name: # and other RNN layers + transposed_wts = convert_to_x4_q7_weights(np.reshape(transposed_wts ,(transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + + with open(name, 'a') as f: + def write_weights(f, name, value): + f.write('#define ' + name + ' {') + value.tofile(f, sep=", ", format="%d") + f.write('}\n\n') + # weights or bias + write_weights(f, var_name.upper(), transposed_wts) + # dec bits + write_weights(f, var_name.upper()+'_DEC_BITS' , np.array(dec_bits)) + # for test + if( "bias" in var_name): + f.write('#define ' + layer.name.upper() + '_BIAS_LSHIFT '+to_cstyle(bias_shift) +'\n\n') + #f.write('#define ' + layer.name.upper() + '_OUTPUT_DEC '+ to_cstyle(layer_output_dec)+'\n\n') # not here + f.write('#define ' + layer.name.upper() + '_OUTPUT_RSHIFT ' + to_cstyle(layer_output_shift)+'\n\n') + + +def generate_model(model, x_test, per_channel_quant=False, name='weights.h', format='hwc', quantize_method='max_min'): + """ + :param 
model: + :param x_test: + :param name: + :param format: + :param quantize_method: "max_min" or "kld" + :return: + """ + # get the quantize output range/format + layer_q_list = quantize_output(model, x_test, layer_offset=False, quantize_method=quantize_method) + # quantize weights and output shift + quantize_weights(model, per_channel_quant=per_channel_quant, name=name, format=format, layer_q_list=layer_q_list) + # now generate the model + if (type(model.layers[0]) != InputLayer): + L = [model.input] + model.layers + else: + L = model.layers + with open(name, 'a') as fp: + # generate the list of output + fp.write('\n/* output q format for each layer */\n') + for layer in L: + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + iname = layer.name.split(':')[0] + else: + iname = layer.name + fp.write('#define %s_OUTPUT_DEC %s\n' % (iname.upper(), layer_q_list[iname][0])) + fp.write('#define %s_OUTPUT_OFFSET %s\n' % (iname.upper(), layer_q_list[iname][1])) + fp.write('\n/* bias shift and output shift for none-weighted layer */\n') + + # generate output shift for the layers without weights (weighted layers were generated in quantize_weights) + for layer in model.layers: + if (is_shift_layer(layer)): + iname = layer.name.upper() + # add, sub + if ('add' in layer.name or 'subtract' in layer.name): + # only consider the first, they have been set to same in out_put_range() + inp = layer.input[0].name.replace(':', '/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_DEC-{0}_OUTPUT_DEC)\n'.format( + iname, inp)) + fp.write( + '#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format( + iname)) + # mult is different, Q3.4 * Q3.4 = Q6.8. if mult out is Q4.3, then shift (Q.4+q.4)-Q.3=5. Am I right? + elif ('multiply' in layer.name): + inp = layer.input[0].name.replace(':', '/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_DEC*2-{0}_OUTPUT_DEC)\n'.format( + iname, inp)) + fp.write( + '#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format( + iname)) + + fp.write('\n/* tensors and configurations for each layer */\n') + LI = {} + ID = 0 + + def is_skipable_layer(layer): + # FIXME: add more that could be skiped + if ('lambda' in layer.name or + 'dropout' in layer.name or + 'gaussian_noise' in layer.name or + 'batch_normalization' in layer.name + #or ('flatten' in layer.name and 'chw' not in format) + ): # flatten layer can be skipped in HWC but needed in CHW + return True + return False + + output_num = 0 + for id, layer in enumerate(L): + if (is_skipable_layer(layer)): + inp = layer.input.name.replace(':', '/').split('/')[0] + LI[layer.name] = (LI[inp][0], layer) + else: + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + LI[layer.name.split(':')[0]] = (ID, layer) + else: + LI[layer.name] = (ID, layer) + ID += 1 + + def gen_weight_tensor(w, per_axis): + var_cname = convert_tensor_name(w) + '_data' + dec_bits_name = convert_tensor_name(w).upper() + '_DEC_BITS' + fp.write(gen_values(var_cname, convert_tensor_name(w).upper())) + fp.write(gen_tensor(w, dec_bits=dec_bits_name, tensor_value=var_cname, per_axis=per_axis)) + + # output the config of all layer + if (type(layer) in [InputLayer] or 'input' in layer.name): + if(type(layer) == tf.Tensor): + raise Exception('Not yet support tensor as input/or Sequential model. 
' + 'please use Input layer as your first layer in the model', layer.name, layer) + size = 1 + for s in layer.input.shape[1:]: + size *= s if s is not None else 1 + fp.write(gen_values('nnom_input_data', '{0}', size=str(size), dtype='static int8_t')) + fp.write(gen_tensor(layer.input, layer_q_list[layer.name][0], tensor_value='nnom_input_data', is_io_tensor=True)) + fp.write(gen_io_config(layer, tensor_name=convert_tensor_name(layer.input))) + elif (type(layer) in [Conv2D, Conv1D, DepthwiseConv2D]): + for w in layer.weights: + gen_weight_tensor(w, per_axis=per_channel_quant) + fp.write(gen_conv2d_config(layer, layer.name.upper() +'_OUTPUT_RSHIFT', layer.name.upper() +'_BIAS_LSHIFT')) + elif (type(layer) in [Conv2DTranspose]): + for w in layer.weights: + gen_weight_tensor(w, per_axis=per_channel_quant) + fp.write(gen_conv2d_trans_config(layer, layer.name.upper() +'_OUTPUT_RSHIFT', layer.name.upper() +'_BIAS_LSHIFT')) + elif (type(layer) in [Dense]): + for w in layer.weights: + gen_weight_tensor(w, per_axis=False) + fp.write(gen_dense_config(layer, layer.name.upper() +'_OUTPUT_RSHIFT', layer.name.upper() +'_BIAS_LSHIFT')) + elif (type(layer) in [MaxPooling2D, AveragePooling2D, MaxPooling1D, AveragePooling1D]): + fp.write(gen_pooling_config(layer)) + elif (type(layer) in [GlobalMaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling1D, GlobalAveragePooling1D]): + fp.write(gen_gl_pooling_config(layer)) + elif (type(layer) in [Multiply, Add, Subtract]): + fp.write(gen_matrix_config(layer, output_shift_name=layer.name.upper()+'_OUTPUT_RSHIFT')) + elif (type(layer) in [ZeroPadding2D, ZeroPadding1D]): + fp.write(gen_zero_padding_config(layer)) + elif (type(layer) in [Cropping2D, Cropping1D]): + fp.write(gen_cropping_config(layer)) + elif (type(layer) in [Softmax]): + fp.write(gen_softmax_config(layer)) + elif (type(layer) in [Flatten]): + fp.write(gen_flatten_config(layer)) + elif (type(layer) in [Reshape]): + fp.write(gen_reshape_config(layer)) + elif (type(layer) in [Concatenate]): + fp.write(gen_concat_config(layer)) + elif (type(layer) in [Lambda]): + fp.write(gen_lambda_config(layer)) + elif (type(layer) in [UpSampling2D, UpSampling1D]): + fp.write(gen_upsampling_config(layer)) + elif(is_rnn_layer(layer)): + if(type(layer.cell) is SimpleRNNCell): + for w in layer.weights: + gen_weight_tensor(w, per_axis=False) + fp.write(gen_simple_cell_config(layer, layer_q_list['intermediate_'+layer.name])) + elif(type(layer.cell) is GRUCell or 'gru' in layer.cell.name): + for w in layer.weights: + gen_weight_tensor(w, per_axis=False) + fp.write(gen_gru_cell_config(layer, layer_q_list['intermediate_'+layer.name])) + elif(type(layer.cell) is LSTMCell or 'lstm' in layer.cell.name): + for w in layer.weights: + gen_weight_tensor(w, per_axis=False) + fp.write(gen_lstm_cell_config(layer, layer_q_list['intermediate_'+layer.name])) + fp.write(gen_rnn_config(layer)) + + # test, multiple output layer + if(len(layer.outbound_nodes) == 0): + size=1 + for s in layer.output.shape[1:]: + size *= s if s is not None else 1 + if(output_num == 0): # the first output or the only output + fp.write(gen_values('nnom_output_data', '{0}', size=str(size), dtype='static int8_t')) + fp.write(gen_output_config(layer, dec_bits=layer.name.upper() + '_OUTPUT_DEC', output_num=output_num, value_name='nnom_output_data')) + output_num += 1 + else: + output_value_names = 'nnom_output_data'+str(output_num) + fp.write(gen_values(output_value_names, '{0}', size=str(size), dtype='static int8_t')) + fp.write(gen_output_config(layer, 
dec_bits=layer.name.upper() + '_OUTPUT_DEC', output_num=output_num, value_name=output_value_names)) + output_num += 1 + + # # last layer, attach the additional nnom output layer + # if(id == len(L)-1): + # size=1 + # for s in layer.output.shape[1:]: + # size *= s if s is not None else 1 + # fp.write(gen_values('nnom_output_data', '{0}', size=str(size), dtype='static int8_t')) + # fp.write(gen_output_config(layer, dec_bits=layer.name.upper()+'_OUTPUT_DEC', value_name='nnom_output_data')) + + # write version + fp.write('/* model version */\n') + fp.write('#define NNOM_MODEL_VERSION (10000*{0} + 100*{1} + {2})\n'.format(model_major_version, model_sub_version, model_reversion )) + + # model + fp.write('\n/* nnom model */\n') + fp.write('static nnom_model_t* nnom_model_create(void)\n{\n') + fp.write('\tstatic nnom_model_t model;\n') + if (ID > 32): + fp.write('\tnnom_layer_t **layer = (nnom_layer_t**)malloc(sizeof(nnom_layer_t *)*%d);\n' % (ID + 1)) + fp.write('\tif(NULL == layer) return NULL;\n') + else: + fp.write('\tnnom_layer_t* layer[%d];\n' % (ID + 1)) + fp.write('\n\tcheck_model_version(NNOM_MODEL_VERSION);') + fp.write('\n\tnew_model(&model);\n\n') + + # inverted order of output, very strange + output_num = (len(model.output) -1) if type(model.output) is list else 0 + for layer in L: + if (is_skipable_layer(layer)): + continue + # FIXME: need a better solution to seperate the input 'tensor' from other layers + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + id, _ = LI[layer.name.split(':')[0]] + else: + id, _ = LI[layer.name] + + if ('input' in layer.name): + fp.write('\tlayer[%d] = input_s(&%s_config);\n' % (id, layer.name)) + + # convlutional + elif ('conv1d' in layer.name + or 'conv2d' in layer.name): + inp = layer_name_from_tensor(layer.input) + if('transpose' in layer.name): + fp.write('\tlayer[{0}] = model.hook(conv2d_trans_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif('depthwise' in layer.name): + fp.write('\tlayer[{0}] = model.hook(dw_conv2d_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(conv2d_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('activation' in layer.name): + inp = layer_name_from_tensor(layer.input) + cfg = layer.get_config() + if (cfg['activation'] == 'relu'): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n' % (id, LI[inp][0])) + elif (cfg['activation'] == 'tanh'): + fp.write('\tlayer[%s] = model.active(act_hard_tanh(%s_OUTPUT_DEC), layer[%s]);\n' % ( + id, inp.upper(), LI[inp][0])) + elif (cfg['activation'] == 'sigmoid'): + fp.write('\tlayer[%s] = model.active(act_sigmoid(%s_OUTPUT_DEC), layer[%s]);\n' % ( + id, inp.upper(), LI[inp][0])) + elif (cfg['activation'] == 'hard_sigmoid'): + fp.write('\tlayer[%s] = model.active(act_hard_sigmoid(%s_OUTPUT_DEC), layer[%s]);\n' % ( + id, inp.upper(), LI[inp][0])) + elif (cfg['activation'] == 'softmax'): + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n' % (id, LI[inp][0])) + elif ('leaky_re_lu' in layer.name): + inp = layer_name_from_tensor(layer.input) + cfg = layer.get_config() + fp.write('\tlayer[%s] = model.active(act_leaky_relu(%ff), layer[%s]);\n' % (id, cfg["alpha"],LI[inp][0])) + elif ('re_lu' in layer.name): + inp = layer_name_from_tensor(layer.input) + cfg = layer.get_config() + if(cfg['max_value'] is None and cfg['negative_slope'] == 0 and cfg['threshold'] == 0): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n' % 
(id, LI[inp][0])) + else: + if(cfg['max_value'] is None): + max_v = 'INFINITY ' + else: + max_v = str(cfg['max_value']) + fp.write('\tlayer[%s] = model.active(act_adv_relu(%f,%s,%f), layer[%s]);\n' + % (id, cfg['negative_slope'], max_v, cfg['threshold'], LI[inp][0])) + # pooling + elif ('max_pooling' in layer.name): + inp = layer_name_from_tensor(layer.input) + if ('global' in layer.name): + fp.write('\tlayer[{0}] = model.hook(global_maxpool_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(maxpool_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('average_pooling' in layer.name): + inp = layer_name_from_tensor(layer.input) + if ('global' in layer.name): + fp.write('\tlayer[{0}] = model.hook(global_avgpool_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(avgpool_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('up_sampling' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(upsample_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + # zero padding + elif ('zero_padding' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(zeropadding_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + # Cropping + elif ('cropping' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(cropping_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + + # others + elif ('flatten' in layer.name): # flatten is needed in CHW backend but not needed in HWC + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(flatten_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('reshape' in layer.name): # flatten is needed in CHW backend but not needed in HWC + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(reshape_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('concatenate' in layer.name): + inps = [layer_name_from_tensor(input) for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]' % (LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(concat_s(&%s_config), %s%s);\n' % ( + id, layer.name, len(inps), inX)) + elif ('add' in layer.name): + inps = [layer_name_from_tensor(input) for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]' % (LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(add_s(&%s_config), %s%s);\n' % ( + id, layer.name, len(inps), inX)) + elif ('subtract' in layer.name): + inps = [layer_name_from_tensor(input) for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]' % (LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(sub_s(&%s_config), %s%s);\n' % ( + id, layer.name, len(inps), inX)) + elif ('multiply' in layer.name): + inps = [layer_name_from_tensor(input) for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]' % (LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(mult_s(&%s_config), %s%s);\n' % ( + id, layer.name, len(inps), inX)) + elif ('dense' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = model.hook(dense_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + elif ('softmax' in layer.name): + inp = layer_name_from_tensor(layer.input) + fp.write('\tlayer[{0}] = 
model.hook(softmax_s(&{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0])) + + elif (is_rnn_layer(layer)): + inp = layer_name_from_tensor(layer.input) + line = '\tlayer[{0}] = model.hook(rnn_s(, &{1}_config), layer[{2}]);\n'.format(id, layer.name, LI[inp][0]) + if (type(layer.cell) is SimpleRNNCell): + line = line.replace('', 'simple_cell_s(&%s_simple_cell_config)' %(layer.name)) + elif (type(layer.cell) is GRUCell or 'gru' in layer.cell.name): + line = line.replace('', 'gru_cell_s(&%s_gru_cell_config)' % (layer.name)) + elif (type(layer.cell) is LSTMCell or 'lstm' in layer.cell.name): + line = line.replace('', 'lstm_cell_s(&%s_lstm_cell_config)' % (layer.name)) + fp.write(line) + else: + raise Exception('unsupported layer', layer.name, layer) + + # test, multiple output layer (not yet working with multiple outputs) + if(len(layer.outbound_nodes) == 0): + fp.write('\tlayer[{0}] = model.hook(output_s(&{1}_config), layer[{2}]);\n'.format(id + 1, 'output'+str(output_num), LI[inp][0] + 1)) + output_num -=1 # the num is inverted in keras, not a good solution yet. + + """ + # temporary fixed for activations attached into layers in construction + def is_activation_attached(layer): + if(("Softmax" in layer.output.name and "softmax" not in layer.name)or + ("Relu" in layer.output.name and "re_lu" not in layer.name) or + ("Sigmoid" in layer.output.name and "sigmoid" not in layer.name) or + ("Tanh" in layer.output.name and "tanh" not in layer.name)): + return True + return False + if "input" not in layer.name and is_activation_attached(layer): + inp = layer.output.name.replace(':', '/').split('/')[0] + cfg = layer.get_config() + if(cfg['activation'] == 'relu'): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n'%(id, LI[inp][0])) + if(cfg['activation'] == 'tanh'): + fp.write('\tlayer[%s] = model.active(act_tanh(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + if(cfg['activation'] == 'sigmoid'): + fp.write('\tlayer[%s] = model.active(act_sigmoid(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + elif(cfg['activation'] == 'softmax'): + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n'%(id, LI[inp][0])) + """ + # generate final output layer + #fp.write('\tlayer[{0}] = model.hook(output_s(&{1}_config), layer[{2}]);\n'.format(id+1, 'output', LI[inp][0]+1)) + fp.write('\tmodel_compile(&model, layer[0], layer[%s]);\n' % (id + 1)) + if (ID > 32): + fp.write('\tfree(layer);\n') + fp.write('\treturn &model;\n}\n') + with open('.layer_q_list', 'w') as fp: + fp.write(str(layer_q_list)) + +def evaluate_model(model, x_test, y_test, running_time=False, to_file='evaluation.txt'): + # Score trained model. 
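+    # Editor's note (assumption, not in the original script): Keras model.evaluate()
+    # returns [loss, metric_1, ...] in the order given at compile time, so scores[1]
+    # below is the top-1 accuracy only if the model was compiled with an accuracy metric.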
+ scores = model.evaluate(x_test, y_test, verbose=2) + print('Test loss:', scores[0]) + print('Top 1:', scores[1]) + + if(len(y_test.shape)>1): + bs = model.input.shape[0] + predictions = model.predict(x_test, batch_size=bs) + matrix = skmetrics.confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1)) + print(matrix) + + run_time = 0 + if running_time: + # try to calculate the time + T = time.time() + bs = model.input.shape[0] + for i in range(10): + model.predict(x_test, batch_size=bs) + T = time.time() - T + run_time = round((T / 10 / x_test.shape[0] * 1000 * 1000), 2) + print("Runing time:",run_time , "us" ) + # + with open(to_file, 'w') as f: + f.write("Runing time: "+ str(run_time) + "us" + "\n") + f.write('Test loss:'+ str(scores[0]) + "\n") + f.write('Top 1:'+ str(scores[1])+ "\n") + if (len(y_test.shape) > 1): + for row in matrix: + row.tofile(f, sep=',') + f.write("\n") + return scores + +def f2q(d, Q): + '''To convert a number from floating point to Qm.n format: + 1. Multiply the floating point number by 2n + 2. Round to the nearest integer + ''' + return np.round(d*2**Q) + + +def q2f(d, Q): + '''To convert a number from Qm.n format to floating point: + 1. Convert the number to floating point as if it were an integer, in other words remove the binary point + 2. Multiply by 2-n + ''' + return d*2**-Q + +def show_weights(w, name): + sz = 1 + for s in w.shape: + sz = sz*s + aL = w.reshape(sz,) + MIN,MAX=min(aL),max(aL) + Q = int(np.ceil(np.log2(max(abs(MIN),abs(MAX))))) + Q = 7-Q + qL = f2q(aL,Q) + qL = q2f(qL,Q) + plt.figure(figsize=(18, 3)) + plt.subplot(131) + plt.title(name) + plt.plot(aL) + plt.grid() + aL.sort() + plt.plot(aL,'r') + plt.grid() + plt.subplot(132) + plt.title('Q%s'%(Q)) + qL.sort() + plt.plot(aL,'r') + plt.plot(qL,'g') + plt.grid() + plt.subplot(133) + plt.hist(aL,100) + plt.title('hist') + plt.grid() + plt.show() + +def compare(a,b,name): + sz = 1 + for s in a.shape: + sz = sz*s + aL = a.reshape(sz,) + bL = b.reshape(sz,) + assert(len(aL) == len(bL)) + Z = list(zip(aL,bL)) + Z.sort(key=lambda x: x[0]) + aL1,bL1=zip(*Z) + plt.figure(figsize=(18, 3)) + plt.subplot(131) + plt.plot(aL) + plt.plot(aL1,'r') + plt.grid() + plt.title('tf-%s'%(name)) + plt.subplot(133) + plt.plot(bL1,'g') + plt.plot(aL1,'r') + plt.grid() + plt.title('compare') + plt.subplot(132) + bL1=list(bL1) + bL1.sort() + plt.plot(bL) + plt.plot(bL1,'g') + plt.grid() + plt.title('nn-%s'%(name)) + plt.show() + diff --git a/APP_Framework/Framework/knowing/nnom/scripts/nnom_utils.py b/APP_Framework/Framework/knowing/nnom/scripts/nnom_utils.py new file mode 100644 index 000000000..32868ac81 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/scripts/nnom_utils.py @@ -0,0 +1,845 @@ +''' + Copyright (c) 2018-2020 + Jianjia Ma + majianjia@live.com + + SPDX-License-Identifier: Apache-2.0 + + Change Logs: + Date Author Notes + 2019-02-05 Jianjia Ma The first version + + + This file provides: + -> fake_quantisation layers which simulate the output quantisation on fixed-point NN models. + -> weights/bias quantisation of Convolution and Dense Layer. "weight.h" file generations + -> export "testing set" binary data file. + -> print output ranges of each layers. + + Currently, this script does not support RNN (type) layers. 
+''' + +import matplotlib.pyplot as plt +import tensorflow as tf +from tensorflow.keras.layers import InputLayer +from tensorflow.keras.models import Model + +from sklearn import metrics +from .fully_connected_opt_weight_generation import * +import time +import warnings + +""" +this is the generate the test set data to a bin file +bin file can be used to validate the implementation in MCU + +""" +def generate_test_bin(x, y, name='test_data_with_label.bin'): + ''' + this method generate the + :param x: input x data size + :param y: input label (one hot label) + :return: + ''' + # quantize input x + min_value = np.min(x) + max_value = np.max(x) + + int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value))))) + dec_bits = 7 - int_bits + x = np.round(x*2**dec_bits).astype(np.int8) + # get label + if(len(y.shape) >1): + test_label = np.argwhere(y == 1).astype(np.int8) # test data + test_label = test_label[:, 1] + else: + test_label = y + + # get data + dat = x.astype(dtype="byte") # test data + batch_size = dat.shape[0] # total pices of data + dat = dat.flatten() # flatten to get the total size. + block_size = int(dat.size / batch_size) # this must be integer but... just to confirm + + # write (label x 128) (data_block x 128) + label_batch = 128 # the Y-modem example uses 128 batch + with open(name, 'wb') as f: + start = 0 + while start <= (test_label.size - label_batch): + test_label[start: start + label_batch].tofile(f) + dat[block_size * start: block_size * (start + label_batch)].tofile(f) + start += label_batch + + # the rest data + if (start < test_label.size): + rest_len = test_label.size - start + new_labls = test_label[start:] + new_labls = np.pad(new_labls, (0, label_batch - rest_len), mode='constant') + new_labls.tofile(f) + dat[block_size * start:].tofile(f) + + print("binary test file generated:", name) + print("test data length:", test_label.size) + return + +def is_shift_layer(layer): + ''' layer which can change the output encoding''' + #FIXME: add more which will change the output shift + if('input' in layer.name or + 'conv2d' in layer.name or + 'conv1d' in layer.name or + 'dense' in layer.name or + 'softmax' in layer.name or + 'sigmoid' in layer.name or + 'tanh' in layer.name or + ('add' in layer.name and 'zero' not in layer.name) or # the name, zero_padding contains 'add' + 'subtract' in layer.name or + 'multiply' in layer.name or + ('activation' in layer.name and layer.get_config()['activation'] == 'softmax')or + ('activation' in layer.name and layer.get_config()['activation'] == 'sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'tanh') + ): + return True + return False + +def is_shift_fixed(layer): + ''' layer which shift to a fixed value''' + #FIXME: add more which will change the output shift + if('softmax' in layer.name or + 'sigmoid' in layer.name or + 'tanh' in layer.name or + ('activation' in layer.name and layer.get_config()['activation'] == 'softmax') or + ('activation' in layer.name and layer.get_config()['activation'] == 'sigmoid') or + ('activation' in layer.name and layer.get_config()['activation'] == 'tanh') + ): + return True + return False + +def fuse_bn_to_conv(layer): + # try to fuse BN layer to convolutional + if ('conv' in layer.name) and \ + ('batch_normalization' in layer._outbound_nodes[0].outbound_layer.name): + + print("fusing batch normalization to", layer.name) + bn_layer = layer._outbound_nodes[0].outbound_layer + c_w = layer.get_weights()[0] + c_b = layer.get_weights()[1] + print('original weight max', 
c_w.max(), 'min', c_w.min()) + print('original bias max', c_b.max(), 'min', c_b.min()) + bn_gamma = bn_layer.get_weights()[0] + bn_beta = bn_layer.get_weights()[1] + bn_mean = bn_layer.get_weights()[2] + bn_variance = bn_layer.get_weights()[3] + + if ('conv2d' in layer.name): + epsilon = 1e-3 # default epsilon for tf.slim.batch_norm + for l in range(c_w.shape[3]): + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + if "depthwise" in layer.name: # depthwise batchnorm params are ordered differently + c_w[i][j][k][l] *= bn_gamma[k] / np.sqrt(bn_variance[k] + epsilon) + else: + c_w[i][j][k][l] *= bn_gamma[l] / np.sqrt(bn_variance[l] + epsilon) + + if "depthwise" in layer.name: + depth_dim = c_w.shape[2] + else: + depth_dim = c_w.shape[3] + for l in range(depth_dim): + c_b[l] = (bn_gamma[l] * (c_b[l] - bn_mean[l]) / np.sqrt(bn_variance[l] + epsilon)) + bn_beta[l] + # conv1d + else: + epsilon = 1e-3 # default epsilon for tf.slim.batch_norm + for k in range(c_w.shape[2]): + for j in range(c_w.shape[1]): + for i in range(c_w.shape[0]): + if "depthwise" in layer.name: # depthwise batchnorm params are ordered differently + c_w[i][j][k] *= bn_gamma[j] / np.sqrt(bn_variance[j] + epsilon) + else: + c_w[i][j][k] *= bn_gamma[k] / np.sqrt(bn_variance[k] + epsilon) + + if "depthwise" in layer.name: + depth_dim = c_w.shape[1] + else: + depth_dim = c_w.shape[2] + for l in range(depth_dim): + c_b[l] = (bn_gamma[l] * (c_b[l] - bn_mean[l]) / np.sqrt(bn_variance[l] + epsilon)) + bn_beta[l] + + print('fused weight max', c_w.max(), 'min', c_w.min()) + print('fused bias max', c_b.max(), 'min', c_b.min()) + # write the weights back to the layer + # after that, the model will be destroyed.. need a better way to pass the new weight + layer.set_weights([c_w, c_b]) + +def generate_weights(model, name='weights.h', format='hwc', shift_list=None): + # Quantize weights to 8-bits using (min,max) and write to file + f = open(name, 'w') + f.write('#include "nnom.h"\n\n') + f.close() + + for curr_idx, layer in enumerate(model.layers): + if (not layer.weights): + continue + + # before merging bn layer, check if the bn is "legally" after Conv + if('batch_normalization' in layer.name) and \ + ('conv' not in layer.inbound_nodes[0].inbound_layers.name): + raise Exception('Currently only support batch_normalization after conv', layer.name, + layer._inbound_nodes[0].inbound_layers[0].name) + + # try to fuse BN layer to convolutional + if ('conv' in layer.name) and \ + ('batch_normalization' in layer.outbound_nodes[0].outbound_layer.name): + fuse_bn_to_conv(layer) + + # generate weights and bias now + weight_dec_shift = 0 + print('weights for layer', layer.name) + for var in layer.weights: + var_name = str(var.name) + if("kernel" in var_name ): + var_values = layer.get_weights()[0] # weight + print(" weight:", var_name) + elif("bias" in var_name): + var_values = layer.get_weights()[1] # bias + print(" bias: ",var_name) + else: + continue + + print(" original shape: ", var_values.shape) + min_value = np.min(var_values) + max_value = np.max(var_values) + + int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value))))) + dec_bits = 7 - int_bits + print(" dec bit", dec_bits) + bSameAsKernel = False + if(is_shift_layer(layer)): + bSameAsKernel = False + inp = layer.input.name.replace(':','/').split('/')[0] + input_encoding = shift_list[inp] + if ("kernel" in var_name): + weight_dec_shift = dec_bits + else: + shift = input_encoding+weight_dec_shift-dec_bits + if(shift < 0): + bSameAsKernel 
= True + if(shift_list is None or bSameAsKernel): + # check if bias shift > weight shift, then reduce bias shift to weight shift + if ("kernel" in var_name): + weight_dec_shift = dec_bits + else: + if(dec_bits > weight_dec_shift): + dec_bits = weight_dec_shift + print(" new dec bit", dec_bits) + + # convert to [-128,128) or int8 + var_values = np.round(var_values * 2 ** dec_bits) + var_name = var_name.replace('/', '_') + var_name = var_name.replace(':', '_') + with open(name, 'a') as f: + f.write('#define ' + var_name.upper() + ' {') + # CHW format + if ('chw' in format): + if "dense" in var_name and "kernel" in var_name: + transposed_wts = np.transpose(var_values) + transposed_wts = convert_to_x4_q7_weights( + np.reshape(transposed_wts, (transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + # all other kernels, bias stay the same + else: + transposed_wts = var_values + # HWC format + else: + if (len(var_values.shape) == 3): # 1D convolution layer weights + transposed_wts = np.transpose(var_values, (2, 0, 1)) + elif (len(var_values.shape) == 4): # 2D convolution layer weights + transposed_wts = np.transpose(var_values, (3, 0, 1, 2)) + else: # fully connected layer weights or biases of any layer + # test, use opt weight reorder + if "dense" in var_name and "kernel" in var_name: + transposed_wts = np.transpose(var_values) + transposed_wts = convert_to_x4_q7_weights(np.reshape(transposed_wts ,(transposed_wts.shape[0], transposed_wts.shape[1], 1, 1))) + else: + transposed_wts = np.transpose(var_values) + + print(" reshape to:",transposed_wts.shape) + + with open(name, 'a') as f: + transposed_wts.tofile(f, sep=", ", format="%d") + f.write('}\n\n') + if ("bias" in var_name): + f.write('#define ' + var_name.upper() + '_SHIFT ' + '(' + str(dec_bits) + ')\n\n\n') + if ("kernel" in var_name ): + f.write('#define ' + var_name.upper() + '_SHIFT ' + '(' + str(dec_bits) + ')\n\n') + """ + # for checking the quantised and dequantised range. + with K.tf.Session() as session: + # convert back original range but quantized to 8-bits or 256 levels + var_values = var_values / (2 ** dec_bits) + var_values = session.run(K.tf.assign(var, var_values)) + print(' '+var_name + ' number of wts/bias: ' + str(var_values.shape) + \ + ' dec bits: ' + str(dec_bits) + \ + ' max: (' + str(np.max(var_values)) + ',' + str(max_value) + ')' + \ + ' min: (' + str(np.min(var_values)) + ',' + str(min_value) + ')') + """ + +def layers_output_ranges(model, x_test, quantize_method='max_min', calibrate_size=1000): + # limit the test data size + np.random.shuffle(x_test) + if(x_test.shape[0] > calibrate_size): + x_test = x_test[:1000] + # test, show the output ranges + shift_list = {} + # FIXME: only support one input + if(type(model.layers[0]) != InputLayer): + L = [model.input] + model.layers + else: + L = model.layers + last_layer = None + + for layer in L: # layer loop + if("input" in layer.name): + features = x_test + else: + # batch_normalization will need to be handled differently, since we are fusing the weight to its predecessor. 
+ # sigmoid and tanh are different, their shift is fixed to 7 + if(is_shift_layer(layer) or + ('batch_normalization' in layer.name)): + layer_model = Model(inputs=model.input, outputs=layer.output) + features = layer_model.predict(x_test) + else: + # leave the features not changed, so this layer shift will be the same + # as its inputs + pass + # calculate no saturation shift + max_val = features.max() + min_val = features.min() + int_bits = int(np.ceil(np.log2(max(abs(max_val), abs(min_val))))) + dec_bits = 7 - int_bits + + # saturation shift, using KLD method + # Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + if('kld' in quantize_method and not is_shift_fixed(layer) and "input" not in layer.name and "dense" not in layer.name): # test, also do not use kld in input layer + import scipy.stats + abs_max = max(abs(max_val), abs(min_val)) + small_var = 1e-5 + bins = np.arange(-abs_max, abs_max, abs_max/2048*2) + q_bins = np.arange(-abs_max, abs_max, abs_max/256*2) + flat_hist = np.histogram(features.flatten(), bins=bins)[0] + kl_loss = [] + kl_shifts = [] + for shift in range(4): + t = 2 ** (dec_bits + shift) # 2-based threshold + act = np.round(features.flatten() * t) + act = act / t + act = np.clip(act, -128/t, 127/t) + act = np.histogram(act, bins=q_bins)[0] + act_hist = np.zeros(2047) + chunk = int(2048/256) + for i in range(int(255)): + none_zero = np.count_nonzero(flat_hist[i*chunk:(i+1)*chunk]) + if none_zero == 0: + continue + for j in range(chunk): + act_hist[i*chunk+j] = act[i]/none_zero if flat_hist[i*chunk+j] != 0 else 0 + flat_hist[flat_hist==0] = small_var + act_hist[act_hist==0] = small_var + kl = scipy.stats.entropy(flat_hist, act_hist) + kl_loss.append(kl) + kl_shifts.append(dec_bits + shift) + """ + ax = plt.subplot(8, 1, shift+1) + ax.plot(flat_hist) + ax.plot(act_hist) + """ + new_dec = kl_shifts[np.argmin(kl_loss)] # set the dec_bit to the KLD results + #plt.show() + print("KLD loss", kl_loss) + print("KLD shift", kl_shifts) + if(new_dec != dec_bits): + print(layer.name,"is using KLD method, original shift",dec_bits, "KLD results", new_dec) + dec_bits = new_dec + + print( layer.name, "max value:", max_val, "min value:", min_val,"dec bit", dec_bits) + # record the shift + if(type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + shift_list[layer.name.split(':')[0]] = dec_bits + else: + shift_list[layer.name] = dec_bits + if ('batch_normalization' in layer.name): + shift_list[last_layer.name] = dec_bits # use the bn layer shift to update the last layer. 
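+            # Illustrative example (editor's addition, not part of the original script):
+            # for Q7 data the fraction bits are chosen so the observed range just fits,
+            # e.g. max(|min_val|, |max_val|) = 5.2 gives int_bits = ceil(log2(5.2)) = 3 and
+            # dec_bits = 7 - 3 = 4, i.e. values are stored as round(x * 2**4) in int8
+            # (5.2 * 16 = 83, which still fits below 127).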
+ last_layer = layer + + LM = {} + for layer in model.layers: + LM[layer.name] = layer + L = [l for l in model.layers[1:]] + L.reverse() + + def update_previous_layer_shift(layer, Q): + if(type(layer.input) == list): + for inp in layer.input: + iname = inp.name.split('/')[0] + if('input' in iname): + continue + shift_list[iname] = Qmin + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], Q) + else: + iname = layer.input.name.split('/')[0] + if('input' in iname): + return + shift_list[iname] = Qmin + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], Q) + for layer in L: + if(type(layer.input) == list): + iname = layer.input[0].name.split('/')[0] + Qmin = shift_list[iname] + for inp in layer.input: + iname = inp.name.split('/')[0] + if(shift_list[iname] < Qmin): + Qmin = shift_list[iname] + if(shift_list[iname] != Qmin): + bFlag = True + for inp in layer.input: + iname = inp.name.split('/')[0] + shift_list[iname] = Qmin + if(not is_shift_layer(LM[iname])): + update_previous_layer_shift(LM[iname], Qmin) + print('set shift', Qmin, 'for the input of', layer.name, ':', [inp.name.split('/')[0] for inp in layer.input]) + if(not is_shift_layer(layer) or Qmin < shift_list[layer.name]): # update current layer's shift only when we cannot change the shift + shift_list[layer.name] = Qmin + print("shift list", shift_list) + return shift_list + +def generate_model(model, x_test, name='weights.h', format='hwc', quantize_method='max_min'): + shift_list = layers_output_ranges(model, x_test, quantize_method=quantize_method) + generate_weights(model, name=name, format=format, shift_list=shift_list) + if(type(model.layers[0]) != InputLayer): + L = [model.input] + model.layers + else: + L = model.layers + with open(name,'a') as fp: + fp.write('\n/* output enconding for each layer */\n') + for layer in L: + if(type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + iname = layer.name.split(':')[0] + else: + iname = layer.name + fp.write('#define %s_OUTPUT_SHIFT %s\n'%(iname.upper(), shift_list[iname])) + fp.write('\n/* bias shift and output shift for each layer */\n') + for layer in model.layers: + if(is_shift_layer(layer)): + iname = layer.name.upper() + if(len(layer.weights) == 2 and + 'kernel' in layer.weights[0].name and + 'bias' in layer.weights[1].name): + kname = layer.weights[0].name.upper().replace('/', '_').replace(':', '_') + bname = layer.weights[1].name.upper().replace('/', '_').replace(':', '_') + inp = layer.input.name.replace(':','/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_SHIFT+{2}_SHIFT-{0}_OUTPUT_SHIFT)\n'.format( + iname, inp, kname)) + fp.write('#define {0}_BIAS_LSHIFT ({1}_OUTPUT_SHIFT+{2}_SHIFT-{3}_SHIFT)\n'.format( + iname, inp, kname, bname)) + fp.write('#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format(iname)) + fp.write('#if {0}_BIAS_LSHIFT < 0\n#error {0}_BIAS_RSHIFT must be bigger than 0\n#endif\n'.format(iname)) + # add, sub + elif ('add' in layer.name or + 'subtract' in layer.name): + # only consider the first, they have been set to same in out_put_range() + inp = layer.input[0].name.replace(':','/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_SHIFT-{0}_OUTPUT_SHIFT)\n'.format( + iname, inp)) + fp.write('#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format(iname)) + # mult is different, Q3.4 * Q3.4 = Q6.8. if mult out is Q4.3, then shift (Q.4+q.4)-Q.3=5. Am I right? 
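+                # Worked example (editor's addition): fraction bits add under multiplication,
+                # e.g. Q.4 * Q.4 -> Q.8; if the recorded output format is Q.3 the product must
+                # be right-shifted by 4 + 4 - 3 = 5, which matches the
+                # {input}_OUTPUT_SHIFT*2 - {layer}_OUTPUT_SHIFT macro emitted just below.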
+ elif ('multiply' in layer.name ): + inp = layer.input[0].name.replace(':','/').split('/')[0].upper() + fp.write('#define {0}_OUTPUT_RSHIFT ({1}_OUTPUT_SHIFT*2-{0}_OUTPUT_SHIFT)\n'.format( + iname, inp)) + fp.write('#if {0}_OUTPUT_RSHIFT < 0\n#error {0}_OUTPUT_RSHIFT must be bigger than 0\n#endif\n'.format(iname)) + + fp.write('\n/* weights for each layer */\n') + LI = {} + ID = 0 + def is_skipable_layer(layer): + # FIXME: add more that could be skiped + if('lambda' in layer.name or + 'dropout' in layer.name or + 'batch_normalization' in layer.name or + ('flatten' in layer.name and 'chw' not in format)): # flatten layer can be skipped in HWC but have to present in CHW + return True + return False + for id,layer in enumerate(L): + if(is_skipable_layer(layer)): + inp = layer.input.name.replace(':','/').split('/')[0] + LI[layer.name] = (LI[inp][0], layer) + else: + if(type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + LI[layer.name.split(':')[0]] = (ID, layer) + else: + LI[layer.name] = (ID, layer) + ID += 1 + + if ('input' in layer.name or not layer.weights): + continue + for var in layer.weights: + var_name = str(var.name).replace('/', '_').replace(':', '_') + if("kernel" in var_name): + fp.write('static const int8_t %s_weights[] = %s;\n'%(layer.name, var_name.upper())) + fp.write('static const nnom_weight_t %s_w = { (const void*)%s_weights, %s_OUTPUT_RSHIFT};\n'%(layer.name,layer.name, layer.name.upper())) + elif("bias" in var_name): + fp.write('static const int8_t %s_bias[] = %s;\n'%(layer.name, var_name.upper())) + fp.write('static const nnom_bias_t %s_b = { (const void*)%s_bias, %s_BIAS_LSHIFT};\n'%(layer.name,layer.name, layer.name.upper())) + fp.write('\n/* nnom model */\n') + # FIXME: now only support one input and one output + sz = 1 + for d in model.input.shape[1:]: + sz = sz*d + fp.write('static int8_t nnom_input_data[%d];\n'%(sz)) + sz = 1 + for d in model.output.shape[1:]: + sz = sz*d + fp.write('static int8_t nnom_output_data[%d];\n'%(sz)) + fp.write('static nnom_model_t* nnom_model_create(void)\n{\n') + fp.write('\tstatic nnom_model_t model;\n') + if(ID>32): + fp.write('\tnnom_layer_t ** layer = malloc(sizeof(nnom_layer_t *)*%d);\n'%(ID+1)) + fp.write('\tif(NULL == layer) return NULL;\n') + else: + fp.write('\tnnom_layer_t* layer[%d];\n'%(ID+1)) + fp.write('\n\tnew_model(&model);\n\n') + for layer in L: + if(is_skipable_layer(layer)): + continue + #FIXME: need a better solution to seperate the input 'tensor' from other layers + if (type(model.input) == tf.Tensor and type(model.layers[0]) != InputLayer): + id,_ = LI[layer.name.split(':')[0]] + else: + id,_ = LI[layer.name] + + if('input' in layer.name): + try: + inshape = layer.input_shape[0][1:] # new changes in tf2? 
+ except: + inshape = layer.shape[1:] + if (len(inshape) == 1): # 1-D input + fp.write('\tlayer[%d] = Input(shape(%d,1,1), nnom_input_data);\n' % (id, inshape[0])) + elif (len(inshape) == 2): # 1-D input + fp.write('\tlayer[%d] = Input(shape(1,%d,%d), nnom_input_data);\n' % (id, inshape[0], inshape[1])) + else: + fp.write('\tlayer[%d] = Input(shape%s, nnom_input_data);\n' % (id, inshape)) + + # convlutional + elif('conv1d' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if('depthwise' in layer.name): + fp.write('\tlayer[{0}] = model.hook(DW_Conv2D({1}, kernel(1,{2}), stride(1,{3}), dilation(1,{4}), PADDING_{5}, &{6}_w, &{6}_b), layer[{7}]);\n'.format( + id, 1, cfg['kernel_size'][0], cfg['strides'][0], cfg['dilation_rate'][0], cfg['padding'].upper(), + layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(Conv2D({1}, kernel(1,{2}), stride(1,{3}), dilation(1,{4}), PADDING_{5}, &{6}_w, &{6}_b), layer[{7}]);\n'.format( + id, cfg['filters'], cfg['kernel_size'][0], cfg['strides'][0], cfg['dilation_rate'][0], cfg['padding'].upper(), + layer.name, LI[inp][0])) + elif('conv2d' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if ('depthwise' in layer.name): + fp.write('\tlayer[{0}] = model.hook(DW_Conv2D({1}, kernel{2}, stride{3}, dilation{4}, PADDING_{5}, &{6}_w, &{6}_b), layer[{7}]);\n'.format( + id, 1, cfg['kernel_size'], cfg['strides'], cfg['dilation_rate'], cfg['padding'].upper(), + layer.name, LI[inp][0])) + else: + fp.write('\tlayer[{0}] = model.hook(Conv2D({1}, kernel{2}, stride{3}, dilation{4}, PADDING_{5}, &{6}_w, &{6}_b), layer[{7}]);\n'.format( + id, cfg['filters'], cfg['kernel_size'], cfg['strides'], cfg['dilation_rate'], cfg['padding'].upper(), + layer.name, LI[inp][0])) + # activations + elif('activation' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if(cfg['activation'] == 'relu'): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n'%(id, LI[inp][0])) + if(cfg['activation'] == 'tanh'): + fp.write('\tlayer[%s] = model.active(act_tanh(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + if(cfg['activation'] == 'sigmoid'): + fp.write('\tlayer[%s] = model.active(act_sigmoid(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + elif(cfg['activation'] == 'softmax'): + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n'%(id, LI[inp][0])) + elif('re_lu' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n'%(id, LI[inp][0])) + # pooling + elif('max_pooling' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if ('global' in layer.name): + fp.write('\tlayer[%s] = model.hook(GlobalMaxPool(), layer[%s]);\n' % (id, LI[inp][0])) + elif('2d' in layer.name): + fp.write('\tlayer[%s] = model.hook(MaxPool(kernel%s, stride%s, PADDING_%s), layer[%d]);\n'%( + id, cfg['pool_size'], cfg['strides'], cfg['padding'].upper(), LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(MaxPool(kernel(1,{1}), stride(1,{2}), PADDING_{3}), layer[{4}]);\n'.format( + id, cfg['pool_size'][0], cfg['strides'][0], cfg['padding'].upper(), LI[inp][0])) + elif('average_pooling' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if ('global' in layer.name): + # a global avg pool before softmax can be 
replace by sumpool in MCU (recommend) + if(layer == model.layers[-2] and 'Softmax' in model.layers[-1].output.name): + print(layer.name, 'has been replaced by GlobalSumPool()') + fp.write('\tlayer[%s] = model.hook(GlobalSumPool(), layer[%s]);\n' % (id, LI[inp][0])) + else: + fp.write('\tlayer[%s] = model.hook(GlobalAvgPool(), layer[%s]);\n' % (id, LI[inp][0])) + elif('2d' in layer.name): + fp.write('\tlayer[%s] = model.hook(AvgPool(kernel%s, stride%s, PADDING_%s), layer[%d]);\n'%( + id, cfg['pool_size'], cfg['strides'], cfg['padding'].upper(), LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(AvgPool(kernel(1,{1}), stride(1,{2}), PADDING_{3}), layer[{4}]);\n'.format( + id, cfg['pool_size'][0], cfg['strides'][0], cfg['padding'].upper(), LI[inp][0])) + elif ('up_sampling' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if('2d' in layer.name): + fp.write('\tlayer[%s] = model.hook(UpSample(kernel%s), layer[%d]);\n'%(id, cfg['size'], LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(UpSample(kernel(1,{1})), layer[{2}]);\n'.format( + id, cfg['size'][0], LI[inp][0])) + # zero padding + elif ('zero_padding' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if('2d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(ZeroPadding(border({1},{2},{3},{4})), layer[{5}]);\n'.format( + id, cfg['padding'][0][0], cfg['padding'][0][1], cfg['padding'][1][0],cfg['padding'][1][1], LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(ZeroPadding(border(0,0,{1},{2})), layer[{3}]);\n'.format( + id, cfg['padding'][0], cfg['padding'][1], LI[inp][0])) + # Cropping + elif ('cropping' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + if('2d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(Cropping(border({1},{2},{3},{4})), layer[{5}]);\n'.format( + id, cfg['cropping'][0][0], cfg['cropping'][0][1], cfg['cropping'][1][0],cfg['cropping'][1][1], LI[inp][0])) + elif('1d' in layer.name): + fp.write('\tlayer[{0}] = model.hook(Cropping(border(0,0,{1},{2})), layer[{3}]);\n'.format( + id, cfg['cropping'][0], cfg['cropping'][1], LI[inp][0])) + + # others + elif('flatten' in layer.name): # flatten is needed in CHW backend but not needed in HWC + inp = layer.input.name.replace(':', '/').split('/')[0] + fp.write('\tlayer[%s] = model.hook(Flatten(), layer[%s]);\n'%(id, LI[inp][0])) + elif('concatenate' in layer.name): + inps = [input.name.replace(':','/').split('/')[0] for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]'%(LI[inp][0]) + cfg = layer.get_config() + fp.write('\tlayer[%s] = model.mergex(Concat(%s), %s%s);\n'%( + id, cfg['axis'], len(inps), inX)) + elif('add' in layer.name): + inps = [input.name.replace(':','/').split('/')[0] for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]'%(LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(Add(%s_OUTPUT_RSHIFT), %s%s);\n'%( + id, layer.name.upper(), len(inps), inX)) + elif('subtract' in layer.name): + inps = [input.name.replace(':','/').split('/')[0] for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]'%(LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(Sub(%s_OUTPUT_RSHIFT), %s%s);\n'%( + id, layer.name.upper(), len(inps), inX)) + elif('multiply' in layer.name): + warnings.warn("Warning mutiply is under testing") + inps = [input.name.replace(':','/').split('/')[0] 
for input in layer.input] + inX = '' + for inp in inps: + inX += ' ,layer[%d]'%(LI[inp][0]) + fp.write('\tlayer[%s] = model.mergex(Mult(%s_OUTPUT_RSHIFT), %s%s);\n'%( + id, layer.name.upper(), len(inps), inX)) + elif('dense' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + cfg = layer.get_config() + fp.write('\tlayer[{0}] = model.hook(Dense({1}, &{2}_w, &{2}_b), layer[{3}]);\n'.format( + id, cfg['units'], layer.name, LI[inp][0])) + elif('softmax' in layer.name): + inp = layer.input.name.replace(':','/').split('/')[0] + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n'%(id, LI[inp][0])) + else: + raise Exception('unsupported layer', layer.name, layer) + + """ + # temporary fixed for activations attached into layers in construction + def is_activation_attached(layer): + if(("Softmax" in layer.output.name and "softmax" not in layer.name)or + ("Relu" in layer.output.name and "re_lu" not in layer.name) or + ("Sigmoid" in layer.output.name and "sigmoid" not in layer.name) or + ("Tanh" in layer.output.name and "tanh" not in layer.name)): + return True + return False + if "input" not in layer.name and is_activation_attached(layer): + inp = layer.output.name.replace(':', '/').split('/')[0] + cfg = layer.get_config() + if(cfg['activation'] == 'relu'): + fp.write('\tlayer[%s] = model.active(act_relu(), layer[%s]);\n'%(id, LI[inp][0])) + if(cfg['activation'] == 'tanh'): + fp.write('\tlayer[%s] = model.active(act_tanh(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + if(cfg['activation'] == 'sigmoid'): + fp.write('\tlayer[%s] = model.active(act_sigmoid(%s_OUTPUT_SHIFT), layer[%s]);\n'%(id, inp.upper(), LI[inp][0])) + elif(cfg['activation'] == 'softmax'): + fp.write('\tlayer[%s] = model.hook(Softmax(), layer[%s]);\n'%(id, LI[inp][0])) + """ + + # FIXME, test later. + if('softmax' in layer.name + or ('activation' in layer.name and layer.get_config()['activation'] == 'softmax')): + fp.write('\tlayer[%s] = model.hook(Output(shape(%s,1,1), nnom_output_data), layer[%s]);\n'%(id+1, layer.output.shape[1], id)) + elif len(layer.output.shape) == 4: + fp.write('\tlayer[%s] = model.hook(Output(shape%s, nnom_output_data), layer[%s]);\n'%(id+1, layer.output.shape[1:], id)) + elif len(layer.output.shape) == 3: + fp.write('\tlayer[%s] = model.hook(Output(shape(1,%s,%s), nnom_output_data), layer[%s]);\n'%(id+1, layer.output.shape[1], layer.output.shape[2], id)) + elif len(layer.output.shape) == 2: + fp.write('\tlayer[%s] = model.hook(Output(shape(%s,1,1), nnom_output_data), layer[%s]);\n'%(id+1, layer.output.shape[1], id)) + else: + raise Exception('unsupported output shape of the last layer', layer.name, layer) + fp.write('\tmodel_compile(&model, layer[0], layer[%s]);\n'%(id+1)) + if(ID>32): + fp.write('\tfree(layer);\n') + fp.write('\treturn &model;\n}\n') + with open('.shift_list','w') as fp: + fp.write(str(shift_list)) + +def evaluate_model(model, x_test, y_test, running_time=False, to_file='evaluation.txt'): + # Score trained model. 
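+    # Editor's note: when y_test is 2-D it is treated as one-hot encoded labels; the
+    # confusion matrix below recovers class indices with argmax(axis=1) on both the
+    # labels and the predictions.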
+ scores = model.evaluate(x_test, y_test, verbose=2) + print('Test loss:', scores[0]) + print('Top 1:', scores[1]) + + if(len(y_test.shape)>1): + # predictions = model.predict(x_test) + # output = tf.keras.metrics.top_k_categorical_accuracy(y_test, predictions, k=2) + # # with tf.Session() as sess: + # # result = sess.run(output) + # result = + # print("Top 2:",result) + + predictions = model.predict(x_test) + matrix = metrics.confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1)) + print(matrix) + + run_time = 0 + if running_time: + # try to calculate the time + T = time.time() + for i in range(10): + model.predict(x_test) + T = time.time() - T + run_time = round((T / 10 / x_test.shape[0] * 1000 * 1000), 2) + print("Runing time:",run_time , "us" ) + # + with open(to_file, 'w') as f: + f.write("Runing time: "+ str(run_time) + "us" + "\n") + f.write('Test loss:'+ str(scores[0]) + "\n") + f.write('Top 1:'+ str(scores[1])+ "\n") + if (len(y_test.shape) > 1): + #f.write("Top 2:"+ str(result)+ "\n") + #f.write(str(matrix)) + for row in matrix: + row.tofile(f, sep=',') + f.write("\n") + + # try to check the weight and bias dec ranges + for layer in model.layers: + if (not layer.weights): + continue + for var in layer.weights: + var_name = str(var.name) + if ("kernel" in var_name): + var_values = layer.get_weights()[0] # weight + else: + var_values = layer.get_weights()[1] # bias + min_value = np.min(var_values) + max_value = np.max(var_values) + intt = int(np.ceil(np.log2(max(abs(min_value), abs(max_value))))) + dec = 7 - intt + print(var_name, "Dec num:", dec) + return scores + +def f2q(d, Q): + '''To convert a number from floating point to Qm.n format: + 1. Multiply the floating point number by 2n + 2. Round to the nearest integer + ''' + return np.round(d*2**Q) + + +def q2f(d, Q): + '''To convert a number from Qm.n format to floating point: + 1. Convert the number to floating point as if it were an integer, in other words remove the binary point + 2. 
Multiply by 2-n + ''' + return d*2**-Q + +def show_weights(w, name): + sz = 1 + for s in w.shape: + sz = sz*s + aL = w.reshape(sz,) + MIN,MAX=min(aL),max(aL) + Q = int(np.ceil(np.log2(max(abs(MIN),abs(MAX))))) + Q = 7-Q + qL = f2q(aL,Q) + qL = q2f(qL,Q) + plt.figure(figsize=(18, 3)) + plt.subplot(131) + plt.title(name) + plt.plot(aL) + plt.grid() + aL.sort() + plt.plot(aL,'r') + plt.grid() + plt.subplot(132) + plt.title('Q%s'%(Q)) + qL.sort() + plt.plot(aL,'r') + plt.plot(qL,'g') + plt.grid() + plt.subplot(133) + plt.hist(aL,100) + plt.title('hist') + plt.grid() + plt.show() + +def compare(a,b,name): + sz = 1 + for s in a.shape: + sz = sz*s + aL = a.reshape(sz,) + bL = b.reshape(sz,) + assert(len(aL) == len(bL)) + Z = list(zip(aL,bL)) + Z.sort(key=lambda x: x[0]) + aL1,bL1=zip(*Z) + plt.figure(figsize=(18, 3)) + plt.subplot(131) + plt.plot(aL) + plt.plot(aL1,'r') + plt.grid() + plt.title('tf-%s'%(name)) + plt.subplot(133) + plt.plot(bL1,'g') + plt.plot(aL1,'r') + plt.grid() + plt.title('compare') + plt.subplot(132) + bL1=list(bL1) + bL1.sort() + plt.plot(bL) + plt.plot(bL1,'g') + plt.grid() + plt.title('nn-%s'%(name)) + plt.show() + diff --git a/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local.c b/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local.c new file mode 100644 index 000000000..5c514b21b --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local.c @@ -0,0 +1,1689 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Notice: + * Code in this file inlcudes derivative works from CMSIS + * Please check the LICENSE file for detial. + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + * 2019-03-19 Jianjia Ma Local C implementation partly from CMSIS-NN + * 2019-06-19 Jianjia Ma Implement CHW functions + */ + +#include "nnom.h" +#include "nnom_local.h" + +// modified from CMSIS-NN test_ref +void local_avepool_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift); + } + } + } +} + +void local_avepool_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input 
image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t ch_offset; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y; + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[ch_offset + (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + Im_out[i_ch_in*dim_im_out_x*dim_im_out_y + (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift); + } + } + } +} + +// modified from CMSIS-NN test_ref +void local_maxpool_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int max = -129; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max) + { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max; + } + } + } +} + +void local_maxpool_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // 
output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t ch_offset; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + ch_offset = i_ch_in * dim_im_out_x * dim_im_out_y; + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int max = -129; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + if (Im_in[i_ch_in * dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)] > max) + { + max = Im_in[i_ch_in * dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)]; + } + } + } + } + Im_out[ch_offset+(i_x + i_y * dim_im_out_x)] = max; + } + } + } +} + +// temporary for the thesis +// shift according to the maximum +void local_sumpool_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t *buf = (int32_t *)bufferA; + // stage2 + // int32_t max_abs = 0; + // int32_t output_shift; + // size_t output_size = dim_im_out_x * dim_im_out_x * ch_im_in; + + // save in 32bit + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + } + } + } + // 32bit + buf[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum; + } + } + } + + // // find max amount results + // for (int i = 0; i < output_size; i++) + // { + // int32_t val = buf[i]; + // if (val < 0) + // val = -val; + // if (val > max_abs) + // max_abs = val; + // } + // // find best shift to cover the max + // for (output_shift = 0;; output_shift++) + // { + // if (127 * (1 + output_shift) >= max_abs) + // break; + // } + + // // shift the results + // for (int i = 0; i < output_size; i++) + // { + // Im_out[i] = buf[i] >> output_shift; + // } + //return output_shift; +} + +// temporary for the thesis +// shift according to the maximum +void local_sumpool_q7_CHW(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const 
uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q7_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t *buf = (int32_t *)bufferA; + int32_t i_ch_offset, o_ch_offset; + // stage2 + // int32_t max_abs = 0; + // int32_t output_shift; + // size_t output_size = dim_im_out_x * dim_im_out_x * ch_im_in; + + // save in 32bit + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + i_ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y; + o_ch_offset = i_ch_in*dim_im_out_x*dim_im_out_y; + + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_offset + (k_x + k_y * dim_im_in_x)]; + } + } + } + // 32bit + buf[o_ch_offset + (i_x + i_y * dim_im_out_x)] = sum; + } + } + } + + // // find max amount results + // for (int i = 0; i < output_size; i++) + // { + // int32_t val = buf[i]; + // if (val < 0) + // val = -val; + // if (val > max_abs) + // max_abs = val; + // } + // // find best shift to cover the max + // for (output_shift = 0;; output_shift++) + // { + // if (127 * (1 + output_shift) >= max_abs) + // break; + // } + + // // shift the results + // for (int i = 0; i < output_size; i++) + // { + // Im_out[i] = buf[i] >> output_shift; + // } + //return output_shift; +} + +// customised up sample pooling +void local_up_sampling_q7_HWC(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q7_t *Im_out) +{ + int16_t i_x, i_y; + + // for loop for each pixel in input image. + for (i_y = 0; i_y < dim_im_in_y; i_y++) + { + for (i_x = 0; i_x < dim_im_in_x; i_x++) + { + // copy all the channels together. 
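+            /* Editor's note (illustrative): this implements nearest-neighbour up-sampling,
+             * e.g. with a 2x2 kernel every HWC input pixel (i_y, i_x) appears to be
+             * replicated into a 2x2 block of the output, keeping all ch_im_in channel
+             * values unchanged. */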
+ const q7_t *p_in = Im_in + (i_y * dim_im_in_x + i_x ) * ch_im_in; + q7_t *pout = Im_out + (i_y * dim_im_in_x * dim_kernel_x * dim_kernel_y + i_x * dim_kernel_y) * ch_im_in; + + // copy along x axis + for(int i = 0; i> out_shift[shift_idx]), 8); + } + } + } +} + +void local_convolve_CHW_q7_nonsquare(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i, j, k, l, m, n; + long conv_out; + int in_row, in_col; + int shift_idx, shift_steps; + if(q_type == NNOM_QTYPE_PER_AXIS) + shift_steps = 1; + else + shift_steps = 0; + + for(i = 0, shift_idx = 0; i < ch_im_out; i++, shift_idx += shift_steps) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + if(bias) + conv_out = ((q31_t)(bias[i]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t) NNOM_ROUND(out_shift[shift_idx]); + + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m * dilation_y - padding_y; + in_col = stride_x * k + n * dilation_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) + l * dim_im_in_x * dim_im_in_y] * + wt[(m * dim_kernel_x + n) * ch_im_in * ch_im_out + l * ch_im_out + i]; + } + } + } + } + Im_out[i * dim_im_out_x * dim_im_out_y + (j * dim_im_out_x + k)] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8); + } + } + } +} + +#define FALSE 0 +#define TRUE 1 + +static int alg_deconv2d_calculate_position( + int pos, + int stride, + int padding, + int dim_kernel, + int dim_in, + int* in_start, + int* kernel_start, + int* kernel_end) +{ + int is_zero = FALSE; + int of, adj; + is_zero = FALSE; + *in_start = pos/stride; + of = pos%stride; + *kernel_start = padding - of; + if(*kernel_start >= 0) { + adj = MIN(*in_start, *kernel_start/stride); + *kernel_start -= adj*stride; + *in_start -= adj; + } else { + adj = -*kernel_start + dim_kernel; + if(adj<=stride) { + is_zero = TRUE; + } else { + adj = MIN(dim_in-1-*in_start, adj/stride); + *kernel_start += adj*stride; + *in_start += adj; + } + } + of = dim_kernel - 1 - *kernel_start; + adj = MIN(dim_in-1-*in_start, of/stride); + *kernel_end = *kernel_start + adj*stride; + + return is_zero; +} + +void local_conv_trans_HWC_q7_nonsquare(const int8_t * Im_in, + const uint16_t dim_im_in_x, // input image dimention x + const 
uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +// { +// int ox, oy, oc, ky, kx, kc, ix, iy; +// int conv_out; +// int in_pix_loc, wt_loc; + +// (void)dilation_y; +// (void)dilation_x; + +// // padding and stride are applied to output +// for (oc = 0; oc < ch_im_out; oc++) +// { +// for (oy = 0; oy < dim_im_out_y; oy++) +// { +// for (ox = 0; ox < dim_im_out_x; ox++) +// { +// conv_out = ((q31_t)(bias[oc]) << bias_shift) + NNOM_ROUND(out_shift); + +// for (ky = 0; ky < dim_kernel_y; ky++) +// { +// for (kx = 0; kx < dim_kernel_x; kx++) +// { +// // input y, input x location +// iy = oy / stride_y + ky - padding_y; +// ix = ox / stride_x + kx - padding_x; + +// if(ix >= 0 && iy >= 0 && ix < dim_im_in_y && iy< dim_im_in_y) +// { +// in_pix_loc = (iy * dim_im_in_x + ix) * ch_im_in; +// wt_loc = oc * ch_im_in * dim_kernel_y * dim_kernel_x + (ky * dim_kernel_x + kx) * ch_im_in; + +// for (kc = 0; kc < ch_im_in; kc++) +// { +// conv_out += Im_in[in_pix_loc + kc] * wt[wt_loc + kc]; +// } +// } +// } +// } + +// Im_out[oc + (oy * dim_im_out_x + ox) * ch_im_out] = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8); +// } +// } +// } +// } + +{ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + int kernel_start_x,kernel_end_x; + int kernel_start_y,kernel_end_y; + int in_row_start, in_col_start; + int is_zero; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out_y; j++) { + is_zero = alg_deconv2d_calculate_position(j, stride_y, padding_y, dim_kernel_y, + dim_im_in_y, &in_row_start, &kernel_start_y, &kernel_end_y); + + if(is_zero) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + conv_out = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8); + for (k = 0; k < dim_im_out_x; k++) { + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) conv_out; + } + continue; + } + + for (k = 0; k < dim_im_out_x; k++) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + + is_zero = alg_deconv2d_calculate_position(k, stride_x, padding_x, dim_kernel_x, + dim_im_in_x, &in_col_start, &kernel_start_x, &kernel_end_x); + + if(is_zero) { + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = conv_out; + continue; + } + + for (m = kernel_start_y, in_row = in_row_start; m <= kernel_end_y; m+=stride_y, in_row++) { + for (n = kernel_start_x, in_col = in_col_start; n <= kernel_end_x; n+=stride_x, in_col++) { + if ((in_row >= 0) && (in_col >= 0) && + (in_row < dim_im_in_y) && (in_col < dim_im_in_x)) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l]; + } + } + } + } + + Im_out[i 
+ (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8); + } + } + } +} + +void local_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult; + int i_ker_y, i_ker_x; + int i_out = 0; + int shift_idx, shift_steps; + int ch_mult = ch_im_out / ch_im_in; + q31_t conv_out; + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + const int32_t base_idx_y = stride_y * i_out_y - padding_y; + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - padding_x; + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for(i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + i_ch_out = i_ch_mult + i_ch_in * ch_mult; + int32_t ker_y_start = MAX(0, -(base_idx_y-(dilation_y-1))/dilation_y); + int32_t ker_x_start = MAX(0, -(base_idx_x-(dilation_x-1))/dilation_x); + int32_t ker_y_end = MIN(dim_kernel_y, (dim_im_in_y - base_idx_y + (dilation_y-1))/dilation_y); + int32_t ker_x_end = MIN(dim_kernel_x, (dim_im_in_x - base_idx_x + (dilation_x-1))/dilation_x); + + shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? 
i_ch_out : 0; + if (bias) + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y * dilation_y; + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x * dilation_x; + int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) * ch_im_in + i_ch_in; + int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * (ch_im_in * ch_mult) + i_ch_out; + conv_out += Im_in[in_pix_loc] * wt[wt_loc]; + } + } + Im_out[i_out++] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8); + } + } + } + } +} + +void local_depthwise_separable_conv_CHW_q7_nonsquare(const q7_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult; + int i_ker_y, i_ker_x; + int i_out = 0; + int shift_idx, shift_steps; + int ch_mult = ch_im_out / ch_im_in; + q31_t conv_out; + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + const int32_t base_idx_y = stride_y * i_out_y - padding_y; + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - padding_x; + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + i_ch_out = i_ch_mult + i_ch_in * ch_mult; + int32_t ker_y_start = MAX(0, -(base_idx_y-(dilation_y-1))/dilation_y); + int32_t ker_x_start = MAX(0, -(base_idx_x-(dilation_x-1))/dilation_x); + int32_t ker_y_end = MIN(dim_kernel_y, (dim_im_in_y - base_idx_y + (dilation_y-1))/dilation_y); + int32_t ker_x_end = MIN(dim_kernel_x, (dim_im_in_x - base_idx_x + (dilation_x-1))/dilation_x); + + shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? 
i_ch_out : 0; + if (bias) + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y * dilation_y; + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x * dilation_x; + int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) + i_ch_in * dim_im_in_x * dim_im_in_y; + int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out; + conv_out += Im_in[in_pix_loc] * wt[wt_loc]; + } + } + Im_out[i_ch_out * dim_im_out_x * dim_im_out_y + (i_out_y * dim_im_out_x + i_out_x)] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8); + } + } + } + } + +} + + +void local_zero_padding_HWC_q7(const q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q7_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y) // output image dimension y +{ + int i, size; + q7_t * p_out = Im_out; + + // top rows + size = dim_im_out_x*ch_im_in*padding_top; + nnom_memset(p_out, 0, size); + p_out += size; + + // middle + for(i=0; i> 2; + const q7_t *pB = pM; + const q7_t *pA; + q7_t *pO = pOut; + + while (rowCnt) + { + pA = pV; + q31_t sum = (q31_t) NNOM_ROUND(out_shift); + q31_t sum2 = (q31_t) NNOM_ROUND(out_shift); + q31_t sum3 = (q31_t) NNOM_ROUND(out_shift); + q31_t sum4 = (q31_t) NNOM_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + while (colCnt) + { + q7_t inA1 = *pA++; + q7_t inA3 = *pA++; + q7_t inA2 = *pA++; + q7_t inA4 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum += inA3 * inB1 + inA4 * inB2; + sum2 += inA3 * inB3 + inA4 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA3 * inB1 + inA4 * inB2; + sum4 += inA3 * inB3 + inA4 * inB4; + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q7_t)__NNOM_SSAT((sum >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum2 >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum3 >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out = (q31_t) NNOM_ROUND (out_shift); + pA = pV; + for (int j = 0; j < dim_vec; j++) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + + rowCnt--; + } +} + +void local_dot_q7(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t 
num_of_rows, // numCol of A + const uint16_t out_shift, // amount of right-shift for output + q7_t *pOut) // output operand) +{ + for (int i = 0; i < num_of_rows; i++) + { + int ip_out = (q31_t) NNOM_ROUND(out_shift); + for (int j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + } +} + +void local_fully_connected_q7_opt(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const q7_t *bias, q7_t *pOut, // output operand + q15_t *vec_buffer) +{ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q7_t *pA; + q7_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) + { + pA = pV; + q31_t sum; + q31_t sum2; + q31_t sum3; + q31_t sum4; + uint16_t colCnt = dim_vec >> 2; + + if(bias) + { + sum = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum2 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum3 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum4 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + } + else + { + sum = (q31_t) NNOM_ROUND(out_shift); + sum2 = (q31_t) NNOM_ROUND(out_shift); + sum3 = (q31_t) NNOM_ROUND(out_shift); + sum4 = (q31_t) NNOM_ROUND(out_shift); + } + + while (colCnt) + { + q7_t inA1 = *pA++; + q7_t inA3 = *pA++; + q7_t inA2 = *pA++; + q7_t inA4 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum += inA3 * inB1 + inA4 * inB2; + sum2 += inA3 * inB3 + inA4 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA3 * inB1 + inA4 * inB2; + sum4 += inA3 * inB3 + inA4 * inB4; + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q7_t)__NNOM_SSAT((sum >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum2 >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum3 >> out_shift), 8); + *pO++ = (q7_t)__NNOM_SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out; + if(bias) + ip_out=((q31_t)(*bias++) << bias_shift) + NNOM_ROUND(out_shift); + else + ip_out=(q31_t)NNOM_ROUND(out_shift); + + pA = pV; + for (int j = 0; j < dim_vec; j++) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + + rowCnt--; + } +} + +void local_fully_connected_q7(const q7_t *pV, // pointer to vector + const q7_t *pM, // pointer to matrix + const uint16_t dim_vec, // length of the vector + const uint16_t num_of_rows, // numCol of A + const uint16_t bias_shift, // amount of left-shift for bias + const uint16_t out_shift, // amount of right-shift for output + const q7_t *bias, q7_t *pOut, // output operand + q15_t *vec_buffer) +{ + if(bias) + { + for (int i = 0; i < num_of_rows; i++) + { + int ip_out = ((q31_t)(*bias++) << 
bias_shift) + NNOM_ROUND(out_shift); + for (int j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + } + } + else + { + for (int i = 0; i < num_of_rows; i++) + { + int ip_out = (q31_t)NNOM_ROUND(out_shift); + for (int j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8); + } + } +} + + +void local_softmax_q7(const q7_t *vec_in, const uint32_t dim_vec, q7_t *p_out) +{ + q31_t sum; + int32_t i; + uint8_t shift; + q15_t base; + base = -257; + + /* We first search for the maximum */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + base = vec_in[i]; + } + } + + /* + * So the base is set to max-8, meaning + * that we ignore really small values. + * anyway, they will be 0 after shrinking to q7_t. + */ + base = base - 8; + + sum = 0; + + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + shift = (uint8_t)__NNOM_USAT(vec_in[i] - base, 5); + sum += 0x1 << shift; + } + } + + /* This is effectively (0x1 << 20) / sum */ + int output_base = 0x100000 / sum; + + /* + * Final confidence will be output_base >> ( 13 - (vec_in[i] - base) ) + * so 128 (0x1<<7) -> 100% confidence when sum = 0x1 << 8, output_base = 0x1 << 12 + * and vec_in[i]-base = 8 + */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + /* Here minimum value of 13+base-vec_in[i] will be 5 */ + shift = (uint8_t)__NNOM_USAT(13 + base - vec_in[i], 5); + p_out[i] = (q7_t)__NNOM_SSAT((output_base >> shift), 8); + } + else + { + p_out[i] = 0; + } + } +} + + +// hard sigmoid, +// y=-1 if x < -2.5 +// y=1 if x > 2.5 +// otherwise y = 0.2 * x + 0.5 (y=0.20315 * x + 0.5) +void local_hard_sigmoid_q7(q7_t *data, uint32_t size, int16_t dec_bit) +{ + int16_t limit = 2.5f * (1 << dec_bit)-1; + int16_t offset = 64; // 0.5 * 128 + int16_t mult = 26; // 0.2 * 128 + + // int bit >= 0 + for(int i=0; i= limit) + data[i] = 127; + else + { + data[i] = ((int16_t)(data[i] * mult) >> dec_bit) + offset; + } + } + } + +// hard tanh +// y=-1 if x < -1 +// y=1 if x > 1 +// otherwise y = x +void local_hard_tanh_q7(q7_t *data, uint32_t size, int16_t dec_bit) +{ + int16_t int_bit = 7 - dec_bit; + int16_t limit = 1 << dec_bit; + + if(dec_bit == 7) + return; + + // int bit < 0 + if(int_bit < 0) + for(int i=0; i= limit) + data[i] = 127; + else + { + data[i] = data[i] >> (-int_bit); + } + } + else + // int bit >= 0 + for(int i=0; i= limit) + data[i] = 127; + else + { + data[i] = data[i] << int_bit; + } + } +} + +void local_sigmoid_q7(q7_t *data, uint32_t size, int16_t int_width) +{ + uint32_t i = size; + q7_t *pIn = data; + q7_t *pOut = data; + q7_t in; + q7_t out; + uint16_t shift_size = 3 - int_width; + // saturation if int bit too large + if(int_width > 3) + { + while (i) + { + if(*pIn++ > 0) + *pOut++ = 127; + else + *pOut++ = 0; + i--; + } + } + // otherwise search table + else + { + while (i) + { + in = *pIn++; + out = nnom_sigmoid_table_q7[(uint8_t)(in >> shift_size)]; + *pOut++ = out; + i--; + } + } +} + +void local_tanh_q7(q7_t *data, uint32_t size, int16_t int_width) +{ + uint32_t i = size; + q7_t *pIn = data; + q7_t *pOut = data; + q7_t in; + q7_t out; + uint16_t shift_size = 3 - int_width; + + // saturation if int bit too large + if(int_width > 3) + { + while (i) + { + in = *pIn++; + if(in > 0) + *pOut++ = 127; + else if ( in == 0) + *pOut++ = 0; + else + *pOut++ = -128; + i--; + } + } + // otherwise search table + else + { + while (i) + { + in = *pIn++; + out = 
nnom_tanh_table_q7[(uint8_t)(in >> shift_size)]; + *pOut++ = out; + i--; + } + } +} + +void local_relu_q7(q7_t *data, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + data[i] = 0; + } +} + +// alpha in q7 format with dec_bit=7 +void local_leaky_relu_q7(q7_t *data, q7_t alpha, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + { + data[i] = data[i] * alpha / 128; + } + } +} + +// alpha in q7 format with dec_bit=7 +// max and threshold has the same Q format with the activation +void local_adv_relu_q7(q7_t *data, q7_t negative_slope, q7_t max, q7_t threshold, uint32_t size) +{ + uint32_t i; + for (i = 0; i < size; i++) + { + // `f(x) = max_value` for `x >= max_value`, + // `f(x) = x` for `threshold <= x < max_value`, + // `f(x) = alpha * (x - threshold)` otherwise. + + if(data[i] > max) + data[i] = max; + if (data[i] < threshold) + data[i] = (data[i] - threshold) * negative_slope / 128; + } +} + +// matrix ops +void local_mult_q7(q7_t *pSrcA, + q7_t *pSrcB, + q7_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t product = pSrcA[i] * pSrcB[i]; + pDst[i] = (q7_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + +void local_add_q7(q7_t *pSrcA, + q7_t *pSrcB, + q7_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t sum = pSrcA[i] + pSrcB[i]; + pDst[i] = (q7_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + +void local_sub_q7(q7_t *pSrcA, + q7_t *pSrcB, + q7_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t sub = pSrcA[i] - pSrcB[i]; + pDst[i] = (q7_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + + + +void local_multiple_add_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src) +{ + uint32_t i, blk; + q31_t sum; + + for (i = 0; i < block_size; i++) + { + sum = 0; + for(blk=0; blk < num_block; blk++) + sum += p_src[blk][i]; + p_dst[i] = (q7_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + +void local_multiple_mult_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src) +{ + uint32_t i, blk; + q31_t product; + + for (i = 0; i < block_size; i++) + { + product = 1; + for(blk=0; blk < num_block; blk++) + product *= p_src[blk][i]; + p_dst[i] = (q7_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + +void local_multiple_sub_q7( q7_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q7_t **p_src) +{ + uint32_t i, blk; + q31_t sub; + + for (i = 0; i < block_size; i++) + { + sub = p_src[0][i]; + for(blk=1; blk < num_block; blk++) + sub -= p_src[blk][i]; + p_dst[i] = (q7_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 8); + } +} + + +void local_q7_to_q15_no_shift(const q7_t *src, q15_t *des, uint32_t size) +{ + // simple unloop + uint32_t count = size/8; + while (count-- > 0) + { + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + *des++ = (q15_t)*src++; + } + count = size%8; + while(count-- > 0) + *des++ = (q15_t)*src++; +} + +void local_q7_to_q15(const q7_t *src, q15_t *des, uint32_t size) +{ + // simple unloop + 
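// note: the left shift of 8 promotes each q7 sample to q15 without changing the
+ // value it represents, e.g. a q0.7 sample of 64 (0.5) becomes 16384 (0.5) in q0.15;
+ // the _no_shift variant above only widens the storage type and leaves the Q format
+ // for the caller to track. +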
uint32_t count = size/8; + while (count-- > 0) + { + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + *des++ = (q15_t)*src++<<8; + } + count = size%8; + while(count-- > 0) + *des++ = (q15_t)*src++<<8; +} + +// right shift q15 to q7 +void local_q15_to_q7(const q15_t *src, q7_t *des, uint32_t shift, uint32_t size) +{ + while(size-- >0) + { + *des = *src >> shift; + des++; + src++; + } +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local_q15.c b/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local_q15.c new file mode 100644 index 000000000..d78c3efc0 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/backends/nnom_local_q15.c @@ -0,0 +1,1602 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Notice: + * Code in this file inlcudes derivative works from CMSIS + * Please check the LICENSE file for detial. + * + * Change Logs: + * Date Author Notes + * 2020-10-05 Jianjia Ma The first version + */ + +#include "nnom.h" +#include "nnom_local.h" + +// modified from CMSIS-NN test_ref +void local_avepool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift); + } + } + } +} + +void local_avepool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + 
int16_t k_x, k_y; + int32_t ch_offset; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y; + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[ch_offset + (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + Im_out[i_ch_in*dim_im_out_x*dim_im_out_y + (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift); + } + } + } +} + +// modified from CMSIS-NN test_ref +void local_maxpool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int max = -32768; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max) + { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max; + } + } + } +} + +void local_maxpool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t ch_offset; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + ch_offset = i_ch_in * dim_im_out_x * dim_im_out_y; + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int max = -32768; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + if (Im_in[i_ch_in * 
dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)] > max) + { + max = Im_in[i_ch_in * dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)]; + } + } + } + } + Im_out[ch_offset+(i_x + i_y * dim_im_out_x)] = max; + } + } + } +} + +// shift according to the maximum +void local_sumpool_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t *buf = (int32_t *)bufferA; + // stage2 + // int32_t max_abs = 0; + // int32_t output_shift; + // size_t output_size = dim_im_out_x * dim_im_out_x * ch_im_in; + + // save in 32bit + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + } + } + } + // 32bit + buf[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = (q15_t)__NNOM_SSAT((sum >> output_shift), 16); + } + } + } + + // // find max amount results + // for (int i = 0; i < output_size; i++) + // { + // int32_t val = buf[i]; + // if (val < 0) + // val = -val; + // if (val > max_abs) + // max_abs = val; + // } + // // find best shift to cover the max + // for (output_shift = 0;; output_shift++) + // { + // if (127 * (1 + output_shift) >= max_abs) + // break; + // } + + // // shift the results + // for (int i = 0; i < output_size; i++) + // { + // Im_out[i] = buf[i] >> output_shift; + // } + //return output_shift; +} + +// temporary for the thesis +// shift according to the maximum +void local_sumpool_q15_CHW(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + const uint16_t output_shift, // output right shift + q7_t *bufferA, // a buffer for local storage, size = 4*output_size + q15_t *Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + int32_t *buf = (int32_t *)bufferA; + int32_t i_ch_offset, o_ch_offset; + // // stage2 + // int32_t max_abs = 0; + // int32_t output_shift; + // size_t output_size = dim_im_out_x * dim_im_out_x * 
ch_im_in; + + // save in 32bit + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + i_ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y; + o_ch_offset = i_ch_in*dim_im_out_x*dim_im_out_y; + + for (i_y = 0; i_y < dim_im_out_y; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + int sum = 0; + for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + { + for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) + { + sum += Im_in[i_ch_offset + (k_x + k_y * dim_im_in_x)]; + } + } + } + // 32bit + buf[o_ch_offset + (i_x + i_y * dim_im_out_x)] = (q15_t)__NNOM_SSAT((sum >> output_shift), 16); + } + } + } + + // // find max amount results + // for (int i = 0; i < output_size; i++) + // { + // int32_t val = buf[i]; + // if (val < 0) + // val = -val; + // if (val > max_abs) + // max_abs = val; + // } + // // find best shift to cover the max + // for (output_shift = 0;; output_shift++) + // { + // if (127 * (1 + output_shift) >= max_abs) + // break; + // } + + // // shift the results + // for (int i = 0; i < output_size; i++) + // { + // Im_out[i] = buf[i] >> output_shift; + // } + //return output_shift; +} + +// customised up sample pooling +void local_up_sampling_q15_HWC(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension x or W + const uint16_t dim_im_in_y, // input image dimension y or H + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t dim_im_out_x, // output image dimension x or W + const uint16_t dim_im_out_y, // output image dimension y or H + q7_t *bufferA, // a buffer for local storage, NULL by now + q15_t *Im_out) +{ + int16_t i_x, i_y; + + // for loop for each pixel in input image. + for (i_y = 0; i_y < dim_im_in_y; i_y++) + { + for (i_x = 0; i_x < dim_im_in_x; i_x++) + { + // copy all the channels together. 
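+ // note: this implements nearest-neighbour up-sampling: every input pixel, with all
+ // of its channels, is replicated into a dim_kernel_y x dim_kernel_x block of the
+ // output, so a 2x2 "kernel" simply doubles both spatial dimensions.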
+ const q15_t *p_in = Im_in + (i_y * dim_im_in_x + i_x ) * ch_im_in; + q15_t *pout = Im_out + (i_y * dim_im_in_x * dim_kernel_x * dim_kernel_y + i_x * dim_kernel_y) * ch_im_in; + + // copy along x axis + for(int i = 0; i> out_shift[shift_idx]), 16); + } + } + } +} + +void local_convolve_CHW_q15_nonsquare(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i, j, k, l, m, n; + int64_t conv_out; + int in_row, in_col; + int shift_idx, shift_steps; + if(q_type == NNOM_QTYPE_PER_AXIS) + shift_steps = 1; + else + shift_steps = 0; + + for(i = 0, shift_idx = 0; i < ch_im_out; i++, shift_idx += shift_steps) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + if(bias) + conv_out = ((q31_t)(bias[i]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m * dilation_y - padding_y; + in_col = stride_x * k + n * dilation_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) + l * dim_im_in_x * dim_im_in_y] * + wt[(m * dim_kernel_x + n) * ch_im_in * ch_im_out + l * ch_im_out + i]; + } + } + } + } + Im_out[i * dim_im_out_x * dim_im_out_y + (j * dim_im_out_x + k)] = (q15_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 16); + } + } + } +} + +#define FALSE 0 +#define TRUE 1 + +static int alg_deconv2d_calculate_position( + int pos, + int stride, + int padding, + int dim_kernel, + int dim_in, + int* in_start, + int* kernel_start, + int* kernel_end) +{ + int is_zero = FALSE; + int of, adj; + is_zero = FALSE; + *in_start = pos/stride; + of = pos%stride; + *kernel_start = padding - of; + if(*kernel_start >= 0) { + adj = MIN(*in_start, *kernel_start/stride); + *kernel_start -= adj*stride; + *in_start -= adj; + } else { + adj = -*kernel_start + dim_kernel; + if(adj<=stride) { + is_zero = TRUE; + } else { + adj = MIN(dim_in-1-*in_start, adj/stride); + *kernel_start += adj*stride; + *in_start += adj; + } + } + of = dim_kernel - 1 - *kernel_start; + adj = MIN(dim_in-1-*in_start, of/stride); + *kernel_end = *kernel_start + adj*stride; + + return is_zero; +} + +void local_conv_trans_HWC_q15_nonsquare(const int8_t * Im_in, + const uint16_t dim_im_in_x, // input image dimention x + 
const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +// { +// int ox, oy, oc, ky, kx, kc, ix, iy; +// int conv_out; +// int in_pix_loc, wt_loc; + +// (void)dilation_y; +// (void)dilation_x; + +// // padding and stride are applied to output +// for (oc = 0; oc < ch_im_out; oc++) +// { +// for (oy = 0; oy < dim_im_out_y; oy++) +// { +// for (ox = 0; ox < dim_im_out_x; ox++) +// { +// conv_out = ((q31_t)(bias[oc]) << bias_shift) + NNOM_ROUND(out_shift); + +// for (ky = 0; ky < dim_kernel_y; ky++) +// { +// for (kx = 0; kx < dim_kernel_x; kx++) +// { +// // input y, input x location +// iy = oy / stride_y + ky - padding_y; +// ix = ox / stride_x + kx - padding_x; + +// if(ix >= 0 && iy >= 0 && ix < dim_im_in_y && iy< dim_im_in_y) +// { +// in_pix_loc = (iy * dim_im_in_x + ix) * ch_im_in; +// wt_loc = oc * ch_im_in * dim_kernel_y * dim_kernel_x + (ky * dim_kernel_x + kx) * ch_im_in; + +// for (kc = 0; kc < ch_im_in; kc++) +// { +// conv_out += Im_in[in_pix_loc + kc] * wt[wt_loc + kc]; +// } +// } +// } +// } + +// Im_out[oc + (oy * dim_im_out_x + ox) * ch_im_out] = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8); +// } +// } +// } +// } + +{ + int i, j, k, l, m, n; + int64_t conv_out; + int in_row, in_col; + int kernel_start_x,kernel_end_x; + int kernel_start_y,kernel_end_y; + int in_row_start, in_col_start; + int is_zero; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out_y; j++) { + is_zero = alg_deconv2d_calculate_position(j, stride_y, padding_y, dim_kernel_y, + dim_im_in_y, &in_row_start, &kernel_start_y, &kernel_end_y); + + if(is_zero) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + conv_out = (q15_t) __NNOM_SSAT((conv_out >> out_shift), 16); + for (k = 0; k < dim_im_out_x; k++) { + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t) conv_out; + } + continue; + } + + for (k = 0; k < dim_im_out_x; k++) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + + is_zero = alg_deconv2d_calculate_position(k, stride_x, padding_x, dim_kernel_x, + dim_im_in_x, &in_col_start, &kernel_start_x, &kernel_end_x); + + if(is_zero) { + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = conv_out; + continue; + } + + for (m = kernel_start_y, in_row = in_row_start; m <= kernel_end_y; m+=stride_y, in_row++) { + for (n = kernel_start_x, in_col = in_col_start; n <= kernel_end_x; n+=stride_x, in_col++) { + if ((in_row >= 0) && (in_col >= 0) && + (in_row < dim_im_in_y) && (in_col < dim_im_in_x)) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l]; + } + } + } + 
} + + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t) __NNOM_SSAT((conv_out >> out_shift), 16); + } + } + } +} + + + + +void local_depthwise_separable_conv_HWC_q15_nonsquare(const q15_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult; + int i_ker_y, i_ker_x; + int i_out = 0; + int shift_idx; + int ch_mult = ch_im_out / ch_im_in; + int64_t conv_out; + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + const int32_t base_idx_y = stride_y * i_out_y - padding_y; + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - padding_x; + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for(i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + i_ch_out = i_ch_mult + i_ch_in * ch_mult; + int32_t ker_y_start = MAX(0, -base_idx_y); + int32_t ker_x_start = MAX(0, -base_idx_x); + int32_t ker_y_end = MIN(dim_kernel_y, dim_im_in_y - base_idx_y); + int32_t ker_x_end = MIN(dim_kernel_x, dim_im_in_x - base_idx_x); + + shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? 
i_ch_out : 0; + if (bias) + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y * dilation_y; + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x * dilation_x; + int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) * ch_im_in + i_ch_in; + int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * (ch_im_in * ch_mult) + i_ch_out; + conv_out += Im_in[in_pix_loc] * wt[wt_loc]; + } + } + Im_out[i_out++] = (q15_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 16); + } + } + } + } +} + +void local_depthwise_separable_conv_CHW_q15_nonsquare(const q15_t *Im_in,// input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const q7_t *wt, // kernel weights + const uint16_t ch_im_out, // number of filters, i.e., output image channels + const uint16_t dim_kernel_x, // filter kernel size x + const uint16_t dim_kernel_y, // filter kernel size y + const uint16_t padding_x, // padding sizes x + const uint16_t padding_y, // padding sizes y + const uint16_t stride_x, // stride x + const uint16_t stride_y, // stride y + const uint16_t dilation_x, // dilation x + const uint16_t dilation_y, // dilation y + const q7_t *bias, // bias + const nnom_qformat_param_t *bias_shift, // bias shifts + const nnom_qformat_param_t *out_shift, // output shift + const nnom_qtype_t q_type, // per channel or per tensor + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y, // output image dimension y + q15_t *bufferA, //buffer space for input + q7_t *bufferB //buffer space for output +) +{ + int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult; + int i_ker_y, i_ker_x; + int shift_idx; + int ch_mult = ch_im_out / ch_im_in; + int64_t conv_out; + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + const int32_t base_idx_y = stride_y * i_out_y - padding_y; + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - padding_x; + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + i_ch_out = i_ch_mult + i_ch_in * ch_mult; + int32_t ker_y_start = MAX(0, -base_idx_y); + int32_t ker_x_start = MAX(0, -base_idx_x); + int32_t ker_y_end = MIN(dim_kernel_y, dim_im_in_y - base_idx_y); + int32_t ker_x_end = MIN(dim_kernel_x, dim_im_in_x - base_idx_x); + + shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? 
i_ch_out : 0; + if (bias) + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]); + else + conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y * dilation_y; + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x * dilation_x; + int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) + i_ch_in * dim_im_in_x * dim_im_in_y; + int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out; + conv_out += Im_in[in_pix_loc] * wt[wt_loc]; + } + } + Im_out[i_ch_out * dim_im_out_x * dim_im_out_y + (i_out_y * dim_im_out_x + i_out_x)] = + (q15_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 16); + } + } + } + } + +} + +void local_zero_padding_HWC_q15(const q15_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimention x + const uint16_t dim_im_in_y, // input image dimention y + const uint16_t ch_im_in, // number of input image channels + const uint16_t padding_top, // padding sizes y + const uint16_t padding_bottom, // padding sizes y + const uint16_t padding_left, // padding sizes x + const uint16_t padding_right, // padding sizes x + q15_t *Im_out, // output image + const uint16_t dim_im_out_x, // output image dimension x + const uint16_t dim_im_out_y) // output image dimension y +{ + int i, size; + q15_t * p_out = Im_out; + + // top rows + size = dim_im_out_x*ch_im_in*padding_top; + nnom_memset(p_out, 0, size*sizeof(q15_t)); + p_out += size; + + // middle + for(i=0; i> out_shift), 16); + } +} + +void local_dot_q15_opt(const q15_t * pV, + const q15_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t out_shift, + q15_t * pOut) +{ + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q15_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + + while (rowCnt) + { + int64_t sum = (q31_t) NNOM_ROUND(out_shift); + int64_t sum2 = (q31_t) NNOM_ROUND(out_shift); + int64_t sum3 = (q31_t) NNOM_ROUND(out_shift); + int64_t sum4 = (q31_t) NNOM_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 1; + pA = pV; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + q15_t inB1 = *pB++; + q15_t inB2 = *pB++; + sum += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum2 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum3 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum4 += inA1 * inB1 + inA2 * inB2; + + colCnt--; + } + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inA = *pA++; + q15_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + colCnt--; + } + *pO++ = (q15_t) __NNOM_SSAT((sum >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int64_t ip_out = (q31_t) + NNOM_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q15_t inA = *pA++; + q15_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t) __NNOM_SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } +} + +void local_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t 
num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q15_t * pOut, + q15_t * vec_buffer) +{ + + (void)vec_buffer; + + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) + { + int64_t sum; + int64_t sum2; + int64_t sum3; + int64_t sum4; + uint16_t colCnt = dim_vec >> 1; + + // quick and dirty to support none bias fully connected + if(bias) + { + sum = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum2 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum3 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + sum4 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + } + else + { + sum = (q31_t) NNOM_ROUND(out_shift); + sum2 = (q31_t) NNOM_ROUND(out_shift); + sum3 = (q31_t) NNOM_ROUND(out_shift); + sum4 = (q31_t) NNOM_ROUND(out_shift); + } + + pA = pV; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + colCnt--; + } + + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q15_t) __NNOM_SSAT((sum >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t) __NNOM_SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int64_t ip_out; + int j; + + // quick and dirty to support none bias fully connected + if(bias) + ip_out = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift); + else + ip_out = (q31_t)NNOM_ROUND(out_shift); + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q15_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t) __NNOM_SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } +} + +void local_fully_connected_mat_q7_vec_q15(const q15_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q15_t * pOut, + q15_t * vec_buffer) +{ + int i, j; + + // a quick solution for none-bias dot. + if(bias == NULL) + { + for (i = 0; i < num_of_rows; i++) + { + int64_t ip_out = (q31_t) NNOM_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t) __NNOM_SSAT((ip_out >> out_shift), 16); + } + } + else + { + for (i = 0; i < num_of_rows; i++) + { + int64_t ip_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t) __NNOM_SSAT((ip_out >> out_shift), 16); + } + } +} + +// This softmax is a copy from ARM CMSIS implimentation as it was efficient and written in pure-C. 
+// original implementation: https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q15.c +void local_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out) +{ + q31_t sum; + int16_t i; + uint8_t shift; + q31_t base; + base = -1 * 0x100000; + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + base = vec_in[i]; + } + } + + /* we ignore really small values + * anyway, they will be 0 after shrinking + * to q15_t + */ + base = base - 16; + + sum = 0; + + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + shift = (uint8_t)__NNOM_USAT(vec_in[i] - base, 5); + sum += 0x1 << shift; + } + } + + /* This is effectively (0x1 << 32) / sum */ + int64_t div_base = 0x100000000LL; + int output_base = (int32_t)(div_base / sum); + + /* Final confidence will be output_base >> ( 17 - (vec_in[i] - base) ) + * so 32768 (0x1<<15) -> 100% confidence when sum = 0x1 << 16, output_base = 0x1 << 16 + * and vec_in[i]-base = 16 + */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + /* Here minimum value of 17+base-vec[i] will be 1 */ + shift = (uint8_t)__NNOM_USAT(17+base-vec_in[i], 5); + p_out[i] = (q15_t) __NNOM_SSAT((output_base >> shift), 16); + } else + { + p_out[i] = 0; + } + } + +} + + +// hard sigmoid, +// y=-1 if x < -2.5 +// y=1 if x > 2.5 +// otherwise y = 0.2 * x + 0.5 (y=0.20315 * x + 0.5) +void local_hard_sigmoid_q15(q15_t *data, uint32_t size, int16_t dec_bit) +{ + int16_t limit = 2.5f * (1 << dec_bit)-1; + int16_t offset = 16384; // 0.5 * 32768 + int16_t mult = 6554; // 0.2 * 32768 + + // int bit >= 0 + for(int i=0; i= limit) + data[i] = 32767; + else + { + data[i] = ((int32_t)(data[i] * mult) >> dec_bit) + offset; + } + } + } + +// hard tanh +// y=-1 if x < -1 +// y=1 if x > 1 +// otherwise y = x +void local_hard_tanh_q15(q15_t *data, uint32_t size, int16_t dec_bit) +{ + int16_t int_bit = 15 - dec_bit; + int16_t limit = 1 << dec_bit; + + if(dec_bit == 15) + return; + + // int bit < 0 + if(int_bit < 0) + for(int i=0; i= limit) + data[i] = 32767; + else + { + data[i] = data[i] >> (-int_bit); + } + } + else + // int bit >= 0 + for(int i=0; i= limit) + data[i] = 32767; + else + { + data[i] = data[i] << int_bit; + } + } +} + +void local_relu_q15(q15_t *data, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + data[i] = 0; + } +} + +// alpha in q7 format with dec_bit=7 +void local_leaky_relu_q15(q15_t *data, q7_t alpha, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + { + data[i] = data[i] * alpha / 128; + } + } +} + +// alpha in q7 format with dec_bit=7 +// max and threshold has the same Q format with the activation +void local_adv_relu_q15(q15_t *data, q7_t negative_slope, q15_t max, q15_t threshold, uint32_t size) +{ + uint32_t i; + for (i = 0; i < size; i++) + { + // `f(x) = max_value` for `x >= max_value`, + // `f(x) = x` for `threshold <= x < max_value`, + // `f(x) = alpha * (x - threshold)` otherwise. + + if(data[i] > max) + data[i] = max; + if (data[i] < threshold) + data[i] = (data[i] - threshold) * negative_slope / 128; + } +} + +// ARM's CMSIS implementation. 
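+ // note: the helper below approximates sigmoid/tanh by indexing a q15 lookup table
+ // with the high bits of the input and linearly interpolating between the two
+ // neighbouring entries; int_width is the number of integer bits of the input
+ // Q format. A minimal usage sketch (buffer name and Q format are only assumptions):
+ //   q15_t buf[64];                  // activations stored as q3.12 -> int_width = 3
+ //   local_sigmoid_q15(buf, 64, 3);  // in-place table-based sigmoid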
+static void local_activation_q15(q15_t * data, uint32_t size, uint16_t int_width, const q15_t*lookup_table) +{ + uint32_t i = size; + q15_t *pIn = data; + q15_t *pOut = data; + uint16_t shift_size = 8 + 3 - int_width; + uint32_t bit_mask = 0x7FF >> int_width; + uint32_t full_frac = bit_mask + 1; + while (i) + { + q15_t out; + q15_t in = *pIn++; + q15_t frac = (uint32_t) in & bit_mask; + q15_t value = lookup_table[(uint8_t)(in >> shift_size)]; + if ((in >> shift_size) != 0x7f) + { + q15_t value2 = lookup_table[(uint8_t)(1 + ((uint8_t)(in >> shift_size)))]; + /* doing the interpolation here for better accuracy */ + out = ((q31_t) (full_frac - frac) * value + (q31_t) value2 * frac) >> shift_size; + } else + { + /* the largest positive value does not have a right side for linear interpolation */ + out = value; + } + *pOut++ = out; + i--; + } +} + +void local_sigmoid_q15(q15_t * data, uint32_t size, uint16_t int_width) +{ + local_activation_q15(data, size, int_width, nnom_sigmoid_table_q15); +} + +void local_tanh_q15(q15_t * data, uint32_t size, uint16_t int_width) +{ + local_activation_q15(data, size, int_width, nnom_tanh_table_q15); +} + +// matrix ops q15 +void local_mult_q15(q15_t *pSrcA, + q15_t *pSrcB, + q15_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t product = pSrcA[i] * pSrcB[i]; + pDst[i] = (q15_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +void local_add_q15(q15_t *pSrcA, + q15_t *pSrcB, + q15_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t sum = pSrcA[i] + pSrcB[i]; + pDst[i] = (q15_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +void local_sub_q15(q15_t *pSrcA, + q15_t *pSrcB, + q15_t *pDst, + const uint16_t out_shift, + uint32_t blockSize) +{ + uint32_t i; + + for (i = 0; i < blockSize; i++) + { + q31_t sub = pSrcA[i] - pSrcB[i]; + pDst[i] = (q15_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + + +void local_multiple_add_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src) +{ + uint32_t i, blk; + q31_t sum; + + for (i = 0; i < block_size; i++) + { + sum = 0; + for(blk=0; blk < num_block; blk++) + sum += p_src[blk][i]; + p_dst[i] = (q15_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +void local_multiple_mult_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src) +{ + uint32_t i, blk; + q63_t product; + + for (i = 0; i < block_size; i++) + { + product = 1; + for(blk=0; blk < num_block; blk++) + product *= p_src[blk][i]; + p_dst[i] = (q15_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +void local_multiple_sub_q15( q15_t *p_dst, + const int16_t out_shift, + uint32_t block_size, + uint32_t num_block, + q15_t **p_src) +{ + uint32_t i, blk; + q31_t sub; + + for (i = 0; i < block_size; i++) + { + sub = p_src[0][i]; + for(blk=1; blk < num_block; blk++) + sub -= p_src[blk][i]; + p_dst[i] = (q15_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 16); + } +} + +// y = 1 - x +void local_1_minor_z_q15(q15_t* src, q15_t*des, uint16_t dec_bit, uint32_t size) +{ + int32_t one = (1 << dec_bit)-1; + for(int i=0; i +#include +#include +#include +#include "nnom.h" + +const char default_layer_names[][12] = DEFUALT_LAYER_NAMES; +const char default_activation_names[][8] = ACTIVATION_NAMES; 
+const char default_cell_names[][8] = DEFUALT_CELL_NAMES; +size_t nnom_memory_taken = 0; + +// local static functions (when libc/dynamic memory interfaces are not avaiable. ) +#ifdef NNOM_USING_STATIC_MEMORY +static uint8_t *nnom_static_buf = NULL; //pointer to static buffer +static size_t nnom_static_buf_size = 0; //static buf size +static size_t nnom_static_buf_curr = 0; +void nnom_set_static_buf(void* buf, size_t size) +{ + nnom_static_buf = buf; + nnom_static_buf_size = size; + nnom_static_buf_curr = 0; +} +void* nnom_malloc(size_t size) +{ + size = nnom_alignto(size, NNOM_ALIGN); + if(size + nnom_static_buf_curr < nnom_static_buf_size) + { + uint8_t* new_block = nnom_static_buf_curr + nnom_static_buf; + nnom_static_buf_curr += size; + return new_block; + } + else + { + if(nnom_static_buf_size == 0) + NNOM_LOG("Please set static memory using 'nnom_set_static_buf()' before calling model created."); + else + NNOM_LOG("No memory! Static buffer size(%d) not big enough, please increase buffer size!", + (uint32_t)nnom_static_buf_size); + return NULL; + } +} +void nnom_free(void* p){;} +#endif // NNOM_USING_STATIC_MEMORY + +void *nnom_mem(size_t size) +{ + size = nnom_alignto(size, NNOM_ALIGN); + void *p = nnom_malloc(size); + if (p) + { + nnom_memory_taken += size; //test + nnom_memset(p, 0, size); + } + return p; +} + +size_t nnom_mem_stat(void) +{ + return nnom_memory_taken; +} + +// get the size of an IO module +static size_t io_mem_size(nnom_layer_io_t *io) +{ + size_t size = 0; + if (io != NULL) + { + while (io) + { + size += tensor_size(io->tensor); + io = io->aux; + } + } + return size; +} + +size_t nnom_alignto(size_t value, uint32_t alignment) +{ + if (value % alignment == 0) + return value; + value += alignment - value % alignment; + return value; +} + +static nnom_layer_t *find_last(nnom_layer_t *layer) +{ + if (layer == NULL) + return NULL; + // iterate every layer until the last one on the list, then return the last instance + while (layer->out->hook.io != NULL) + layer = layer->out->hook.io->owner; + return layer; +} +// input start layer, return layer num +static uint32_t find_index(nnom_layer_t *start, nnom_layer_t *layer) +{ + uint32_t i = 1; + if (start == NULL) + return 0; + // iterate every layer until the last one on the list, then return the index number + while (start->out->hook.io != NULL) + { + i++; + if (layer == start) + return i; + start = start->out->hook.io->owner; + } + return 0; +} + +static nnom_status_t model_add(nnom_model_t *model, nnom_layer_t *layer) +{ + nnom_layer_t *last = NULL; + nnom_layer_t *curr = NULL; + + if (layer == NULL) + { + NNOM_LOG("Error: added a NULL layer, could be no memory while creating layer.\n"); + return NN_NO_MEMORY; + } + + last = find_last(model->head); + curr = layer; + + // when the layer list is empty, the find_last() return model->head. + if (last == NULL) + { + model->head = curr; + } + else + { + // hook the current layer with the last layer. + last->out->hook.io = curr->in; // hook IO + curr->in->hook.io = last->out; + } + return NN_SUCCESS; +} + +// find an available hook on the io module, normally used by output io module. +// input, the output io module that wants to hook on +// output, the new hook that added to the end of the hook list on the io +static nnom_layer_hook_t *allocate_hook(nnom_layer_io_t *io) +{ + nnom_layer_hook_t *hook; + if (io == NULL) + return NULL; + hook = &io->hook; + + // if the primary hook is empty, reture it directly. 
+ if (hook->io == NULL) + { + return hook; + } + else + { + // find the empty place and allocate new hook for us + while (hook->next != NULL) + { + hook = hook->next; + } + hook->next = nnom_mem(sizeof(nnom_layer_hook_t)); + if (hook->next == NULL) + return NULL; + return hook->next; + } +} + +// to check if an io is hooked to other layer +// input the primary io of a layer's input or output +// return, the new io that added to the io list. +static nnom_layer_io_t *allocate_io(nnom_layer_io_t *io) +{ + if (io == NULL) + return NULL; + + // if the io is free to used + if (io->hook.io == NULL) + { + return io; + } + else + { + // find the empty place and allocate new hook for us + while (io->aux != NULL) + { + io = io->aux; + } + io->aux = nnom_mem(sizeof(nnom_layer_io_t)); + if (io->aux == NULL) + return NULL; + // the owner for new io is inherited + io->aux->owner = io->owner; + return io->aux; + } +} + +// hook the current layer to the input layer +// this function only to connect (single output layer) to (single input layer). +static nnom_layer_t *model_hook(nnom_layer_t *curr, nnom_layer_t *last) +{ + nnom_layer_io_t *curr_in_io; + nnom_layer_hook_t *last_io_hook; + + if (last == NULL || curr == NULL) + return NULL; + + // add a new hook to the output io of the last layer + last_io_hook = allocate_hook(last->out); + // add a new input io to the current layer's input list. + curr_in_io = allocate_io(curr->in); + + // manually hook them togeter. + last_io_hook->io = curr_in_io; + curr_in_io->hook.io = last->out; + + return curr; +} + +// merge a few layers using specified method +// num = the number of layer that will be merged +// method = functional layer such as (concat(), mult(), add(), sub()) +static nnom_layer_t *model_mergex(nnom_layer_t *method, int num, ...) +{ + nnom_layer_t *layer_in; + va_list valist; + + if (method == NULL) + return NULL; + + va_start(valist, num); + for (int i = 0; i < num; i++) + { + // get the input layer + layer_in = va_arg(valist, nnom_layer_t *); + model_hook(method, layer_in); + } + va_end(valist); + return method; +} + +// merge 2 input +// this is an older interface +// method = functional layer such as (concat(), mult(), add(), sub()) +static nnom_layer_t *model_merge(nnom_layer_t *method, nnom_layer_t *in1, nnom_layer_t *in2) +{ + return model_mergex(method, 2, in1, in2); +} + +// This api will merge activation to layer's actail to avoid the extra layer for activation +static nnom_layer_t *model_active(nnom_activation_t *act, nnom_layer_t *target) +{ + // simple and easy + target->actail = act; + return target; +} + +// when model=NULL, it create a new sequential model +nnom_model_t *new_model(nnom_model_t *model) +{ + nnom_model_t *m = model; + if (m == NULL) + { + m = nnom_mem(sizeof(nnom_model_t)); + m->is_allocated = true; + } + else + { + nnom_memset(m, 0, sizeof(nnom_model_t)); + m->is_allocated = false; + } + + // set methods + m->add = model_add; + m->hook = model_hook; + m->merge = model_merge; + m->mergex = model_mergex; + m->active = model_active; + + return m; +} + +static void io_tensor_delete(nnom_layer_io_t* io) +{ + while (io) + { + nnom_free(io->tensor); + io = io->aux; + } +} + +// delete all the aux hooks +// delete aux io only, keep the primary io. 
+static void io_list_delete(nnom_layer_io_t *io) +{ + nnom_layer_hook_t *hook, *next_hook; + nnom_layer_io_t *next_io; + while (io) + { + // store the next io + next_io = io->aux; + + // release hooks list first + hook = io->hook.next; + while (hook) + { + next_hook = hook->next; + nnom_free(hook); + hook = next_hook; + } + + // now we can release the aux io itself + // but if this io is the primary input/out of the layer, it will be freed with they layer's instance since they are allocated together. + if (io != io->owner->in && io != io->owner->out) + nnom_free(io); + + // next aux io + io = next_io; + } +} + +// there are 2 type of memory in a layer +// *primary memory* is allocated when a layer instance is created, they are created by layer API (Conv2D()...). +// it includes the layer instance, primary input, primary output, and an optional computational memory buffer instance +// each io module also has one primary hook. +// *secondary memory* are axiliary io modules, axiliary hooks and activations which created by model.xx() APIs (model.hook(), model.active()...) +// it includes the list of aux io modules, the list of aux hooks. +// +// Additionaly, layer's private free method must be called to free layer's private resources +// Such as activation instance passed to Activation() layer, and private memory allcated within Lambda layer. +// +// A layer is consist of a few io modules. primary io are allocated with layers instance. +// each of the io has a few hooks. primary hooks are included in the io module. +// so only "aux" hooks and ios need to be freed separately. +static void layer_delete(nnom_layer_t *layer) +{ + if (layer == NULL) + return; + + // call private free of the layer + if (layer->free) + layer->free(layer); + + // delete the tensors first. only input layer should delete input + if (layer->type == NNOM_INPUT) + io_tensor_delete(layer->in); + io_tensor_delete(layer->out); + + // release secondary memory on the layers. + // they are io lists and hooks list + io_list_delete(layer->in); + io_list_delete(layer->out); + + // release activations (it takes null too) + nnom_free(layer->actail); + + // release primary memory + nnom_free(layer); + return; +} + +void model_delete(nnom_model_t *m) +{ + nnom_layer_t *layer; + nnom_layer_t *next; + if (m == NULL) + return; + + // uses shortcut list to iterate the model, + // start from head + layer = m->head; + while (layer) + { + // get the next before releasing current + next = layer->shortcut; + // your term + layer_delete(layer); + // who's next! + layer = next; + } + + // free the memory blocks for the network's buffer + nnom_free(m->blocks->blk); + + // free model instance itself + if (m->is_allocated) + nnom_free(m); + else + nnom_memset(m, 0, sizeof(nnom_model_t)); + + nnom_memory_taken = 0; + return; +} + +// find an available memory block. +static nnom_mem_block_t *allocate_block(nnom_mem_block_t *list) +{ + nnom_mem_block_t *free = NULL; + uint32_t idx; + + for (idx = 0; idx < NNOM_BLOCK_NUM; idx++) + { + if (list[idx].owners == 0) + break; + } + if(idx == NNOM_BLOCK_NUM) + { + NNOM_LOG("\nERROR! 
No enough memory block for parallel buffers, please increase the 'NNOM_BLOCK_NUM' in 'nnom_port.h'\n"); + return NULL; + } + + free = &list[idx]; + return free; +} + +static void release_block(nnom_mem_block_t *block) +{ + if (block->owners > 0) + block->owners -= 1; + if (block->owners == 0) + block->state = NNOM_BUF_EMPTY; +} + +static void release_input_mem(nnom_layer_t *layer) +{ + nnom_layer_io_t *in; + // release all input of buf + in = layer->in; + while (in != NULL) + { + release_block(in->mem); + in = in->aux; + } +} +static void release_comp_mem(nnom_layer_t *layer) +{ + // release computational buf if exist + if (layer->comp != NULL) + { + release_block(layer->comp->mem); + } +} + +// return the length of the io lists +size_t nnom_io_length(nnom_layer_io_t *io) +{ + size_t num = 0; + if (io == NULL) + return 0; + while (io != NULL) + { + num++; + io = io->aux; + } + return num; +} + +// return the length of the hook lists +size_t nnom_hook_length(nnom_layer_hook_t *hook) +{ + size_t num = 0; + if (hook == NULL) + return 0; + while (hook != NULL) + { + num++; + hook = hook->next; + } + return num; +} + +// The shortcut version of find_last() method. +// must be used after compiling. +static nnom_layer_t *layer_shortcut_find_last(nnom_layer_t *start) +{ + nnom_layer_t *layer = start; + if (start == NULL) + return NULL; + while (layer->shortcut != NULL) + layer = layer->shortcut; + return layer; +} + +// call while compiling. +// the shorcut is for fast running and fast iliterating. +// simply link every layer as a list. ordered by its runing order +static nnom_status_t layer_shortcut_add(nnom_layer_t *start, nnom_layer_t *curr) +{ + nnom_layer_t *layer = start; + // first one, return + if (start == curr) + { + return NN_SUCCESS; + } + // find the end of the list, and add curr layer to the end of it. + while (layer->shortcut != NULL) + { + // if the layer is already in shortcut list, tell upper. 
+ if (curr == layer) + return NN_ARGUMENT_ERROR; + layer = layer->shortcut; + } + layer->shortcut = curr; + + return NN_SUCCESS; +} + +// input the layer number, +static void print_layer_info(nnom_layer_t *layer, uint32_t layer_count) +{ + size_t in_size = io_mem_size(layer->in); + size_t out_size = io_mem_size(layer->out); + size_t compsize; + size_t mac = layer->stat.macc; + if (layer->comp != NULL) + compsize = layer->comp->size; + else + compsize = 0; + // names + if(layer->type != NNOM_RNN) + NNOM_LOG("#%-3d %-10s - ", layer_count, default_layer_names[layer->type]); + else + { + NNOM_LOG("#%-3d %-3s/", layer_count, default_layer_names[layer->type]); + NNOM_LOG("%-6s - ", default_cell_names[((nnom_rnn_layer_t*)layer)->cell->type]); + } + + // activations + if (layer->actail != NULL) + NNOM_LOG("%-8s - ", default_activation_names[layer->actail->type]); + else + NNOM_LOG(" - "); + + NNOM_LOG("("); + for (int i = 0; i < 3; i++) + { + if (layer->out->tensor->num_dim > i) + NNOM_LOG("%4d,", layer->out->tensor->dim[i]); + else + NNOM_LOG(" "); + } + NNOM_LOG(") "); + + // MAC operation + if(mac == 0) + NNOM_LOG(" "); + else if (mac < 10000) + NNOM_LOG("%7d ", (uint32_t)mac); + else if (mac < 1000*1000) + NNOM_LOG("%6dk ", (uint32_t)(mac/1000)); + else if (mac < 1000*1000*1000) + NNOM_LOG("%3d.%02dM ", (uint32_t)(mac/(1000*1000)), (uint32_t)(mac%(1000*1000)/(10*1000))); // xxx.xx M + else + NNOM_LOG("%3d.%02dG ", (uint32_t)(mac/(1000*1000*1000)), (uint32_t)(mac%(1000*1000*1000)/(10*1000*1000))); // xxx.xx G + + // memory + NNOM_LOG("(%6d,%6d,%6d)", (uint32_t)in_size, (uint32_t)out_size,(uint32_t) compsize); +} + +static void print_memory_block_info(nnom_mem_block_t *block_pool) +{ + // show the memory blocks's lifetime (number of owners) + NNOM_LOG(" "); + for (int i = 0; i < NNOM_BLOCK_NUM; i++) + { + if (i % 4 == 0) + NNOM_LOG(" "); + if (block_pool[i].owners) + NNOM_LOG("%d ", block_pool[i].owners); + else + NNOM_LOG("- "); + } + NNOM_LOG("\n"); +} + +// This is a nested called functions. +// to analyse the topology of the model, calculate the output_shape of each layer and create shortcut lists. +// Nest will happend when a layer have multiple output module or mutiple output hooks. +// This function will return when +// 1) if the layer has multiple input but not all of them are filled by last layers. returns NN_MORE_TODO +// 2) if all the output hooked are nested called. return NN_SUCCESS +// 3) if the layer is output layer. return NN_SUCCESS +nnom_status_t compile_layers(nnom_layer_t* first, nnom_layer_t *curr, nnom_mem_block_t *block_pool, uint32_t *layer_count) +{ + size_t mem_size = 0; + nnom_layer_t *layer = curr; + nnom_layer_io_t *in; + nnom_layer_io_t *out; + nnom_layer_hook_t *hook; + + nnom_mem_block_t *in_blk; + nnom_mem_block_t *out_blk; + + uint32_t local_layer_count = 1; + + if(layer_count == NULL) + layer_count = &local_layer_count; + + in = layer->in; + out = layer->out; + + while (layer) + { + // check input + in = layer->in; + + // check if this layer is the input layer + // the first layer has no input hooked, and the io is not initialized + if (in->hook.io == NULL) + { + // if the input is not initalized + if (in->mem == NULL) + { + in_blk = allocate_block(block_pool); + in_blk->owners += 1; // add 1 + mem_size = nnom_alignto(tensor_size(in->tensor), NNOM_ALIGN); + in_blk->size = mem_size > in_blk->size ? 
mem_size : in_blk->size; + // set the blk to the layer IO + in->mem = in_blk; + in->mem->state = NNOM_BUF_FILLED; //mark input buff filled + } + } + else + { + // get the mem for every input from its hooked output. + while (in != NULL) + { + in->mem = in->hook.io->mem; + in = in->aux; + } + } + + // if there are mutiple inputs, wait utill all blocks filled + in = layer->in; + if (in != NULL && in->aux != NULL) + { + while (in != NULL) + { + // if the mem (of its hooked output) is not allocated or is not filled. + // It not the time to run the layer yet, return and waits for next nested called. + if (in->mem == NULL || in->mem->state != NNOM_BUF_FILLED) + return NN_MORE_TODO; + in = in->aux; + } + } + + // if run to this point, then it is the right time to compile(run) this layer. + // compiling are seperated into the steps below. + // 1. to calculate the output shape. + // 2. to put the current layer to the end of shortcut list. + // 3. allocate computational buffer. + // 4. allocate output buffer for each output module. + // 5.1 if there is only one layer hooked to the output. we dont use nested call, but continue in this big while loop. + // 5.2 nested call the hooked output layers (if there are > 1 hooked to the output of this layer) + + // 1. calculate output shape while all inputs are filled + layer->build(layer); + + // 2. add to shortcut list. + layer_shortcut_add(first, layer); + + // 3. assign for computational buf + if (layer->comp != NULL) + { + layer->comp->mem = allocate_block(block_pool); + layer->comp->mem->owners += 1; // add us to buffer users + layer->comp->mem->state = NNOM_BUF_FILLED; + // record maximum mem size in this block + mem_size = nnom_alignto(layer->comp->size, NNOM_ALIGN); + layer->comp->mem->size = + mem_size > layer->comp->mem->size ? mem_size : layer->comp->mem->size; + } + + // print current layer's info. + // show names, activations, mem block size + print_layer_info(layer, (*layer_count)++); + + // 4. allocate output buffer for each output module. + // check output + if (layer->out == NULL) + return NN_SUCCESS; + + // 5.1 if there is only one layer hooked to the output. we dont use nested call, but continue in this big while loop. + // if the layer is Single Output, continue the loop directly. To reduce nested level + if (layer->out->aux == NULL && layer->out->hook.next == NULL) + { + // single buf layer. + if (layer->in->type == NNOM_TENSOR_BUF_NULL || layer->out->type == NNOM_TENSOR_BUF_NULL) + { + // pass to next layer directly, like we never touch the buffer(dont change life-time) + layer->out->mem = layer->in->mem; + + // print memory before release + print_memory_block_info(block_pool); + // computational buf + release_comp_mem(layer); + } + // not a single buf layer + else + { + // allocate mem block for the output + out_blk = allocate_block(block_pool); + if (out_blk == NULL) + return NN_NO_MEMORY; + // set the life time, only one hooked layer, so the life time is 1 + out_blk->owners = 1; + out_blk->state = NNOM_BUF_FILLED; // marked filled + // record maximum mem size in this block + mem_size = nnom_alignto(tensor_size(layer->out->tensor), NNOM_ALIGN); + out_blk->size = mem_size > out_blk->size ? mem_size : out_blk->size; + // set the blk to the layer IO + layer->out->mem = out_blk; + + // once we allocate for output, we can now release input and comput. 
+ // print memory before release + print_memory_block_info(block_pool); + // release input mem and comp mem + release_input_mem(layer); + release_comp_mem(layer); + } + } + // Multiple output and/or mutiple hooks + else + { + // single buf layer will use the input buf for the first output + if (layer->in->type == NNOM_TENSOR_BUF_NULL || layer->out->type == NNOM_TENSOR_BUF_NULL) + { + // we dont allocate new buf, but use the input + // the ownership will be set to next layer later + layer->out->mem = layer->in->mem; + layer->out->mem->owners += nnom_hook_length(&layer->out->hook); // set the mem lifetime.// test + layer->out->mem->state = NNOM_BUF_FILLED; + + // print memory before release + print_memory_block_info(block_pool); + // release computational buff and input buffer + release_input_mem(layer); + release_comp_mem(layer); + } + // mutiple buf layer. (I/O use different memory blocks) + else + { + // allocate for every output + out = layer->out; + while (out != NULL && out->hook.io != NULL) // the output layer have no output IO + { + // assign new block + out->mem = allocate_block(block_pool); + if (out->mem == NULL) + return NN_NO_MEMORY; + // record maximum mem size in this block + mem_size = nnom_alignto(tensor_size(out->tensor), NNOM_ALIGN); + out->mem->size = mem_size > out->mem->size ? mem_size : out->mem->size; + // keep the block untill the last hooked layer is called. + out->mem->owners = nnom_hook_length(&out->hook); // set lifetime of the buffer = the num of hooked layers + out->mem->state = NNOM_BUF_FILLED; + + out = out->aux; + } + // once we allocate for output, we can now release input and comput (or reduce the lifetime). + // print memory before release + print_memory_block_info(block_pool); + // release input mem and comp mem + release_input_mem(layer); + release_comp_mem(layer); + } + + // 5.12 nested call the hooked output layers (if there are > 1 hooked to the output of this layer) + // while all the out module(s) receive a memory block, it is ready to be sent to other layers. + // iterate all hooked layers in each out module. + out = layer->out; + while (out != NULL) + { + // nested call hooked layer one by one. + hook = &out->hook; + while (hook != NULL && hook->io != NULL) + { + compile_layers(first, hook->io->owner, block_pool, layer_count); + hook = hook->next; + } + + // next io + out = out->aux; + } + + // when all the out is called. this should stop here. + // once enter mutiple output iterating, the function will always return. + // because at least one of the nested called by this function will run till the end of the model. + return NN_SUCCESS; + } + // Multiple output ended. + + // return if this is output layer. + // the output layer's output io is hooked to nothing. + if (layer->out->hook.io == NULL) + return NN_SUCCESS; + + // single output layer, this function continue to analyse next layer. + // switch to next layer. 
+ layer = layer->out->hook.io->owner; + } + + // seems to be redundants + return NN_SUCCESS; +} + +size_t mem_analysis_result(nnom_model_t *m) +{ + uint32_t index; + uint32_t total_mem = 0; + NNOM_LOG("Memory cost by each block:\n "); + // print size of memory blocks + for (index = 0; index < NNOM_BLOCK_NUM; index++) + { + total_mem += m->blocks[index].size; + NNOM_LOG("blk_%d:%d ", index, (uint32_t)(m->blocks[index].size)); + } + // size of total memory cost by networks buffer + NNOM_LOG("\n Memory cost by network buffers: %d bytes\n", total_mem); + return total_mem; +} + +// allocate memory, and set them to each block according to the mem analysis results. +nnom_status_t block_mem_set(nnom_model_t *m, void *buf) +{ + uint32_t index; + uint32_t mem_offset = 0; + + for (index = 0; index < NNOM_BLOCK_NUM; index++) + { + if (m->blocks[index].size == 0) + break; + m->blocks[index].blk = (void *)((uint8_t*)buf + mem_offset); + mem_offset += m->blocks[index].size; + } + return NN_SUCCESS; +} + +// experimental: this function is temporary use to +// assign memory blk which has assigned to input and output to the corresponding tensor +nnom_status_t tensor_mem_set(nnom_model_t *m) +{ + nnom_layer_t *layer = m->head; + nnom_layer_io_t *io; + while (layer) + { + io = layer->in; + while (io) + { + io->tensor->p_data = io->mem->blk; + io = io->aux; + } + + io = layer->out; + while (io) + { + io->tensor->p_data = io->mem->blk; + io = io->aux; + } + + layer = layer->shortcut; + } + + return NN_SUCCESS; +} + +// this function has to be used after memory is assigned to the layers. +// it means it has to be call after compile_model() as well. +// it simply get the output buffer and set the buffer to tailed activation of each layer.. +nnom_status_t set_tailed_activation(nnom_model_t *m) +{ + NNOM_NULL_CHECK(m); + NNOM_NULL_CHECK(m->head); + nnom_layer_t *layer = m->head; + + // if tailed activation is exist, set it to the output. + while (layer) + { + if (layer->actail != NULL) + { + layer->actail->tensor = layer->out->tensor; + } + if (layer->shortcut == NULL) + break; + layer = layer->shortcut; + } + + return NN_SUCCESS; +} + +// get total ops +static uint64_t model_set_ops(nnom_model_t *m) +{ + nnom_layer_t *layer; + uint64_t total_ops = 0; + layer = m->head; + while (layer) + { + total_ops += layer->stat.macc; + if (layer->shortcut == NULL) + break; + layer = layer->shortcut; + } + m->total_ops = total_ops; + return total_ops; +} + +// a compiler can be use for both sequencial / functional model. +// the output layer is optional only when the model is single output model +// in this case, if output = NULL, the compile can find it by its own. 
+nnom_status_t model_compile(nnom_model_t *m, nnom_layer_t *input, nnom_layer_t *output) +{ + size_t buf_size; + uint8_t *buf; + uint32_t layer_num = 1; + uint32_t time = nnom_ms_get(); + + NNOM_NULL_CHECK(m); + NNOM_NULL_CHECK(input); + + m->head = input; + m->tail = output; + if (output == NULL) + m->tail = find_last(input); + + NNOM_LOG("NNoM version %d.%d.%d\n", NNOM_MAJORVERSION, NNOM_SUBVERSION, NNOM_REVISION); + NNOM_LOG("To disable logs, please void the marco 'NNOM_LOG(...)' in 'nnom_port.h'.\n"); + #ifdef NNOM_USING_CHW + NNOM_LOG("Data format: Channel first (CHW)\n"); + #else + NNOM_LOG("Data format: Channel last (HWC)\n"); + #endif + #ifdef NNOM_USING_CMSIS_NN + NNOM_LOG("Backend optimization: CMSIS-NN\n"); + #endif + #ifdef NNOM_USING_STATIC_MEMORY + NNOM_LOG("Static memory size set to: %d\n", (uint32_t)nnom_static_buf_size); + #endif + NNOM_LOG("Start compiling model...\n"); + NNOM_LOG("Layer(#) Activation output shape ops(MAC) mem(in, out, buf) mem blk lifetime\n"); + NNOM_LOG("-------------------------------------------------------------------------------------------------\n"); + + // compile layers, started from list head, nested run till the end of models + compile_layers(m->head, m->head, m->blocks, &layer_num); + + NNOM_LOG("-------------------------------------------------------------------------------------------------\n"); + + // if model's tail is not the last layer which built by user. + if (output->type != NNOM_OUTPUT) + NNOM_LOG("WARNING: the last layer '%s' is not the Output Layer, please check carefully.\n", + default_layer_names[output->type]); + + // get the total (aligned) memory requirement + buf_size = mem_analysis_result(m); + + // allocate one big memory block + buf = nnom_mem(buf_size); + if (buf == NULL) + { + NNOM_LOG("ERROR: No enough memory for network buffer, required %d bytes\n", (uint32_t)buf_size); + return NN_NO_MEMORY; + } + // all memory cost + NNOM_LOG(" Total memory occupied: %d bytes\n", (uint32_t)nnom_memory_taken); + + // split the memory for every memory block + block_mem_set(m, buf); + + // experimental: set memory from io to the io tensor + tensor_mem_set(m); + + // finally set the output buff to tailed activation on each layer + set_tailed_activation(m); + + // calculate the total operations and set it to the model + model_set_ops(m); + + // print the time. + if(nnom_ms_get()) + NNOM_LOG("Compling done in %d ms\n", nnom_ms_get() - time); + + return NN_SUCCESS; +} + +// This is a simplified API for compile models with sequencial model only +// this does not require specified Input / Output layers +nnom_status_t sequencial_compile(nnom_model_t *m) +{ + nnom_layer_t *input, *output; + input = m->head; + output = find_last(input); + return model_compile(m, input, output); +} + +// run that layer +nnom_status_t layer_run(nnom_layer_t *layer) +{ + nnom_status_t result; + uint32_t start; + NNOM_NULL_CHECK(layer); + + // start + start = nnom_us_get(); + // run main layer first + result = layer->run(layer); + // run tailed-activation if it is presented + if (layer->actail != NULL) + { + layer->actail->run(layer->actail); + } + // done + layer->stat.time = nnom_us_get() - start; + return result; +} + +// run the model, until the end_layer. If end_layer == NULL, run all layers. 
+nnom_status_t model_run_to(nnom_model_t *m, nnom_layer_t *end_layer) +{ + uint32_t layer_num = 1; + nnom_status_t result; + nnom_layer_t *layer; + NNOM_NULL_CHECK(m); + NNOM_NULL_CHECK(m->head); + + layer = m->head; + + // using shortcut run + while (layer) + { + // run layer + result = layer_run(layer); + if (result != NN_SUCCESS) + { + NNOM_LOG("Error: #%d %s layer return error code:%d\n", layer_num, default_layer_names[layer->type], result); + return result; + } + // run callback + if(m->layer_callback != NULL) + { + result = m->layer_callback(m, layer); + if (result != NN_SUCCESS) + { + NNOM_LOG("Error: Callback return error code %d at #%d %s layer\n", result, layer_num, default_layer_names[layer->type]); + return result; + } + } + // check if finished + if (layer == end_layer || layer->shortcut == NULL) + break; + layer = layer->shortcut; + layer_num++; + } + + return NN_SUCCESS; +} + +// run all layers. +nnom_status_t model_run(nnom_model_t *m) +{ + return model_run_to(m, NULL); +} + +// callback, called after each layer has finished the calculation. +nnom_status_t model_set_callback(nnom_model_t *m, nnom_status_t (*layer_callback)(nnom_model_t *m, nnom_layer_t *layer)) +{ + if(m->layer_callback != NULL && m->layer_callback != layer_callback) + return NN_LENGTH_ERROR; + + m->layer_callback = layer_callback; + return NN_SUCCESS; +} + +// delete callback. +void model_delete_callback(nnom_model_t *m) +{ + m->layer_callback = NULL; +} + +nnom_status_t check_model_version(unsigned long model_version) +{ + nnom_status_t result = NN_SUCCESS; + int32_t major, sub, rev; + major = model_version/10000; + sub = (model_version/100)%100; + rev = model_version % 100; + if(model_version != NNOM_VERSION) + { + NNOM_LOG("WARNING: model version %d.%d.%d dosen't match nnom version!\n", major, sub, rev); + result = -NN_ARGUMENT_ERROR; + } + else + { + NNOM_LOG("Model version: %d.%d.%d\n", major, sub, rev); + } + return result; +} + + diff --git a/APP_Framework/Framework/knowing/nnom/src/core/nnom_layers.c b/APP_Framework/Framework/knowing/nnom/src/core/nnom_layers.c new file mode 100644 index 000000000..dc059074a --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/core/nnom_layers.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" + +size_t shape_size(nnom_3d_shape_t *s) +{ + if (s == NULL) + return 0; + return s->h * s->w * s->c; +} + +nnom_3d_shape_t shape(size_t h, size_t w, size_t c) +{ + nnom_3d_shape_t s; + s.h = h; + s.w = w; + s.c = c; + return s; +} +nnom_3d_shape_t kernel(size_t h, size_t w) +{ + return shape(h, w, 1); +} +nnom_3d_shape_t stride(size_t h, size_t w) +{ + return shape(h, w, 1); +} +nnom_3d_shape_t dilation(size_t h, size_t w) +{ + return shape(h, w, 1); +} + +nnom_border_t border(size_t top, size_t bottom, size_t left, size_t right) +{ + nnom_border_t b; + b.top = top; + b.bottom = bottom; + b.left = left; + b.right = right; + return b; +} + +// this function has to be used while assign a io for a layer. +// because the io needs to know who is its owner. 
+nnom_layer_io_t *io_init(void *owner_layer, nnom_layer_io_t *io) +{ + io->owner = (nnom_layer_t *)owner_layer; + return io; +} + +// this function is to add a new IO to current inited IO +// input, the targeted IO that the new IO will be added to +// output , the new IO +nnom_layer_io_t *io_add_aux(nnom_layer_io_t *targeted_io) +{ + nnom_layer_io_t *new_io; + // check if the targeted io is inited, and its aux = NULL + if (targeted_io == NULL || targeted_io->owner == NULL || targeted_io->aux != NULL) + return NULL; + // create new io, init it + new_io = nnom_mem(sizeof(nnom_layer_io_t)); + if (new_io == NULL) + return NULL; + // add to aux + targeted_io->aux = new_io; + return io_init(targeted_io->owner, new_io); +} diff --git a/APP_Framework/Framework/knowing/nnom/src/core/nnom_tensor.c b/APP_Framework/Framework/knowing/nnom/src/core/nnom_tensor.c new file mode 100644 index 000000000..55b3984ca --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/core/nnom_tensor.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + * 2019-02-14 Jianjia Ma Add layer.free() method. + */ + +#include +#include +#include +#include +#include "nnom.h" +#include "nnom_tensor.h" + + // tensor size +size_t tensor_size(nnom_tensor_t* t) +{ + size_t size = 0; + if (t != NULL) + { + size = t->dim[0]; + for (int i = 1; i < t->num_dim; i++) + size *= t->dim[i]; + } + return size; +} +size_t tensor_size_byte(nnom_tensor_t* t) +{ + return tensor_size(t)*t->bitwidth/8; +} + + +size_t tensor_get_num_channel(nnom_tensor_t* t) +{ + // this will need to be changed to support batch. +#ifdef NNOM_USING_CHW + // channel first + //return t->dim[0]; + return t->dim[t->num_dim -1]; // we are always using hwc to describe even our data is in CHW +#else + // channel last + return t->dim[t->num_dim -1]; +#endif +} + +// initialise/create new tensor +nnom_tensor_t* new_tensor(nnom_qtype_t type, uint32_t num_dim, uint32_t num_channel) +{ + nnom_tensor_t* t = NULL; + uint32_t q_len; + if(type == NNOM_QTYPE_PER_AXIS) + { + q_len = num_channel; + } + else if (type == NNOM_QTYPE_PER_TENSOR) + { + q_len = 1; + } + else + { + NNOM_LOG("ERROR: tensor type not specified\n"); + return NULL; + } + + t = nnom_mem(nnom_alignto(sizeof(nnom_tensor_t), NNOM_ALIGN) + + nnom_alignto(num_dim*sizeof(nnom_shape_data_t),sizeof(nnom_qformat_param_t)) + + q_len*sizeof(nnom_qformat_param_t)*2); + if(t == NULL) + return t; + t->dim = (nnom_shape_data_t*)((uint8_t*)t + sizeof(nnom_tensor_t)); // should add alignment + t->q_dec = (nnom_qformat_param_t*)((uint8_t*)t->dim + nnom_alignto(num_dim*sizeof(nnom_shape_data_t),sizeof(nnom_qformat_param_t))); + t->q_offset = (nnom_qformat_param_t*)((uint8_t*)t->q_dec + q_len*sizeof(nnom_qformat_param_t)); + t->num_dim = num_dim; + t->qtype = type; + + return t; +} + +void delete_tensor(nnom_tensor_t* t) +{ + if (t) + nnom_free(t); +} + +// set tensor by value +// for tensor with quantized type NNOM_QTYPE_PER_TENSOR +nnom_tensor_t* tensor_set_attr_v(nnom_tensor_t* t, + nnom_qformat_param_t dec_bit, nnom_qformat_param_t offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth) +{ + // copy dim + t->num_dim = num_dim; + nnom_memcpy(t->dim, dim, sizeof(nnom_shape_data_t) * num_dim); + + // bitwidth + t->bitwidth = bitwidth; + // copy the offset and q format + *(t->q_dec) = dec_bit; + *(t->q_offset) = offset; + return t; +} + + +// set tensor by 
pointer +// for tensor with quantized type NNOM_QTYPE_PER_AXIS +nnom_tensor_t* tensor_set_attr(nnom_tensor_t* t, + nnom_qformat_param_t*dec_bit, nnom_qformat_param_t *offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth) +{ + size_t size; + + // copy dim + t->num_dim = num_dim; + nnom_memcpy(t->dim, dim, sizeof(nnom_shape_data_t) * num_dim); + + // get the q format data size + if(t->qtype == NNOM_QTYPE_PER_AXIS) + size = sizeof(nnom_qformat_param_t) * tensor_get_num_channel(t); + else + size = sizeof(nnom_qformat_param_t); + + // bitwidth + t->bitwidth = bitwidth; + // copy the offset and q format + nnom_memcpy(t->q_dec, dec_bit, size); + nnom_memcpy(t->q_offset, offset, size); + return t; +} + +// this method copy the attributes of a tensor to a new tensor +// before that, src and des tensor must already have QTYPE and NUM_OF_DIM set. +// Note, the tensors must have the same lenght. this method wont cpy the memory pointer data (we will assign memory later after building) +nnom_tensor_t* tensor_cpy_attr(nnom_tensor_t* des, nnom_tensor_t* src) +{ + size_t size; + if(src->qtype != des->qtype || src->num_dim != des->num_dim) + return NULL; + + if(src->qtype == NNOM_QTYPE_PER_AXIS) + size = sizeof(nnom_qformat_param_t) * tensor_get_num_channel(src); + else + size = sizeof(nnom_qformat_param_t); + + // bit + des->bitwidth = src->bitwidth; + // copy quantisation parameters + nnom_memcpy(des->q_dec, src->q_dec, size); + nnom_memcpy(des->q_offset, src->q_offset, size); + + // copy number of dimension + des->num_dim = src->num_dim; + nnom_memcpy(des->dim, src->dim, src->num_dim * sizeof(nnom_shape_data_t)); + return des; +} + +// change format from CHW to HWC +// the shape of the data, input data, output data +void tensor_hwc2chw_q7(nnom_tensor_t* des, nnom_tensor_t* src) +{ + q7_t* p_out = des->p_data; + q7_t* p_in = src->p_data; + + for (int c = 0; c < src->dim[2]; c++) + { + for (int h = 0; h < src->dim[0]; h++) + { + for (int w = 0; w < src->dim[1]; w++) + { + *p_out = p_in[(h * src->dim[1] + w) * src->dim[2] + c]; + p_out++; + } + } + } +} + + +// only support 3d tensor +// change format from CHW to HWC +void tensor_chw2hwc_q7(nnom_tensor_t* des, nnom_tensor_t* src) +{ + q7_t* p_out = des->p_data; + q7_t* p_in = src->p_data; + int im_size; + int h_step; + + im_size = src->dim[0] * src->dim[1]; // H*W + + for (int h = 0; h < src->dim[0]; h++) + { + h_step = src->dim[1] * h; + for (int w = 0; w < src->dim[1]; w++) + { + for (int c = 0; c < src->dim[2]; c++) + { + *p_out = p_in[im_size * c + h_step + w]; + p_out++; + } + } + } + +} + +// (deprecated by tensor_hwc2chw version) +// change format from CHW to HWC +// the shape of the data, input data, output data +void hwc2chw_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out) +{ + for (int c = 0; c < shape.c; c++) + { + for (int h = 0; h < shape.h; h++) + { + for (int w = 0; w < shape.w; w++) + { + *p_out = p_in[(h * shape.w + w) * shape.c + c]; + p_out++; + } + } + } +} + +// (deprecated) +// change format from CHW to HWC +// the shape of the data, input data, output data +void chw2hwc_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out) +{ + int im_size = shape.w * shape.h; + int h_step; + + for (int h = 0; h < shape.h; h++) + { + h_step = shape.w * h; + for (int w = 0; w < shape.w; w++) + { + for (int c = 0; c < shape.c; c++) + { + *p_out = p_in[im_size * c + h_step + w]; + p_out++; + } + } + } +} diff --git a/APP_Framework/Framework/knowing/nnom/src/core/nnom_utils.c b/APP_Framework/Framework/knowing/nnom/src/core/nnom_utils.c new 
file mode 100644 index 000000000..3b13c3551 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/core/nnom_utils.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-02-05 Jianjia Ma The first version + */ + +#include +#include +#include +#include +#include "nnom.h" +#include "nnom_utils.h" + +static nnom_predict_t *_predict_create_instance(nnom_model_t *m, size_t label_num, size_t top_k_size) +{ + nnom_predict_t *pre; + // allocate memory + pre = (nnom_predict_t *)nnom_malloc(sizeof(nnom_predict_t)); + if(pre == NULL) + return NULL; + pre->top_k = (uint32_t *)nnom_malloc(top_k_size * sizeof(uint32_t)); + pre->confusion_mat = (uint16_t *)nnom_malloc(label_num * label_num * sizeof(uint16_t)); + if(pre->top_k == NULL || pre->confusion_mat == NULL) + { + nnom_free(pre->top_k); nnom_free(pre->confusion_mat); nnom_free(pre); + return NULL; + } + nnom_memset(pre->top_k, 0, top_k_size * sizeof(uint32_t)); + nnom_memset(pre->confusion_mat, 0, label_num * label_num * sizeof(uint16_t)); + + // config + pre->label_num = label_num; + pre->top_k_size = top_k_size; + pre->predict_count = 0; + + // run + pre->model = m; + pre->t_run_total = 0; // model running time in total + pre->t_predict_start = 0; // when it is initial + pre->t_predict_total = 0; // total time of the whole test + + return pre; +} + +static void _predict_delete_instance(nnom_predict_t *pre) +{ + if(pre == NULL) + return; + nnom_free(pre->top_k); + nnom_free(pre->confusion_mat); + nnom_free(pre); +} + +// create a prediction +// input model, the buf pointer to the softwmax output (Temporary, this can be extract from model) +// the size of softmax output (the num of lable) +// the top k that wants to record. +nnom_predict_t *prediction_create(nnom_model_t *m, int8_t *buf_prediction, size_t label_num, size_t top_k_size) +{ + nnom_predict_t *pre = _predict_create_instance(m, label_num, top_k_size); + if (!pre) + return NULL; + if (!m) + { + _predict_delete_instance(pre); + return NULL; + } + + // set the output buffer of model to the prediction instance + pre->buf_prediction = buf_prediction; + + // mark start time. + pre->t_predict_start = nnom_ms_get(); + + return pre; +} + +// after a new data is set in input +// feed data to prediction +// input the current label, (range from 0 to total number of label -1) +// (the current input data should be set by user manully to the input buffer of the model.) +nnom_status_t prediction_run(nnom_predict_t *pre, uint32_t true_label, uint32_t*predict_label, float* prob) +{ + int max_val; + int max_index; + uint32_t true_ranking = 0; + uint32_t start; + uint32_t sum = 0; + + if (!pre) + return NN_ARGUMENT_ERROR; + + // now run model + start = nnom_ms_get(); + model_run(pre->model); + pre->t_run_total += nnom_ms_get() - start; + + // only draw matrix and top k when number of label > 1 + if (pre->label_num > 1) + { + // find how many prediction is bigger than the ground true. + // Raning rules, same as tensorflow. however, predictions in MCU is more frequencly to have equal probability since it is using fixed-point. + // if ranking is 1, 2, =2(true), 4, 5, 6. the result will be top 3. + // if ranking is 1, 2(true), =2, 4, 5, 6. the result will be top 2. + // find the ranking of the prediced label. 
+ for (uint32_t j = 0; j < pre->label_num; j++) + { + if (j == true_label) + continue; + if (pre->buf_prediction[true_label] < pre->buf_prediction[j]) + true_ranking++; + // while value[label] = value[j]. only when label > j, label is the second of j + else if (pre->buf_prediction[true_label] == pre->buf_prediction[j] && j < true_label) + true_ranking++; + } + + if (true_ranking < pre->top_k_size) + pre->top_k[true_ranking]++; + + // Find top 1 and return the current prediction. + // If there are several maximum prediction, return the first one. + max_val = pre->buf_prediction[0]; + max_index = 0; + for (uint32_t j = 1; j < pre->label_num; j++) + { + if (pre->buf_prediction[j] > max_val) + { + max_val = pre->buf_prediction[j]; + max_index = j; + } + sum += pre->buf_prediction[j]; + } + // result + if (max_val != 0) + *prob = (float)max_val / 127.f; + else + *prob = 0; + *predict_label = max_index; + + // fill confusion matrix + pre->confusion_mat[true_label * pre->label_num + max_index] += 1; + } + // only one neural as output. + else + { + *prob = (float)pre->buf_prediction[0] / 127.f; + if (*prob >= 0.5f) + *predict_label = 1; + else + *predict_label = 0; + } + + // prediction count + pre->predict_count++; + + // return the prediction + return NN_SUCCESS; +} + +void prediction_end(nnom_predict_t *pre) +{ + if (!pre) + return; + pre->t_predict_total = nnom_ms_get() - pre->t_predict_start; +} + +void prediction_delete(nnom_predict_t *pre) +{ + _predict_delete_instance(pre); +} + +void prediction_matrix(nnom_predict_t *pre) +{ + if (!pre) + return; + // print titles + NNOM_LOG("\nConfusion matrix:\n"); + NNOM_LOG("predict"); + for (int i = 0; i < pre->label_num; i++) + { + NNOM_LOG("%6d", i); + } + NNOM_LOG("\n"); + NNOM_LOG("actual\n"); + // print the matrix + for (int i = 0; i < pre->label_num; i++) + { + uint32_t row_total = 0; + + NNOM_LOG(" %3d | ", i); + for (int j = 0; j < pre->label_num; j++) + { + row_total += pre->confusion_mat[i * pre->label_num + j]; + NNOM_LOG("%6d", pre->confusion_mat[i * pre->label_num + j]); + } + NNOM_LOG(" |%4d%%\n", pre->confusion_mat[i * pre->label_num + i] * 100 / row_total); + row_total = 0; + } + NNOM_LOG("\n"); +} + +// top-k +void prediction_top_k(nnom_predict_t *pre) +{ + uint32_t top = 0; + if (!pre) + return; + + for (int i = 0; i < pre->top_k_size; i++) + { + top += pre->top_k[i]; + if (top != pre->predict_count) + NNOM_LOG("Top %d Accuracy: %d.%02d%% \n", i + 1, (top * 100) / pre->predict_count, + ((top * 100 * 100) / pre->predict_count)%100); + else + NNOM_LOG("Top %d Accuracy: 100%% \n", i + 1); + } +} + +// this function is to print sumarry +void prediction_summary(nnom_predict_t *pre) +{ + if (!pre) + return; + // sumamry + NNOM_LOG("\nPrediction summary:\n"); + NNOM_LOG("Test frames: %d\n", pre->predict_count); + NNOM_LOG("Test running time: %d sec\n", pre->t_predict_total / 1000); + NNOM_LOG("Model running time: %d ms\n", pre->t_run_total); + if(pre->predict_count !=0) + NNOM_LOG("Average prediction time: %d us\n", (pre->t_run_total * 1000) / pre->predict_count); + if(pre->t_run_total != 0) + NNOM_LOG("Average effeciency: %d.%02d ops/us\n", (int)(((uint64_t)pre->model->total_ops * pre->predict_count) / (pre->t_run_total * 1000)), + (int)(((uint64_t)pre->model->total_ops * pre->predict_count)*100 / (pre->t_run_total * 1000))%100); + if(pre->t_run_total !=0 && pre->predict_count !=0) + NNOM_LOG("Average frame rate: %d.%d Hz\n", 1000 / (pre->t_run_total / pre->predict_count), + (1000*10 / (pre->t_run_total / pre->predict_count))%10); + + // 
only valid for multiple labels + if(pre->label_num > 1) + { + // print top-k + prediction_top_k(pre); + + // print confusion matrix + prediction_matrix(pre); + } +} + +// stand alone prediction API +// this api test one set of data, return the prediction +nnom_status_t nnom_predict(nnom_model_t *m, uint32_t *label, float *prob) +{ + int32_t max_val, max_index, sum; + int8_t *output; + + if (!m) + return NN_ARGUMENT_ERROR; + + model_run(m); + + // get the output memory + output = m->tail->out->tensor->p_data; + + // multiple neural output + if (tensor_size(m->tail->out->tensor) > 1) + { + // Top 1 + max_val = output[0]; + max_index = 0; + sum = max_val; + for (uint32_t i = 1; i < tensor_size(m->tail->out->tensor); i++) + { + if (output[i] > max_val) + { + max_val = output[i]; + max_index = i; + } + sum += output[i]; + } + // send results + *label = max_index; + if(max_val !=0) + *prob = (float)max_val/127.f; + else + *prob = 0; + } + // single neural output + else + { + *prob = (float)output[0] / 127.f; + if (*prob >= 0.5f) + *label = 1; + else + *label = 0; + } + + return NN_SUCCESS; +} + +static void layer_stat(nnom_layer_t *layer) +{ + // layer stat + if(layer->type != NNOM_RNN) + NNOM_LOG("%-10s - ", default_layer_names[layer->type]); + else + { + NNOM_LOG("%-3s/", default_layer_names[layer->type]); + NNOM_LOG("%-6s - ", default_cell_names[((nnom_rnn_layer_t*)layer)->cell->type]); + } + NNOM_LOG(" %8d ", layer->stat.time); + + // MAC operation + if(layer->stat.macc == 0) + NNOM_LOG(" "); + else if (layer->stat.macc < 10000) + NNOM_LOG("%7d ", (uint32_t)layer->stat.macc); + else if (layer->stat.macc < 1000*1000) + NNOM_LOG("%6dk ", (uint32_t)(layer->stat.macc/1000)); + else if (layer->stat.macc < 1000*1000*1000) + NNOM_LOG("%3d.%02dM ", (uint32_t)(layer->stat.macc/(1000*1000)), (uint32_t)(layer->stat.macc%(1000*1000)/(10*1000))); // xxx.xx M + else + NNOM_LOG("%3d.%02dG ", (uint32_t)(layer->stat.macc/(1000*1000*1000)), (uint32_t)(layer->stat.macc%(1000*1000*1000)/(10*1000*1000))); // xxx.xx G + + // layer efficiency + if (layer->stat.macc != 0 && layer->stat.time != 0) + NNOM_LOG("%d.%02d\n", (uint32_t)(layer->stat.macc / layer->stat.time), (uint32_t)((layer->stat.macc * 100) / (layer->stat.time) % 100)); + else + NNOM_LOG("\n"); +} + +void model_stat(nnom_model_t *m) +{ + size_t total_ops = 0; + size_t total_time = 0; + nnom_layer_t *layer; + uint32_t run_num = 0; + + if (!m) + return; + + layer = m->head; + + NNOM_LOG("\nPrint running stat..\n"); + NNOM_LOG("Layer(#) - Time(us) ops(MACs) ops/us \n"); + NNOM_LOG("--------------------------------------------------------\n"); + while (layer) + { + run_num++; + NNOM_LOG("#%-3d", run_num); + total_ops += layer->stat.macc; + total_time += layer->stat.time; + layer_stat(layer); + if (layer->shortcut == NULL) + break; + layer = layer->shortcut; + } + NNOM_LOG("\nSummary:\n"); + NNOM_LOG("Total ops (MAC): %d", (uint32_t)(total_ops)); + NNOM_LOG("(%d.%02dM)\n", (uint32_t) (total_ops/(1000*1000)), (uint32_t)(total_ops%(1000*1000)/(10000))); + NNOM_LOG("Prediction time :%dus\n", (uint32_t)total_time); + if(total_time != 0) + NNOM_LOG("Efficiency %d.%02d ops/us\n", + (uint32_t)(total_ops / total_time), + (uint32_t)((total_ops * 100) / (total_time) % 100)); + + NNOM_LOG("Total memory:%d\n", (uint32_t)nnom_mem_stat()); +} + +void model_io_format(nnom_model_t *m) +{ + nnom_layer_t *layer; + uint32_t run_num = 0; + + if (!m) + return; + + layer = m->head; + + NNOM_LOG("\nPrint layer input/output..\n"); + NNOM_LOG("Layer(#) - Input(Qnm) Output(Qnm) 
Oshape \n"); + NNOM_LOG("----------------------------------------------------------\n"); + while (layer) + { + run_num++; + NNOM_LOG("#%-3d", run_num); + if(layer->type != NNOM_RNN) + NNOM_LOG("%-10s - ", default_layer_names[layer->type]); + else + { + NNOM_LOG("%-3s/", default_layer_names[layer->type]); + NNOM_LOG("%-6s - ", default_cell_names[((nnom_rnn_layer_t*)layer)->cell->type]); + } + NNOM_LOG(" %2d.%2d", 7-layer->in->tensor->q_dec[0], layer->in->tensor->q_dec[0]); + NNOM_LOG(" %2d.%2d", 7-layer->out->tensor->q_dec[0], layer->out->tensor->q_dec[0]); + NNOM_LOG(" ("); + for (int i = 0; i < 3; i++) + { + if (layer->out->tensor->num_dim > i) + NNOM_LOG("%4d,", layer->out->tensor->dim[i]); + else + NNOM_LOG(" "); + } + NNOM_LOG(")\n"); + + if (layer->shortcut == NULL) + break; + layer = layer->shortcut; + } + +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_activation.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_activation.c new file mode 100644 index 000000000..c90171c77 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_activation.c @@ -0,0 +1,369 @@ + + +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_activation.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *Activation(nnom_activation_t *act) +{ + nnom_activation_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_activation_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_activation_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_ACTIVATION; + layer->super.run = activation_run; + layer->super.build = default_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; // when a layer's io is set to NULL, both will point to same mem. + // put in & out on the layer. 
+ layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + // set activation to layer + layer->act = act; + + // set free method + layer->super.free = activation_free; + + return (nnom_layer_t *)layer; +} + +nnom_layer_t *ReLU(void) +{ + nnom_layer_t *layer = Activation(act_relu()); + if (layer == NULL) + return NULL; + + // set type in layer parent + layer->type = NNOM_RELU; + return layer; +} + +nnom_layer_t *LeakyReLU(float alpha) +{ + nnom_layer_t *layer = Activation(act_leaky_relu(alpha)); + if (layer == NULL) + return NULL; + + // set type in layer parent + layer->type = NNOM_LEAKY_RELU; + return layer; +} + +nnom_layer_t *AdvReLU(float alpha, float max, float threshold) +{ + nnom_layer_t *layer = Activation(act_adv_relu(alpha, max, threshold)); + if (layer == NULL) + return NULL; + + // set type in layer parent + layer->type = NNOM_ADV_RELU; + return layer; +} + +nnom_layer_t *Sigmoid(int32_t dec_bit) +{ + nnom_layer_t *layer = Activation(act_sigmoid(dec_bit)); + if (layer == NULL) + return NULL; + + // set type in layer parent + layer->type = NNOM_SIGMOID; + return layer; +} + +nnom_layer_t *TanH(int32_t dec_bit) +{ + nnom_layer_t *layer = Activation(act_tanh(dec_bit)); + if (layer == NULL) + return NULL; + // set type in layer parent + layer->type = NNOM_TANH; + return layer; +} + +void act_delete(nnom_activation_t* act){ + nnom_free(act); +} + +// activation takes act instance which is created. therefore, it must be free when activation is deleted. +// this is the callback in layer->free +nnom_status_t activation_free(nnom_layer_t *layer) +{ + if(layer) + act_delete(((nnom_activation_layer_t *)layer)->act); + return NN_SUCCESS; +} + +nnom_status_t activation_run(nnom_layer_t *layer) +{ + nnom_activation_layer_t *cl = (nnom_activation_layer_t *)layer; + return act_tensor_run(cl->act, layer->in->tensor); +} + +// porting +static nnom_status_t relu_run(nnom_activation_t* act) +{ + if(act->tensor->bitwidth == 16) + { + #ifdef NNOM_USING_CMSIS_NN + arm_relu_q15(act->tensor->p_data, tensor_size(act->tensor)); + #else + local_relu_q15(act->tensor->p_data, tensor_size(act->tensor)); + #endif + } + else + { + #ifdef NNOM_USING_CMSIS_NN + arm_relu_q7(act->tensor->p_data, tensor_size(act->tensor)); + #else + local_relu_q7(act->tensor->p_data, tensor_size(act->tensor)); + #endif + } + return NN_SUCCESS; +} + +// leaky relu +static nnom_status_t leaky_relu_run(nnom_activation_t* act) +{ + nnom_activation_leaky_relu_t* a = (nnom_activation_leaky_relu_t*) act; + if(act->tensor->bitwidth == 16) + local_leaky_relu_q15(act->tensor->p_data, a->alpha, tensor_size(act->tensor)); + else + local_leaky_relu_q7(act->tensor->p_data, a->alpha, tensor_size(act->tensor)); + return NN_SUCCESS; +} + +// advance relu +static nnom_status_t adv_relu_run(nnom_activation_t* act) +{ + nnom_activation_adv_relu_t* a = (nnom_activation_adv_relu_t*) act; + + // we need to convert float to fixpoint in runtime where we can know the tensor's q format + if(act->tensor->bitwidth == 16) + { + q15_t max = 32767; + q15_t threshold = MIN(a->threshold * (1 << (15 - act->tensor->q_dec[0])), 32767); + q7_t max_scale = (1 << (15 - act->tensor->q_dec[0])); + if(a->max != INFINITY && a->max != 0x7fc00000) + if(a->max * max_scale < max) + max = a->max * max_scale; + local_adv_relu_q15(act->tensor->p_data, a->negative_slope, max, threshold, tensor_size(act->tensor)); + } + // 8bit + else + { + q7_t max = 127; + q7_t threshold = MIN(a->threshold * (1 << (7 - act->tensor->q_dec[0])), 127); + q7_t max_scale = (1 
<< (7 - act->tensor->q_dec[0])); + if(a->max != INFINITY && a->max != 0x7fc00000) // QNAN 0x7fc00000 also represent infinity in script 0.4.1 + if(a->max * max_scale < max) + max = a->max * max_scale; + local_adv_relu_q7(act->tensor->p_data, a->negative_slope, max, threshold, tensor_size(act->tensor)); + } + + return NN_SUCCESS; +} + +static nnom_status_t tanh_run(nnom_activation_t* act) +{ + nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act; + // 16 bit + if(act->tensor->bitwidth == 16) + { + uint8_t int_bit = 15 - a->dec_bit; + #ifdef NNOM_USING_CMSIS_NN + arm_nn_activations_direct_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH); + #else + local_tanh_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit); + #endif + } + else // 8bit + { + uint8_t int_bit = 7 - a->dec_bit; + // arm version cannot handle int_bit > 3 + #ifdef NNOM_USING_CMSIS_NN + if(act->tensor->q_dec[0] <= 3) + arm_nn_activations_direct_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH); + else + #endif + local_tanh_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit); + } + return NN_SUCCESS; +} + +static nnom_status_t sigmoid_run( nnom_activation_t* act) +{ + nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act; + // 16 bit + if(act->tensor->bitwidth == 16) + { + uint8_t int_bit = 15 - a->dec_bit; + #ifdef NNOM_USING_CMSIS_NN + arm_nn_activations_direct_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_SIGMOID); + #else + local_sigmoid_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit); + #endif + } + else // 8bit + { + uint8_t int_bit = 7 - a->dec_bit; + // arm version cannot handle int_bit > 3 + #ifdef NNOM_USING_CMSIS_NN + if(act->tensor->q_dec[0] <= 3) + arm_nn_activations_direct_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH); + else + #endif + local_sigmoid_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit); + } + + return NN_SUCCESS; +} + +static nnom_status_t hard_tanh_run( nnom_activation_t* act) +{ + nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act; + if(act->tensor->bitwidth == 16) + local_hard_tanh_q15(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit + 8); // a->dec is based on 8 bit. + else + local_hard_tanh_q7(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit); + return NN_SUCCESS; +} + +static nnom_status_t hard_sigmoid_run( nnom_activation_t* act) +{ + nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act; + if(act->tensor->bitwidth == 16) + local_hard_sigmoid_q15(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit + 8); // a->dec is based on 8 bit. 
+ else + local_hard_sigmoid_q7(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit); + return NN_SUCCESS; +} + +// +nnom_activation_t* act_relu(void) +{ + nnom_activation_t* act = nnom_mem(sizeof(nnom_activation_t)); + act->run = relu_run; + act->type = ACT_RELU; + return act; +} + +nnom_activation_t* act_leaky_relu(float alpha) +{ + nnom_activation_leaky_relu_t* act = nnom_mem(sizeof(nnom_activation_leaky_relu_t)); + act->super.run = leaky_relu_run; + act->super.type = ACT_LEAKY_RELU; + act->alpha = (q7_t)(alpha*128); + return (nnom_activation_t* )act; +} + +nnom_activation_t* act_adv_relu(float negative_slope, float max, float threshold) +{ + nnom_activation_adv_relu_t* act = nnom_mem(sizeof(nnom_activation_adv_relu_t)); + act->super.run = adv_relu_run; + act->super.type = ACT_ADV_RELU; + act->negative_slope = (q7_t)(negative_slope*128); + act->max = max; + act->threshold = threshold; + return (nnom_activation_t* )act; +} + +nnom_activation_t* act_tanh(int32_t dec_bit) +{ + nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t)); + act->super.run = tanh_run; + act->super.type = ACT_TANH; + act->dec_bit = dec_bit; + return (nnom_activation_t*)act; +} + +nnom_activation_t* act_sigmoid(int32_t dec_bit) +{ + nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t)); + + act->super.run = sigmoid_run; + act->super.type = ACT_SIGMOID; + act->dec_bit = dec_bit; + return (nnom_activation_t*)act; +} + +nnom_activation_t* act_hard_tanh(int32_t dec_bit) +{ + nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t)); + + act->super.run = hard_tanh_run; + act->super.type = ACT_HARD_TANH; + act->dec_bit = dec_bit; + return (nnom_activation_t*)act; +} + +nnom_activation_t* act_hard_sigmoid(int32_t dec_bit) +{ + nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t)); + + act->super.run = hard_sigmoid_run; + act->super.type = ACT_HARD_SIGMOID; + act->dec_bit = dec_bit; + return (nnom_activation_t*)act; +} + +// return the decimal bit if the activation will change the q format of the layer. 
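+// For example (a sketch of the rule implemented below): the ReLU family leaves the layer's q format
+// untouched, while the tanh/sigmoid family saturates its output into a fixed range, so for 8-bit data
+// the result is always q0.7, i.e. dec_bit is forced to 7.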
+int32_t act_get_dec_bit(nnom_activation_type_t type, int32_t dec_bit) +{ + switch(type) + { + case ACT_RELU: + case ACT_LEAKY_RELU: + case ACT_ADV_RELU: + break; + case ACT_TANH: + case ACT_HARD_TANH: + case ACT_SIGMOID: + case ACT_HARD_SIGMOID: + dec_bit = 7; + default:break; + } + return dec_bit; +} + +// a direct api to run activate a tensor +nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor) +{ + act->tensor = tensor; + return act->run(act); +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_avgpool.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_avgpool.c new file mode 100644 index 000000000..8ee220f4c --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_avgpool.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_avgpool.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *avgpool_s(const nnom_pool_config_t * config) +{ + nnom_avgpool_layer_t *cl; + + if(config->num_dim == 1) + { + cl = (nnom_avgpool_layer_t *)AvgPool(kernel(1, config->kernel_size[0]), + stride(1, config->stride_size[0]), + config->padding_type); + } + else + { + cl = (nnom_avgpool_layer_t *)AvgPool(kernel(config->kernel_size[0], config->kernel_size[1]), + stride(config->stride_size[0], config->stride_size[1]), + config->padding_type); + } + + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; // no idea if we need it + } + return (nnom_layer_t *)cl; +} + +nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type) +{ + nnom_layer_t *layer = MaxPool(k, s, pad_type); + + if (layer != NULL) + { + layer->type = NNOM_AVGPOOL; + layer->run = avgpool_run; + layer->build = avgpool_build; + } + return (nnom_layer_t *)layer; +} + +nnom_status_t avgpool_build(nnom_layer_t *layer) +{ + uint32_t size; + // avg pooling share the same output shape, stride, padding setting. + maxpool_build(layer); + + #ifdef NNOM_USING_CMSIS_NN + // however, avg pooling require a computational buffer. + // bufferA size: 2*dim_im_out*ch_im_in + size = layer->out->tensor->dim[1] > layer->out->tensor->dim[0] ? + layer->out->tensor->dim[1] : layer->out->tensor->dim[0]; + layer->comp->size = 2 * size * layer->in->tensor->dim[2]; + #endif + + return NN_SUCCESS; +} + +nnom_status_t avgpool_run(nnom_layer_t *layer) +{ + nnom_avgpool_layer_t *cl = (nnom_avgpool_layer_t *)(layer); + uint16_t out_x, out_y; + // if global pooling + if(layer->out->tensor->num_dim == 1) + { + out_x = 1; out_y = 1; + } + else // normal pooling. 
+ { + out_x = layer->out->tensor->dim[1]; //W + out_y = layer->out->tensor->dim[0]; //h + } + + // 16 bit + if(layer->in->tensor->bitwidth == 16) + { +#ifdef NNOM_USING_CHW + local_avepool_q15_CHW(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + cl->output_shift, + NULL, + layer->out->tensor->p_data); +#else + local_avepool_q15_HWC(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + cl->output_shift, + NULL, + layer->out->tensor->p_data); +#endif + } + // 8bit + else{ +#ifdef NNOM_USING_CHW + local_avepool_q7_CHW(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + cl->output_shift, + NULL, + layer->out->tensor->p_data); +#else //end of CHW + #ifdef NNOM_USING_CMSIS_NN + // 2D, square + if (layer->in->tensor->dim[1] == layer->in->tensor->dim[0] && + layer->out->tensor->dim[1] == layer->out->tensor->dim[0] && + cl->output_shift == 0) + { + arm_avepool_q7_HWC( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->kernel.w, cl->pad.w, cl->stride.w, + layer->out->tensor->dim[1], + layer->comp->mem->blk, + layer->out->tensor->p_data); + } + // none square 2D, or 1D + else + #endif + { + // CMSIS-NN does not support none-square pooling, we have to use local implementation + local_avepool_q7_HWC(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + cl->output_shift, + NULL, + layer->out->tensor->p_data); + } +#endif + } + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_baselayer.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_baselayer.c new file mode 100644 index 000000000..0442fb2b0 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_baselayer.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_baselayer.h" + +// this layer copys the input to the output + +nnom_layer_t *baselayer_s(const nnom_layer_config_t * config) +{ + nnom_layer_t *layer = BaseLayer(); + if(layer) + layer->config = (void*) config; + return layer; +} + +nnom_layer_t *BaseLayer() +{ + nnom_io_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. 
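+ // Layout of the single block allocated above (a sketch):
+ //   [ nnom_io_layer_t | in (nnom_layer_io_t) | out (nnom_layer_io_t) ]
+ // both I/O handles are carved out of the same allocation by the pointer arithmetic below.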
+ in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_BASE; + layer->super.run = default_run; + layer->super.build = default_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + return (nnom_layer_t *)layer; +} + +// this is call while output shape is not defined. +// this will set the output shape same as input shape, and it set only the primary IO +// this cannot be used as first layer, of course... +nnom_status_t default_build(nnom_layer_t *layer) +{ + // get the last layer's output as input shape + layer->in->tensor = layer->in->hook.io->tensor; + // output tensor + // 1. allocate a new tensor for output + // 2. set the same dim, qfmt to the new tensor. + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR,layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now this build has passed the input tensors (shapes, formats) to the new tensors. + return NN_SUCCESS; +} + +// simply copy input to output +nnom_status_t default_run(nnom_layer_t *layer) +{ + if(layer->out->type != NNOM_TENSOR_BUF_NULL) + { + nnom_memcpy(layer->out->tensor->p_data, layer->in->tensor->p_data, tensor_size_byte(layer->in->tensor)); + } + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_concat.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_concat.c new file mode 100644 index 000000000..0e1efa7a2 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_concat.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_concat.h" + +nnom_layer_t *concat_s(const nnom_concat_config_t *config) +{ + nnom_layer_t* layer = Concat(config->axis); + if(layer) + layer->config = (void*) config; + return layer; +} + +// concate method +// concate requires more than one input module. aux input will be allocated in model.merge() +nnom_layer_t *Concat(int8_t axis) +{ + nnom_concat_layer_t *layer; + nnom_layer_io_t *in, *out; + size_t mem_size; + + // apply a block memory for all the sub handles. + mem_size = sizeof(nnom_concat_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_concat_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_CONCAT; + layer->super.run = concat_run; + layer->super.build = concat_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. 
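+ // (only the primary input is created here; when branches are merged, the extra inputs are chained
+ // onto in->aux, which concat_build() and concat_run() below iterate over.)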
+ layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + // axis + layer->axis = axis; + + return (nnom_layer_t *)layer; +} + + +nnom_status_t concat_build(nnom_layer_t *layer) +{ + nnom_concat_layer_t *cl = (nnom_concat_layer_t *)layer; + nnom_layer_io_t *in; + uint32_t in_num = 0; + int32_t num_dim; + + // for each input module, copy the shape from the output of last layer + in = layer->in; + while (in != NULL) + { + //get the last layer's output as input shape + in->tensor = in->hook.io->tensor; + in = in->aux; + in_num++; + } + + // allocate new tensor for output, keep the same dimension lenght + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // convert the axis. + if (cl->axis < 0) + cl->axis = (layer->in->tensor->num_dim + cl->axis); + else if (cl->axis >0) + cl->axis = cl->axis -1; // keras use axis start from 1. we are using 0, 1, 2 (check?) + + // find out the concated axis + num_dim = layer->in->tensor->num_dim; + for (uint32_t i = 0; i < num_dim; i ++) + { + // exclue the concat axies + if (i == cl->axis) + { + layer->out->tensor->dim[i] = 0; + + // add the same axis from all input up. + in = layer->in; + while (in != NULL) + { + layer->out->tensor->dim[i] += in->tensor->dim[i]; + in = in->aux; + } + continue; + } + + // check others, all other must be same shape + in = layer->in; + while (in != NULL && in->aux != NULL) + { + if (in->tensor->dim[i] != in->aux->tensor->dim[i]) + return NN_ARGUMENT_ERROR; + in = in->aux; + } + + // now set other axis + layer->out->tensor->dim[i] = layer->in->tensor->dim[i]; + } + + return NN_SUCCESS; +} + + +#ifdef NNOM_USING_CHW +// axis index converter between HWC and CHW +static inline int chw_i(int hwc, int num_dim) +{ + num_dim = num_dim -1; + hwc = hwc + 1; + if(hwc>num_dim) + hwc = 0; + return hwc; +} +static inline int hwc_i(int chw, int num_dim) +{ + num_dim = num_dim -1; + chw = chw - 1; + if(chw=2) input and 1 output. + nnom_concat_layer_t *cl = (nnom_concat_layer_t *)layer; + nnom_layer_io_t *in; + uint32_t dwidth = layer->in->tensor->bitwidth/8; // data width in byte + +#ifdef NNOM_USING_CHW + // Concatenate for HWC + uint8_t *pin; + uint8_t *pout = layer->out->tensor->p_data; + uint32_t block_size; + uint32_t n_block; + uint8_t num_dim = layer->in->tensor->num_dim; + + // calcualte number of block to concat. the other shapes before the concat axis + n_block = 1; + for(int i= 0; i< chw_i(cl->axis, num_dim); i++) + { + n_block *= layer->in->tensor->dim[hwc_i(i, num_dim)]; + } + + // concat all input layers + for(int i=0; iin; + while (in != NULL) + { + // the block size of concat data in this layer + block_size = dwidth; + for(int j= num_dim-1; j >= chw_i(cl->axis, num_dim); j--) + block_size *= in->tensor->dim[hwc_i(j, num_dim)]; + // concat + pin = (uint8_t *)in->tensor->p_data + i * block_size; + nnom_memcpy(pout, pin, block_size); + pout += block_size; + in = in->aux; + } + } + +#else // end of CHW concate + + // Concatenate for HWC + uint8_t* pin; + uint8_t* pout = layer->out->tensor->p_data; + uint32_t block_size; + uint32_t n_block; + uint8_t num_dim = layer->in->tensor->num_dim; + + // calcualte the number of block to concat. 
(the other shapes before the concat axis) + n_block = 1; + for (int i = 0; i < cl->axis; i++) + n_block *= layer->in->tensor->dim[i]; + + // concat all input layers + for (int i = 0; i < n_block; i++) + { + in = layer->in; + while (in != NULL) + { + // the block size of concat data in this layer + block_size = dwidth; + for (int j = cl->axis; j < num_dim; j++) + block_size *= in->tensor->dim[j]; + // concat + pin = (uint8_t*)in->tensor->p_data + i * block_size; + nnom_memcpy(pout, pin, block_size); + pout += block_size; + in = in->aux; + } + } +#endif + return NN_SUCCESS; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d.c new file mode 100644 index 000000000..ea553aedf --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_conv2d.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +// a machine friendly api, with suffix _s for structured configuration. +nnom_layer_t *conv2d_s(const nnom_conv2d_config_t *config) +{ + nnom_conv2d_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + size_t mem_size; + + // allocate a block memory for all the sub handles and shifts. + mem_size = sizeof(nnom_conv2d_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_conv2d_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_CONV_2D; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + #ifdef NNOM_USING_CMSIS_NN + layer->super.comp = comp; + #endif + // set run method & output shape + layer->super.run = conv2d_run; + layer->super.build = conv2d_build; + layer->super.free = conv2d_free; + + // save the config + layer->super.config = (void*) config; + + // get the private parameters + // test: for 1d input, expend h = 1 + if(config->weight->num_dim == 3) + { + layer->kernel = kernel(1, config->kernel_size[0]); + layer->stride = stride(1, config->stride_size[0]); + layer->dilation = dilation(1, config->dilation_size[0]); + } + else + { + layer->kernel = kernel(config->kernel_size[0], config->kernel_size[1]); + layer->stride = stride(config->stride_size[0], config->stride_size[1]); + layer->dilation = dilation(config->dilation_size[0], config->dilation_size[1]); + } + + layer->filter_mult = config->filter_size; // for convs, this means filter number + layer->padding_type = config->padding_type; + + // get bias and weight tensor, this should be created by script. 
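+ // (the config, including weight/bias tensors and shift arrays, is normally emitted by the NNoM
+ // model-conversion script; only references are kept here, so the generated data must outlive the model.)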
+ layer->weight = config->weight; + layer->bias = config->bias; + + // get shifts + layer->output_rshift = (nnom_qformat_param_t *)config->output_shift; + layer->bias_lshift = (nnom_qformat_param_t *)config->bias_shift; + + // padding + if (layer->padding_type == PADDING_SAME) + { + layer->pad.h = layer->dilation.h * (layer->kernel.h - 1) / 2; + layer->pad.w = layer->dilation.w * (layer->kernel.w - 1) / 2; + layer->pad.c = (1 - 1) / 2; + } + + return (nnom_layer_t *)layer; +} + + +// Conv2D +// multiplier of (output/input channel), +// shape of kernal, shape of strides, weight struct, bias struct +nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b) +{ + nnom_conv2d_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_conv2d_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_conv2d_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_CONV_2D; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + #ifdef NNOM_USING_CMSIS_NN + layer->super.comp = comp; + #endif + // set run method & output shape + layer->super.run = conv2d_run; + layer->super.build = conv2d_build; + + // get the private parameters + layer->kernel = k; + layer->stride = s; + layer->dilation = d; + layer->filter_mult = filters; // for convs, this means filter number + layer->padding_type = pad_type; + + // create weight and bias tensor + layer->weight = new_tensor(NNOM_QTYPE_PER_TENSOR, 4, filters); + layer->bias = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, filters); + + // configure weight tensor manually to support new tensor based backends. + // needs to be very careful + { + // config weight + nnom_shape_data_t dim[4] = {k.h, k.w, k.c, filters}; + *(layer->weight->q_offset) = 0; // we have no support of offset here + *(layer->weight->q_dec) = 0; // not using it + layer->weight->p_data = (void*)w->p_value; + layer->weight->bitwidth = 8; + layer->weight->qtype = NNOM_QTYPE_PER_TENSOR; + nnom_memcpy(layer->weight->dim, dim, layer->weight->num_dim * sizeof(nnom_shape_data_t)); + + // config bias + dim[0] = filters; + *(layer->bias->q_offset) = 0; // we have no support of offset here + *(layer->bias->q_dec) = 0; // not using it + layer->bias->p_data = (void*) b->p_value; + layer->bias->bitwidth = 8; + layer->weight->qtype = NNOM_QTYPE_PER_TENSOR; + nnom_memcpy(layer->bias->dim, dim, layer->bias->num_dim * sizeof(nnom_shape_data_t)); + + // output shift and bias shift + layer->output_rshift = (nnom_qformat_param_t *)&w->shift; + layer->bias_lshift = (nnom_qformat_param_t *)&b->shift; + } + + return (nnom_layer_t *)layer; +} + +// keras's implementation. 
+// source: https://github.com/keras-team/keras/blob/7a39b6c62d43c25472b2c2476bd2a8983ae4f682/keras/utils/conv_utils.py#L85 +uint32_t conv_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation) +{ + if (input_length == 0) + return 0; + uint32_t dilated_filter_size = (filter_size - 1) * dilation + 1; + uint32_t output_length; + if(padding == PADDING_SAME) + output_length = input_length; + else + output_length = input_length - dilated_filter_size + 1; + return (output_length + stride - 1) / stride; +} + +nnom_status_t conv2d_build(nnom_layer_t *layer) +{ + nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for the output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, cl->filter_mult); + // copy then change later. + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // calculate the output tensor q format, only support per tensor quantise now + layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; // need some modification for 16bit. + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now we set up the tensor shape, always HWC format + layer->out->tensor->dim[0] = conv_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h); + layer->out->tensor->dim[1] = conv_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w); + layer->out->tensor->dim[2] = cl->filter_mult; // channel stays the same + + // fill padding + if (cl->padding_type == PADDING_SAME) + { + cl->pad.w = cl->dilation.w * (cl->kernel.w - 1) / 2; + cl->pad.h = cl->dilation.h * (cl->kernel.h - 1) / 2; + cl->pad.c = 0; + } + + #ifdef NNOM_USING_CMSIS_NN + // bufferA size: (1D shape) + // 2*ch_im_in*dim_kernel*dim_kernel + layer->comp->size = 2 * 2 * layer->in->tensor->dim[2] * cl->kernel.w * cl->kernel.h; + #endif + // computational cost: K x K x Cin x Hour x Wout x Cout + layer->stat.macc = cl->kernel.w * cl->kernel.h * layer->in->tensor->dim[2] * tensor_size(layer->out->tensor); + return NN_SUCCESS; +} + +nnom_status_t conv2d_free(nnom_layer_t *layer) +{ + // free weight and bias tensor when we are not initialised from structured configuration. 
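+ // (Conv2D() allocates its own weight/bias tensors with new_tensor(), so they are deleted here;
+ // conv2d_s() only references tensors owned by the config, which are left untouched.)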
+ if(!layer->config) + { + nnom_conv2d_layer_t* cl = (nnom_conv2d_layer_t*)layer; + delete_tensor(cl->weight); + delete_tensor(cl->bias); + } + return NN_SUCCESS; +} + + +nnom_status_t conv2d_run(nnom_layer_t *layer) +{ + nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer; + +#ifdef NNOM_USING_CHW + // CHW format + if(layer->in->tensor->bitwidth == 16) + local_convolve_CHW_q15_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + else + local_convolve_CHW_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return NN_SUCCESS; +#else + // HWC format + #ifdef NNOM_USING_CMSIS_NN + // current cmsis nn does not support dilation + if(cl->dilation.w == 1 && cl->dilation.h == 1 && cl->weight->qtype == NNOM_QTYPE_PER_TENSOR) + { + // 8 bit cmsis nn + if(layer->in->tensor->bitwidth == 8) + { + //RGB + // ch_im_in = 3, w = h + if (layer->in->tensor->dim[2] == 3 && layer->in->tensor->dim[0] == layer->in->tensor->dim[1]) + // squared + if((cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h)) + return (nnom_status_t)arm_convolve_HWC_q7_RGB( + layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], + cl->kernel.w, cl->pad.w, cl->stride.w, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, layer->out->tensor->dim[1], + (q15_t *)(layer->comp->mem->blk), NULL); + + // check if can use optimized function + // ch_im_in is multiple of 4 + // ch_im_out is multiple of 2 + if ((layer->in->tensor->dim[2] % 4 == 0) && (layer->out->tensor->dim[2] % 2 == 0)) + { + // squared + if((layer->in->tensor->dim[0] == layer->in->tensor->dim[1]) + && (layer->out->tensor->dim[0] == layer->out->tensor->dim[1]) + && (cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h)) + { + // 1x1 fast + if (cl->kernel.w == 1 && cl->kernel.h == 1 && cl->stride.w == 1 && cl->stride.h == 1 && cl->pad.w == 0 && cl->pad.h == 0) + return (nnom_status_t)arm_convolve_1x1_HWC_q7_fast_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, layer->out->tensor->dim[1], layer->out->tensor->dim[0], + (q15_t *)(layer->comp->mem->blk), NULL); + // opt square shape + else + return (nnom_status_t)arm_convolve_HWC_q7_fast( + layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], cl->kernel.w, 
cl->pad.w, cl->stride.w, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, + layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL); + } + // opt none square shape + else + return (nnom_status_t)arm_convolve_HWC_q7_fast_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, + cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL); + } + // none optimized + else + { + // none opt square shape + if ((layer->in->tensor->dim[0] == layer->in->tensor->dim[1] && + layer->out->tensor->dim[0] == layer->out->tensor->dim[1]) && + (cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h)) + return (nnom_status_t)arm_convolve_HWC_q7_basic( + layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], cl->kernel.w, cl->pad.w, cl->stride.w, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, + layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL); + // none opt none square shape + else + return (nnom_status_t)arm_convolve_HWC_q7_basic_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, + cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL); + } //end of cmsis-nn none-opt + } //end of 8 bit cmsis-nn + else if (layer->in->tensor->bitwidth == 16) + { + // fast opt + if ((layer->in->tensor->dim[2] % 2 == 0) && (layer->out->tensor->dim[2] % 2 == 0)) + { + if((layer->in->tensor->dim[0] == layer->in->tensor->dim[1]) + && (layer->out->tensor->dim[0] == layer->out->tensor->dim[1]) + && (cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h)) + return (nnom_status_t)arm_convolve_HWC_q15_fast( + layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], cl->kernel.w, cl->pad.w, cl->stride.w, + cl->bias->p_data, cl->bias_lshift[0], + cl->output_rshift[0], layer->out->tensor->p_data, + layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL); + else + return (nnom_status_t)arm_convolve_HWC_q15_fast_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, + cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL); + } + // none opt basic + else + { + local_convolve_HWC_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, 
cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return NN_SUCCESS; + } + + } // end of 16 bit cmsis-nn + } // end of dilation == 1 + else + #endif // NNOM_USING_CMSIS_NN + { + + if(layer->in->tensor->bitwidth == 16) + local_convolve_HWC_q15_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + else + local_convolve_HWC_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return NN_SUCCESS; + } +#endif // end of CHW/HWC + return NN_SUCCESS; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d_trans.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d_trans.c new file mode 100644 index 000000000..5a99380a2 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_conv2d_trans.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-05-31 Jianjia Ma The first version + */ + + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_conv2d_trans.h" + +nnom_layer_t *conv2d_trans_s(const nnom_conv2d_config_t *config) +{ + nnom_layer_t *layer; + layer = conv2d_s(config); + if (layer) + { + layer->type = NNOM_CONV2D_TRANS; + layer->run = conv2d_trans_run; + layer->build = conv2d_trans_build; + } + return layer; +} + +nnom_layer_t *Conv2DTrans(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b) +{ + nnom_layer_t *layer = Conv2D(multiplier, k, s, d, pad_type, w, b); + if (layer != NULL) + { + layer->type = NNOM_CONV2D_TRANS; + layer->run = conv2d_trans_run; + layer->build = conv2d_trans_build; + } + return layer; +} + +// utils, keras method +// https://github.com/keras-team/keras/blob/7a39b6c62d43c25472b2c2476bd2a8983ae4f682/keras/utils/conv_utils.py#L114 +// https://github.com/tensorflow/tensorflow/blob/2b96f3662bd776e277f86997659e61046b56c315/tensorflow/python/layers/utils.py#L156 +uint32_t conv_trans_output_length(uint32_t input_length, uint32_t kernel_size, nnom_padding_t padding, uint32_t stride_size, uint32_t dilation) +{ + input_length *= stride_size; + if (padding == PADDING_VALID) + input_length += MAX(kernel_size - stride_size, 0); + return input_length; +} + +nnom_status_t conv2d_trans_build(nnom_layer_t *layer) +{ + nnom_conv2d_trans_layer_t *cl = (nnom_conv2d_trans_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = 
layer->in->hook.io->tensor; + + // create new tensor for the output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, cl->filter_mult); + // copy then change later. + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // calculate the output tensor q format, only support per tensor quantise now + layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now we set up the tensor shape, always HWC format + layer->out->tensor->dim[0] = conv_trans_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h); + layer->out->tensor->dim[1] = conv_trans_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w); + layer->out->tensor->dim[2] = cl->filter_mult; // channel stays the same + + // fill the correct padding + if(cl->padding_type == PADDING_SAME) + { + cl->pad.h = (cl->kernel.h - cl->stride.h) / 2; // the padding to the output. + cl->pad.w = (cl->kernel.w - cl->stride.w) / 2; +// cl->pad.h = (cl->kernel.h - 1)/2; // the padding to the output. +// cl->pad.w = (cl->kernel.w - 1)/2; + cl->pad.c = 0; + } + else + { + cl->pad.h = 0; + cl->pad.w = 0; + cl->pad.c = 0; + } + + // bufferA size: (1D shape) + // 2*ch_im_in*dim_kernel*dim_kernel + //layer->comp->size = 2 * 2 * layer->in->tensor->dim[2] * cl->kernel.w * cl->kernel.h; + // computational cost: K x K x Cin x Hour x Wout x Cout + layer->stat.macc = cl->kernel.w * cl->kernel.h * layer->in->tensor->dim[2] * tensor_size(layer->out->tensor); + return NN_SUCCESS; +} + + +nnom_status_t conv2d_trans_run(nnom_layer_t *layer) +{ + nnom_conv2d_trans_layer_t *cl = (nnom_conv2d_trans_layer_t *)layer; + +#ifdef NNOM_USING_CHW + // no support for CHW yet + return NN_ARGUMENT_ERROR; +#else + + //return conv2d_run(layer); + + local_conv_trans_HWC_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h, + cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return NN_SUCCESS; +#endif +} + + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_cropping.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_cropping.c new file mode 100644 index 000000000..01abe9265 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_cropping.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_cropping.h" + +nnom_layer_t * cropping_s(const nnom_cropping_config_t *config) +{ + nnom_layer_t *layer = Cropping(config->pad); + if(layer) + layer->config = (void*) config; + return layer; +} + +// Cropping layer +nnom_layer_t *Cropping(nnom_border_t pad) +{ + nnom_layer_t *layer; + // most setting are the same as zero padding + layer = ZeroPadding(pad); + + // now 
change to cropping + layer->type = NNOM_CROPPING; + layer->run = cropping_run; + layer->build = cropping_build; + + return layer; +} + +nnom_status_t cropping_build(nnom_layer_t* layer) +{ + nnom_cropping_layer_t *cl = (nnom_cropping_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + // copy then change later. + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // output shape + if(layer->in->tensor->dim[1] <= (cl->pad.left + cl->pad.right) || + layer->in->tensor->dim[0] <= (cl->pad.top + cl->pad.bottom)) + return NN_ARGUMENT_ERROR; + + layer->out->tensor->dim[0] = layer->in->tensor->dim[0] - (cl->pad.top + cl->pad.bottom); + layer->out->tensor->dim[1] = layer->in->tensor->dim[1] - (cl->pad.left + cl->pad.right); + layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; + return NN_SUCCESS; +} + + +nnom_status_t cropping_run(nnom_layer_t * layer) +{ + nnom_cropping_layer_t *cl = (nnom_cropping_layer_t*)layer; + +#ifdef NNOM_USING_CHW + local_cropping_CHW_q7( +#else + local_cropping_HWC_q7( +#endif + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->pad.top, + cl->pad.bottom, + cl->pad.left, + cl->pad.right, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0]); + + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dense.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dense.c new file mode 100644 index 000000000..17c566c76 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dense.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_dense.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *dense_s(const nnom_dense_config_t *config) +{ + nnom_dense_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_dense_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_dense_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_DENSE; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. 
+ layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + layer->super.comp = comp; + // set run and outshape methods + layer->super.run = dense_run; + layer->super.build = dense_build; + layer->super.free = dense_free; + + // set parameters + layer->output_unit = tensor_get_num_channel(config->weight); + layer->bias = config->bias; + layer->weight = config->weight; + // set shifts + layer->output_rshift = (nnom_qformat_param_t *)config->output_shift; + layer->bias_lshift = (nnom_qformat_param_t *)config->bias_shift; + // set config + layer->super.config = (void*) config; + + return (nnom_layer_t *)layer; +} + +nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b) +{ + nnom_dense_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_dense_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_dense_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_DENSE; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + layer->super.comp = comp; + // set run and outshape methods + layer->super.run = dense_run; + layer->super.build = dense_build; + + // set parameters + layer->output_unit = output_unit; // this is no longer needed. the information is contained in the weight tensor. + + layer->weight = new_tensor(NNOM_QTYPE_PER_TENSOR, 2, output_unit); + layer->bias = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, output_unit); + + // configure weight tensor manually to support new tensor-based backends. + // needs to be very careful + { + // config weight + nnom_shape_data_t dim[2] = {0, output_unit}; // the first dim doesnt matter here. will be file in later. 
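+ // (dim[0] would be the input feature length; the q7 fully-connected kernels take the input size
+ // from the input tensor at run time, so it can safely stay 0 here.)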
+ *(layer->weight->q_offset) = 0; // we have no support of offset here + *(layer->weight->q_dec) = 0; // this is not even correct + layer->weight->p_data = (void*)w->p_value; + layer->weight->bitwidth = 8; + layer->weight->qtype = NNOM_QTYPE_PER_TENSOR; + nnom_memcpy(layer->weight->dim, dim, layer->weight->num_dim * sizeof(nnom_shape_data_t)); + + // config bias + dim[0] = output_unit; + *(layer->bias->q_offset) = 0; // we have no support of offset here + *(layer->bias->q_dec) = 0; // this is not even correct + layer->bias->p_data = (void*)b->p_value; + layer->bias->bitwidth = 8; + layer->weight->qtype = NNOM_QTYPE_PER_TENSOR; + nnom_memcpy(layer->bias->dim, dim, layer->bias->num_dim * sizeof(nnom_shape_data_t)); + } + + // set output shifts + layer->output_rshift = (nnom_qformat_param_t *)&w->shift; + layer->bias_lshift = (nnom_qformat_param_t *)&b->shift; + + return (nnom_layer_t *)layer; +} + +nnom_status_t dense_build(nnom_layer_t *layer) +{ + nnom_dense_layer_t *cl = (nnom_dense_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, tensor_get_num_channel(layer->in->tensor)); + // setup new tensor + nnom_shape_data_t dim[1] = {cl->output_unit}; + tensor_set_attr(layer->out->tensor, cl->weight->q_dec, cl->weight->q_offset, dim, 1, 8); // test, this is not correct + + // calculate the output tensor q format, only support per tensor quantise now + layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // vec_buffer size: dim_vec (*2, q7->q15) ? I am not sure this is right + layer->comp->size = tensor_size(layer->in->tensor)*2; + + // computational cost: In * out + layer->stat.macc = tensor_size(layer->in->tensor) * tensor_size(layer->out->tensor); + return NN_SUCCESS; +} + +nnom_status_t dense_free(nnom_layer_t *layer) +{ + // free weight and bias tensor when we are not initialised from structured configuration. + if(!layer->config) + { + nnom_dense_layer_t* cl = (nnom_dense_layer_t*)layer; + delete_tensor(cl->weight); + delete_tensor(cl->bias); + } + + return NN_SUCCESS; +} + +nnom_status_t dense_run(nnom_layer_t *layer) +{ + nnom_status_t result = NN_SUCCESS; + nnom_dense_layer_t *cl = (nnom_dense_layer_t *)(layer); + nnom_qformat_param_t bias_shift = cl->bias_lshift[0]; // this is not correct but a temporary fix solution for backward compatibility. 
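+ // (the q7 fully-connected kernels called below accept only one scalar shift each, so just the
+ // first element of the bias/output shift arrays is used.)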
+ nnom_qformat_param_t output_shift = cl->output_rshift[0]; + + +#if !(DENSE_WEIGHT_OPT) + #ifdef NNOM_USING_CMSIS_NN + result = (nnom_status_t)arm_fully_connected_q7( + #else + local_fully_connected_q7( + #endif +#else + #ifdef NNOM_USING_CMSIS_NN + result = (nnom_status_t)arm_fully_connected_q7_opt( + #else + local_fully_connected_q7_opt( + #endif +#endif + layer->in->tensor->p_data, + cl->weight->p_data, + tensor_size(layer->in->tensor), layer->out->tensor->dim[0], + bias_shift, output_shift, + cl->bias->p_data, + layer->out->tensor->p_data, (q15_t *)(layer->comp->mem->blk)); + return result; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dw_conv2d.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dw_conv2d.c new file mode 100644 index 000000000..72ac7754e --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_dw_conv2d.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_dw_conv2d.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *dw_conv2d_s(const nnom_conv2d_config_t *config) +{ + nnom_layer_t *layer; + layer = conv2d_s(config); + if (layer) + { + layer->type = NNOM_DW_CONV_2D; + layer->run = dw_conv2d_run; + layer->build = dw_conv2d_build; + } + return layer; +} + +nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type, + const nnom_weight_t *w, const nnom_bias_t *b) +{ + nnom_layer_t *layer = Conv2D(multiplier, k, s, d, pad_type, w, b); // passing multiplier in . + if (layer != NULL) + { + layer->type = NNOM_DW_CONV_2D; + layer->run = dw_conv2d_run; + layer->build = dw_conv2d_build; + } + return layer; +} + +nnom_status_t dw_conv2d_build(nnom_layer_t *layer) +{ + nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor) * cl->filter_mult); + // copy then change later. 
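+ // (copying the input tensor's attributes gives the right bitwidth and q-format defaults; the spatial
+ // dims and the channel count, in_channels * filter_mult for depthwise, are overwritten below.)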
+ tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // calculate the output tensor q format, only support per tensor quantise now + layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now we set up the tensor shape, always HWC format + layer->out->tensor->dim[0] = conv_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h); + layer->out->tensor->dim[1] = conv_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w); + layer->out->tensor->dim[2] = layer->in->tensor->dim[2] * cl->filter_mult; // channel stays the same + + // fill padding + if (cl->padding_type == PADDING_SAME) + { + cl->pad.w = cl->dilation.w * (cl->kernel.w - 1) / 2; + cl->pad.h = cl->dilation.h * (cl->kernel.h - 1) / 2; + cl->pad.c = 0; + } + + // bufferA size: + #ifdef NNOM_USING_CMSIS_NN + layer->comp->size = 2 * 2 * (layer->in->tensor->dim[2] / cl->filter_mult) * cl->kernel.w * cl->kernel.h; + #endif + + // computational cost: K x K x Cin x Hout x Wout x Multiplier + // or : K x K x Cout x Hout x Wout + layer->stat.macc = cl->kernel.w * cl->kernel.h * tensor_size(layer->out->tensor); + return NN_SUCCESS; +} + +nnom_status_t dw_conv2d_run(nnom_layer_t *layer) +{ + nnom_status_t result = NN_SUCCESS; + nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer; + +#ifndef NNOM_USING_CHW + #ifdef NNOM_USING_CMSIS_NN + // Current CMSIS-NN does not support dilation + if(cl->dilation.w ==1 && cl->dilation.h == 1 && cl->weight->qtype == NNOM_QTYPE_PER_TENSOR && cl->filter_mult == 1) + { + // CMSIS-NN only support 1 mulplipier in depthwise conv + if (layer->in->tensor->dim[2] % 2 != 0 || layer->out->tensor->dim[2] % 2) + return NN_ARGUMENT_ERROR; + result = (nnom_status_t)arm_depthwise_separable_conv_HWC_q7_nonsquare( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + cl->bias->p_data, + cl->bias_lshift[0], cl->output_rshift[0], + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL); + } + else + #endif + local_depthwise_separable_conv_HWC_q7_nonsquare( +#else + local_depthwise_separable_conv_CHW_q7_nonsquare( +#endif + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->weight->p_data, + layer->out->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + cl->dilation.w, cl->dilation.h, + cl->bias->p_data, + cl->bias_lshift, cl->output_rshift, cl->weight->qtype, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL); + return result; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_flatten.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_flatten.c new file mode 100644 index 000000000..c976bca9a --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_flatten.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 
2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_flatten.h" + +nnom_layer_t *flatten_s(const nnom_flatten_config_t *config) +{ + nnom_layer_t *layer = Flatten(); + if(layer) + layer->config = (void*) config; + return layer; +} + +nnom_layer_t *Flatten(void) +{ + nnom_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->type = NNOM_FLATTEN; + layer->run = flatten_run; + layer->build = flatten_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + #ifdef NNOM_USING_CHW + out->type = NNOM_TENSOR_BUF_TEMP; // test for CHW format + #else + out->type = NNOM_TENSOR_BUF_NULL; + #endif + // put in & out on the layer. + layer->in = io_init(layer, in); + layer->out = io_init(layer, out); + + return layer; +} + +nnom_status_t flatten_build(nnom_layer_t *layer) +{ + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + // setup new tensor + nnom_shape_data_t dim[1] = {tensor_size(layer->in->tensor)}; + tensor_set_attr(layer->out->tensor, layer->in->tensor->q_dec, layer->in->tensor->q_offset, dim, 1, 8); + + return NN_SUCCESS; +} + +nnom_status_t flatten_run(nnom_layer_t *layer) +{ + #ifdef NNOM_USING_CHW + // CHW format must reorder to HWC for dense layer and all other 1D layer (?) 
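+ // e.g. a 3-channel 2x2 feature map stored as CHW (RRRR GGGG BBBB) is rewritten in HWC order
+ // (RGB RGB RGB RGB), so the flattened vector keeps the element order a HWC-built model expects.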
+ tensor_chw2hwc_q7(layer->out->tensor, layer->in->tensor); + #endif + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_global_pool.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_global_pool.c new file mode 100644 index 000000000..8e0d1ee64 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_global_pool.c @@ -0,0 +1,145 @@ + +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_global_pool.h" + +nnom_layer_t * global_maxpool_s(const nnom_global_pool_config_t *config) +{ + nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalMaxPool(); + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; + } + return (nnom_layer_t *)cl; +} +nnom_layer_t * global_avgpool_s(const nnom_global_pool_config_t *config) +{ + nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalAvgPool(); + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; + } + return (nnom_layer_t *)cl; +} + +nnom_layer_t * global_sumpool_s(const nnom_global_pool_config_t *config) +{ + nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalSumPool(); + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; + } + return (nnom_layer_t *)cl; +} + + +nnom_layer_t *GlobalMaxPool(void) +{ + // create the normal pooling layer, the parameters are left empty to fill in later. + // parameters will be filled in in global_pooling_build() + nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID); + + // change to global max pool + if (layer != NULL) + { + layer->type = NNOM_GLOBAL_MAXPOOL; + layer->build = global_pool_build; + } + + return (nnom_layer_t *)layer; +} + +nnom_layer_t *GlobalAvgPool(void) +{ + // create the normal pooling layer, the parameters are left empty to fill in later. + // parameters will be filled in global_pooling_build() remotely + nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID); + + // change some parameters to be recognised as avg pooling + if (layer != NULL) + { + layer->type = NNOM_GLOBAL_AVGPOOL; + layer->run = avgpool_run; // global and basic pooling share the same runner + layer->build = global_pool_build; + } + + return (nnom_layer_t *)layer; +} + +nnom_layer_t *GlobalSumPool(void) +{ + // create the normal pooling layer, the parameters are left empty to fill in later. 
+ // parameters will be filled in global_pooling_build() remotely + nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID); + + // change some parameters to be recognised as avg pooling + if (layer != NULL) + { + layer->type = NNOM_GLOBAL_SUMPOOL; + layer->run = sumpool_run; // global and basic pooling share the same runner + layer->build = global_pool_build; + } + + return (nnom_layer_t *)layer; +} + +nnom_status_t global_pool_build(nnom_layer_t *layer) +{ + nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, tensor_get_num_channel(layer->in->tensor)); + + nnom_shape_data_t dim[1] = {tensor_get_num_channel(layer->in->tensor)}; // fill the first 2 dim later + tensor_set_attr_v(layer->out->tensor, layer->in->tensor->q_dec[0], 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // different from other *_build(), the kernel..padding left by layer API needs to be set in here + // due to the *_run() methods of global pooling are using the normall pooling's. + // fill in the parameters left by layer APIs (GlobalAvgPool and MaxAvgPool) + cl->kernel = shape(layer->in->tensor->dim[0], layer->in->tensor->dim[1], 1); + cl->stride = shape(1, 1, 1); + cl->pad = shape(0, 0, 0); + cl->padding_type = PADDING_VALID; + + // additionally, avg pooling require computational buffer, which is 2*dim_im_out*ch_im_in + if (layer->type == NNOM_AVGPOOL || layer->type == NNOM_GLOBAL_AVGPOOL) + { + // bufferA size: 2*dim_im_out*ch_im_in + layer->comp->size = 2 * layer->out->tensor->dim[0] * layer->in->tensor->dim[2]; + } + + // additional for sumpool + if (layer->type == NNOM_SUMPOOL || layer->type == NNOM_GLOBAL_SUMPOOL) + layer->comp->size = 4 * tensor_size(layer->out->tensor); + + return NN_SUCCESS; +} + + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_gru_cell.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_gru_cell.c new file mode 100644 index 000000000..7e01e9e2a --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_gru_cell.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-24 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_gru_cell.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_rnn_cell_t *gru_cell_s(const nnom_gru_cell_config_t* config) +{ + nnom_gru_cell_t *cell; + cell = nnom_mem(sizeof(nnom_gru_cell_t)); + if (cell == NULL) + return NULL; + // set methods + cell->super.run = gru_cell_run; + cell->super.build = gru_cell_build; + cell->super.free = gru_cell_free; + cell->super.config = (void*) config; + cell->super.units = config->units; + cell->super.type = NNOM_GRU_CELL; + + // set parameters + cell->bias = config->bias; + cell->weights = config->weights; + cell->recurrent_weights = config->recurrent_weights; + + // q format for intermediate calculation + cell->q_dec_h = config->q_dec_h; + cell->q_dec_z = config->q_dec_z; + + return (nnom_rnn_cell_t *)cell; +} + 
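+// A minimal usage sketch (illustrative only: the tensor names and q-format values below are made up;
+// in practice the whole nnom_gru_cell_config_t comes from the header generated by the NNoM script):
+//
+//   nnom_gru_cell_config_t cfg = {
+//       .units = 16,
+//       .weights = &gru_kernel, .recurrent_weights = &gru_recurrent_kernel, .bias = &gru_bias,
+//       .q_dec_h = 7, .q_dec_z = 7,
+//   };
+//   nnom_rnn_cell_t *cell = gru_cell_s(&cfg); // then hand the cell to the RNN layer constructor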
+nnom_status_t gru_cell_free(nnom_rnn_cell_t* cell) +{ + return NN_SUCCESS; +} + +// the state buffer and computational buffer shape of the cell +nnom_status_t gru_cell_build(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_gru_cell_t *c = (nnom_gru_cell_t *)cell; + + // calculate output shift for the 2 calculation. + // hw = the product of hidden x weight, iw = the product of input x weight + // due to the addition of them, they must have same q format. + // that is -> c->q_dec_z; + + // for the dots in cell: output shift = input_dec + weight_dec - output_dec + c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - c->q_dec_z; + c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->q_dec_z; + + // bias shift = bias_dec - out_dec + c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0]; + + // state size = one timestamp output size. + cell->state_size = cell->units * 2; // Q15 + + // comp buffer size: not required + cell->comp_buf_size = cell->units * (3*3) * 2 + cell->feature_size * 2; //q15 + input q7->q15 buffer. + + // finally, calculate the MAC for info for each timestamp + cell->macc = cell->feature_size * cell->units *3 // input: feature * state * 3 gates + + cell->units * cell->units *8 // recurrent, state * output_unit * (5 gate + 3 mult) + + cell->units * (3 + 3 + 5); // 3 gates, 3 mult, 5 addition + + return NN_SUCCESS; +} + + +// keras implementation as below. +/* + def step(cell_inputs, cell_states): + """Step function that will be used by Keras RNN backend.""" + h_tm1 = cell_states[0] + + # inputs projected by all gate matrices at once + matrix_x = K.dot(cell_inputs, kernel) + matrix_x = K.bias_add(matrix_x, input_bias) + + x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1) + + # hidden state projected by all gate matrices at once + matrix_inner = K.dot(h_tm1, recurrent_kernel) + matrix_inner = K.bias_add(matrix_inner, recurrent_bias) + + recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3, + axis=1) + z = nn.sigmoid(x_z + recurrent_z) + r = nn.sigmoid(x_r + recurrent_r) + hh = nn.tanh(x_h + r * recurrent_h) + + # previous and candidate state mixed by update gate + h = z * h_tm1 + (1 - z) * hh + return h, [h] +*/ + +// +nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_gru_cell_t* c = (nnom_gru_cell_t*) cell; + int act_int_bit = 7 - c->q_dec_z; + // gate data + q15_t* x_z, *x_r, *x_h; + q15_t* recurrent_z, *recurrent_r, *recurrent_h; + q15_t* temp[3]; + + // bias + q7_t* bias = (q7_t*)c->bias->p_data; + q7_t* recurrent_bias = (q7_t*)c->bias->p_data + cell->units*3; + + // state buffer + q15_t* h_tm1 = (q15_t*)cell->in_state; + q15_t* h_t = (q15_t*)cell->out_state; + + // computing buffer + // low |-- buf0 --|-- buf1 --|-- buf2 --|-- input_q15 --| + q15_t *buf[3]; + buf[0] = (q15_t*)layer->comp->mem->blk; + buf[1] = (q15_t*)layer->comp->mem->blk + cell->units*3; + buf[2] = (q15_t*)layer->comp->mem->blk + cell->units*6; + q15_t *in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*9; + + // input q7 cast to q15 + local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size); + + // matrix_x = K.dot(cell_inputs, kernel) + bias --> buf0 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (in_q15_buf, c->weights->p_data, cell->feature_size, + cell->units*3, c->bias_shift + 8, c->oshift_iw, bias, buf[0], NULL); + + // matrix_intter = K.dot(h_tm1, 
recurrent_kernel) + bias -> buf1 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (h_tm1, c->recurrent_weights->p_data, cell->units, + cell->units*3, c->bias_shift + 8, c->oshift_hw, recurrent_bias, buf[1], NULL); + + // split to each gate + x_z = buf[0]; + x_r = buf[0] + cell->units; + x_h = buf[0] + cell->units*2; + recurrent_z = buf[1]; + recurrent_r = buf[1] + cell->units; + recurrent_h = buf[1] + cell->units*2; + // buffers + temp[0] = buf[2]; + temp[1] = buf[2] + cell->units; + temp[2] = buf[2] + cell->units*2; + + /* z = nn.sigmoid(x_z + recurrent_z) */ + // 1. z1 = x_z + recurrent_z ---> temp[0] + local_add_q15(x_z, recurrent_z, temp[0], 0, cell->units); + // 2. z = sigmoid(z1) + local_sigmoid_q15(temp[0], cell->units, act_int_bit); + + /* r = nn.sigmoid(x_r + recurrent_r) */ + // 1. r1 = x_r + recurrent_r ---> temp[1] + local_add_q15(x_r, recurrent_r, temp[1], 0, cell->units); + // 2. r = sigmoid(r1) + local_sigmoid_q15(temp[1], cell->units, act_int_bit); + + /* hh = nn.tanh(x_h + r * recurrent_h) */ + // 1. hh1 = r * recurrent_h ---> temp[2] + local_mult_q15(temp[1], recurrent_h, temp[2], 15, cell->units); + // 2. hh2 = x_h + hh1 ---> temp[1] + local_add_q15(x_h, temp[2], temp[1], 0, cell->units); + // 3. hh = tanh(h2) ---> temp[1] + local_tanh_q15(temp[1], cell->units, act_int_bit); + + /* h = z * h_tm1 + (1 - z) * hh */ + // 1. h1 = z*h_tm1 ---> temp[2] + local_mult_q15(temp[0], h_tm1, temp[2], 15, cell->units); + // 2. h2 = 1 - z ---> h_t state buff + local_1_minor_z_q15(temp[0], h_t, 15, cell->units); + // 3. h3 = h2 * hh ---> temp[0] + local_mult_q15(h_t, temp[1], temp[0], 15, cell->units); + // h = h1 + h3 + local_add_q15(temp[2], temp[0], h_t, 0, cell->units); + + // finally, copy and convert state to output + local_q15_to_q7(h_t, cell->out_data, 8, cell->units); + return NN_SUCCESS; +} + + +// Researve for debugging, printing the intermediate variables/data. 
+#if 0 +// delete after testing completed +static void print_variable_q15(q15_t *data,char*name, int dec_bit, int size) +{ + printf("\n\n"); + printf("%s", name); + for(int i = 0; i < size; i++) + { + if(i%8==0) + printf("\n"); + printf("%f\t", (float) data[i] / (1 << dec_bit)); + } + printf("\n"); +} + +// +nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_gru_cell_t* c = (nnom_gru_cell_t*) cell; + int act_int_bit = 7 - c->q_dec_z; + // gate data + q15_t* x_z, *x_r, *x_h; + q15_t* recurrent_z, *recurrent_r, *recurrent_h; + q15_t* temp[3]; + + // test + //nnom_memset(cell->in_data, 5 * (1<in->tensor->q_dec[0]), cell->feature_size); + + // bias + q7_t* bias = (q7_t*)c->bias->p_data; + q7_t* recurrent_bias = (q7_t*)c->bias->p_data + cell->units*3; + + // state buffer + q15_t* h_tm1 = (q15_t*)cell->in_state; + q15_t* h_t = (q15_t*)cell->out_state; + + // computing buffer + // low |-- buf0 --|-- buf1 --|-- buf2 --|-- input_q15 --| + q15_t *buf[3]; + buf[0] = (q15_t*)layer->comp->mem->blk; + buf[1] = (q15_t*)layer->comp->mem->blk + cell->units*3; + buf[2] = (q15_t*)layer->comp->mem->blk + cell->units*6; + q15_t *in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*9; + + // input q7 cast to q15 + local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size); + + // matrix_x = K.dot(cell_inputs, kernel) + bias --> buf0 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (in_q15_buf, c->weights->p_data, cell->feature_size, + cell->units*3, c->bias_shift + 8, c->oshift_iw, bias, buf[0], NULL); + + // matrix_intter = K.dot(h_tm1, recurrent_kernel) + bias -> buf1 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (h_tm1, c->recurrent_weights->p_data, cell->units, + cell->units*3, c->bias_shift + 8, c->oshift_hw, recurrent_bias, buf[1], NULL); + + print_variable_q15(in_q15_buf, "input", layer->in->tensor->q_dec[0]+8, cell->feature_size); + print_variable_q15(buf[0], "matrix_x", c->q_dec_z+8, cell->units*3); + print_variable_q15(buf[1], "matrix_recurrent", c->q_dec_z+8, cell->units*3); + + // split to each gate + x_z = buf[0]; + x_r = buf[0] + cell->units; + x_h = buf[0] + cell->units*2; + recurrent_z = buf[1]; + recurrent_r = buf[1] + cell->units; + recurrent_h = buf[1] + cell->units*2; + // buffers + temp[0] = buf[2]; + temp[1] = buf[2] + cell->units; + temp[2] = buf[2] + cell->units*2; + + // z = nn.sigmoid(x_z + recurrent_z) + // 1. z1 = x_z + recurrent_z ---> temp[0] + local_add_q15(x_z, recurrent_z, temp[0], 0, cell->units); + // 2. z = sigmoid(z1) + local_sigmoid_q15(temp[0], cell->units, act_int_bit); + print_variable_q15(temp[0], "z", 15, cell->units); + + // r = nn.sigmoid(x_r + recurrent_r) + // 1. r1 = x_r + recurrent_r ---> temp[1] + local_add_q15(x_r, recurrent_r, temp[1], 0, cell->units); + // 2. r = sigmoid(r1) + local_sigmoid_q15(temp[1], cell->units, act_int_bit); + print_variable_q15(temp[1], "r", 15, cell->units); + + // hh = nn.tanh(x_h + r * recurrent_h) + // 1. hh1 = r * recurrent_h ---> temp[2] + local_mult_q15(temp[1], recurrent_h, temp[2], 15, cell->units); + // 2. hh2 = x_h + h1 ---> temp[1] + local_add_q15(x_h, temp[2], temp[1], 0, cell->units); + // 3. hh = tanh(h2) ---> temp[1] + local_tanh_q15(temp[1], cell->units, act_int_bit); + print_variable_q15(temp[1], "hh", 15, cell->units); + + // h = z * h_tm1 + (1 - z) * hh + // 1. 
h1 = z*h_tm1 ---> temp[2] + local_mult_q15(temp[0], h_tm1, temp[2], 15, cell->units); + print_variable_q15( temp[2], "h1", 15, cell->units); + // 2. h2 = 1 - z ---> h_t state buff + local_1_minor_z_q15(temp[0], h_t, 15, cell->units); + print_variable_q15( h_t, "h2", 15, cell->units); + // 3. h3 = h2 * hh ---> temp[0] + local_mult_q15(h_t, temp[1], temp[0], 15, cell->units); + print_variable_q15( temp[0], "h3", 15, cell->units); + // h = h1 + h3 + local_add_q15(temp[2], temp[0], h_t, 0, cell->units); + print_variable_q15(h_t, "h", 15, cell->units); + + // finally, copy and convert state to output + local_q15_to_q7(h_t, cell->out_data, 8, cell->units); + return NN_SUCCESS; +} +#endif diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_input.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_input.c new file mode 100644 index 000000000..f1fc3b9c9 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_input.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_input.h" + +nnom_layer_t *input_s(const nnom_io_config_t* config) +{ + nnom_io_layer_t *layer; + nnom_layer_io_t *in, *out; + // apply a block memory for all the sub handles. + layer = nnom_mem(sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_INPUT; + layer->super.run = input_run; + layer->super.build = input_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + /* + // some other layers (Conv, pooling) are not supporting 12 d input, we still expand the 1,2 dimension to 3 + // test -> native support 1,2,3 D input. + layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, config->tensor->num_dim, tensor_get_num_channel(config->tensor)); + tensor_cpy_attr(layer->super.in->tensor, config->tensor); + layer->buf = config->tensor->p_data; + layer->dec_bit = config->tensor->q_dec[0]; + */ + + // set parameters + if(config->tensor->num_dim == 1) // test for 1d input, expend h = 1 + layer->shape = shape(1, 1, config->tensor->dim[0]); + else if (config->tensor->num_dim == 2) // test for 1d input, expend h = 1 + layer->shape = shape(1, config->tensor->dim[0], config->tensor->dim[1]); + else + layer->shape = shape(config->tensor->dim[0], config->tensor->dim[1], config->tensor->dim[2]); + layer->buf = config->tensor->p_data; + layer->dec_bit = config->tensor->q_dec[0]; + + // experimental: fixed input dim to 3 + // input normally dont have a tensor, so we create one to store the initial data. 
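+ // (e.g. a 1-D input of 128 features is stored as (1, 1, 128) in HWC, and a 2-D 28x28 input as (1, 28, 28), following the shape expansion above)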
+ nnom_shape_data_t dim[3] = {layer->shape.h, layer->shape.w, layer->shape.c}; + layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 3, tensor_get_num_channel(config->tensor)); + tensor_set_attr_v(layer->super.in->tensor, layer->dec_bit, 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8); + return (nnom_layer_t *)layer; +} + +nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf) +{ + nnom_io_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + layer = nnom_mem(sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_INPUT; + layer->super.run = input_run; + layer->super.build = input_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + // set parameters + layer->shape = input_shape; + layer->buf = p_buf; + layer->dec_bit = 7; + + // experimental: fixed input dim to 3 + // input normally dont have a tensor, so we create one to store the initial data. + nnom_shape_data_t dim[3] = { input_shape.h, input_shape.w, input_shape.c }; + layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 3, input_shape.c); + tensor_set_attr_v(layer->super.in->tensor, layer->dec_bit, 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8); + return (nnom_layer_t *)layer; +} + +nnom_status_t input_build(nnom_layer_t* layer) +{ + // the input tensor of inputlayer has assigned previously + + // output tensor + // 1. allocate a new tensor for output + // 2. set the same dim, qfmt to the new tensor. + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // now this build has passed the input tensors (shapes, formats) to the new tensors. 
+ return NN_SUCCESS; +} + + +nnom_status_t input_run(nnom_layer_t *layer) +{ + nnom_io_layer_t *cl = (nnom_io_layer_t *)layer; +#ifdef NNOM_USING_CHW + if(layer->in->tensor->num_dim == 3) + { + nnom_3d_shape_t shape = {layer->in->tensor->dim[0], layer->in->tensor->dim[1], layer->in->tensor->dim[2]}; + hwc2chw_q7(shape, cl->buf, layer->in->tensor->p_data); + } + else if (layer->in->tensor->num_dim == 2) + { + nnom_3d_shape_t shape = {1, layer->in->tensor->dim[0], layer->in->tensor->dim[1]}; + hwc2chw_q7(shape, cl->buf, layer->in->tensor->p_data); + } + else +#endif + nnom_memcpy(layer->in->tensor->p_data, cl->buf, tensor_size(layer->in->tensor)); + + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lambda.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lambda.c new file mode 100644 index 000000000..31e9c7c5e --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lambda.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_lambda.h" + +nnom_layer_t *lambda_s(const nnom_lambda_config_t * config) +{ + nnom_lambda_layer_t *cl = (nnom_lambda_layer_t *)Lambda( + config->run_func_name, + config->build_func_name, + config->free_func_name, + config->parameters); + if(cl) + cl->super.config = (void*) config; + return (nnom_layer_t *)cl; +} + +// TODO: extended to multiple IO layer +nnom_layer_t *Lambda(nnom_status_t (*run)(nnom_layer_t *), + nnom_status_t (*build)(nnom_layer_t *), + nnom_status_t (*free)(nnom_layer_t *), + void *parameters) +{ + nnom_lambda_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_lambda_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set buf type. + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + + // set io modules to the layer + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + // layer type + layer->super.type = NNOM_LAMBDA; + + // user parameters + layer->parameters = parameters; + + // free method + layer->super.free = free; + + // output shape method. pass NULL in will use the default outshape method, which set the output shape same as input shape. + if (build == NULL) + layer->super.build = default_build; + else + layer->super.build = build; + // run method. default_run() will simply copy data from input tensor to output tensor. 
+ if(run == NULL) + layer->super.run = default_run; + else + layer->super.run = run; + + return (nnom_layer_t *)layer; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lstm_cell.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lstm_cell.c new file mode 100644 index 000000000..ed4a120b4 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_lstm_cell.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-24 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_lstm_cell.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +// LSTM RNN +// unit = output shape +// type of activation +nnom_rnn_cell_t *lstm_cell_s(const nnom_lstm_cell_config_t* config) +{ + nnom_lstm_cell_t *cell; + cell = nnom_mem(sizeof(nnom_lstm_cell_t)); + if (cell == NULL) + return NULL; + // set methods + cell->super.run = lstm_cell_q7_q15_run; + cell->super.build = lstm_cell_q7_q15_build; + cell->super.free = lstm_cell_free; + cell->super.config = (void*) config; + cell->super.units = config->units; + cell->super.type = NNOM_LSTM_CELL; + + // set parameters + cell->bias = config->bias; + cell->weights = config->weights; + cell->recurrent_weights = config->recurrent_weights; + + // q format for intermediate calculation + cell->q_dec_c = config->q_dec_c; + cell->q_dec_h = config->q_dec_h; + cell->q_dec_z = config->q_dec_z; + + return (nnom_rnn_cell_t *)cell; +} + +nnom_status_t lstm_cell_free(nnom_rnn_cell_t* cell) +{ + return NN_SUCCESS; +} + +// keras implementation as below. +/* + def step(cell_inputs, cell_states): + """Step function that will be used by Keras RNN backend.""" + h_tm1 = cell_states[0] # previous memory state + c_tm1 = cell_states[1] # previous carry state + + z = K.dot(cell_inputs, kernel) -> q_iw + z += K.dot(h_tm1, recurrent_kernel) -> q_hw + z = K.bias_add(z, bias) + + z0, z1, z2, z3 = array_ops.split(z, 4, axis=1) + + i = nn.sigmoid(z0) + f = nn.sigmoid(z1) + c = f * c_tm1 + i * nn.tanh(z2) + o = nn.sigmoid(z3) + + h = o * nn.tanh(c) + return h, [h, c] +*/ + + + +// the state buffer and computational buffer shape of the cell +nnom_status_t lstm_cell_q7_q15_build(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_lstm_cell_t *c = (nnom_lstm_cell_t *)cell; + + // calculate output shift for the 2 calculation. + // hw = the product of hidden x weight, iw = the product of input x weight + // due to the addition of them, they must have same q format. + // that is -> c->q_dec_z; + + // for the dots in cell: output shift = input_dec + weight_dec - output_dec + c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - c->q_dec_z; + c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->q_dec_z; + + // bias shift = bias_dec - out_dec + c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0]; + + // state size = one timestamp output size. + cell->state_size = cell->units * 2 * 2; // Q15 + + // // comp buffer size: not required + cell->comp_buf_size = cell->units * 12 * 2 + cell->feature_size * 2; //q15 + input q7->q15 buffer. 
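+ // (buffer breakdown: 12*units q15 values cover the three scratch areas buf0/buf1/buf2 of 4*units entries each used in lstm_cell_q7_q15_run(), plus feature_size q15 values for the q7-to-q15 converted input)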
+ + // finally, calculate the MAC for info (for each timestamp) + cell->macc = cell->feature_size * cell->units *4 // input: feature * state * 4 gates + + cell->units * cell->units *4 // recurrent, state + + cell->units *10; // output_unit * (5 gate + 3 mult + 2 addition) + + return NN_SUCCESS; +} + +// Q7 input output +// Q7 weights +// Q15 states and intermediate buffer +nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_lstm_cell_t* c = (nnom_lstm_cell_t*) cell; + int act_int_bit = 7 - c->q_dec_z; + + // state buffer + // low |-- hidden --|-- carry --| high + q15_t* h_tm1 = (q15_t*)cell->in_state; + q15_t* c_tm1 = (q15_t*)cell->in_state + cell->units; + q15_t* o_state[2]; + o_state[0] = (q15_t*)cell->out_state; + o_state[1] = (q15_t*)cell->out_state + cell->units; + + // computing buffer + // low |-- buf0 --|-- buf1 --|-- buf2 --|-- input q15 --| + q15_t* z[4]; + q15_t *buf0, *buf1, *buf2, *in_q15_buf; + buf0 = (q15_t*)layer->comp->mem->blk; + buf1 = (q15_t*)layer->comp->mem->blk + cell->units*4; + buf2 = (q15_t*)layer->comp->mem->blk + cell->units*8; + in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*12; + + // input q7 -> q15 + local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size); + + // z1 = K.dot(cell_inputs, kernel) + bias -> buf1 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (in_q15_buf, c->weights->p_data, cell->feature_size, cell->units*4, c->bias_shift + 8, c->oshift_iw, c->bias->p_data, buf1, NULL); + + // z2 = K.dot(h_tm1, recurrent_kernel) -> buf2 + // --- arm version must use bias, so we have to use local implementation + local_fully_connected_mat_q7_vec_q15_opt(h_tm1, c->recurrent_weights->p_data, + cell->units, cell->units*4, 0, c->oshift_hw, NULL, buf2, NULL); + + // z = z1 + z2 -> buf0 + local_add_q15(buf1, buf2, buf0, 0, cell->units*4); + + // split the data to each gate + z[0] = buf0; + z[1] = buf0 + cell->units; + z[2] = buf0 + cell->units*2; + z[3] = buf0 + cell->units*3; + + // i = nn.sigmoid(z0) + local_sigmoid_q15(z[0], cell->units, act_int_bit); + // f = nn.sigmoid(z1) + local_sigmoid_q15(z[1], cell->units, act_int_bit); + // o = nn.sigmoid(z3) + local_sigmoid_q15(z[3], cell->units, act_int_bit); + + /* c = f * c_tm1 + i * nn.tanh(z2) for the step 1-3. */ + // 1. i * tanh(z2) -> buf1 + local_tanh_q15(z[2], cell->units, act_int_bit); + local_mult_q15(z[0], z[2], buf1, 30 - (c->q_dec_c+8), cell->units); + // 2. f * c_tm1 -> o_state[0] + local_mult_q15(z[1], c_tm1, o_state[0], 15, cell->units); + // 3. c = i*tanh + f*c_tm1 -> o_state[1] ** fill the upper state (carry) + local_add_q15(buf1, o_state[0], o_state[1], 0, cell->units); + + /* h = o * nn.tanh(c) -> o_state[0] for the step 1-2 */ + // 1. tanh(c) -> buf2 --- first copy then activate. + nnom_memcpy(buf2, o_state[1], cell->units*2); + local_tanh_q15(buf2, cell->units, 7 - c->q_dec_c); // this int bit is under 8bit + // 2. 
h = o*tanh(c) -> o_state[0] ** fill the lower state (memory, hidden) + local_mult_q15(z[3], buf2, o_state[0], 15, cell->units); + + // copy and shift q15 to q7 ** (copy hidden to output) + local_q15_to_q7(o_state[0], cell->out_data, 8, cell->units); + return NN_SUCCESS; +} + + +// researve for debugging, printing the intermediate products and variables +#if 0 +static void print_variable(q7_t* data,char*name, int dec_bit, int size) +{ + printf("\n"); + printf("%s\n", name); + for(int i = 0; i < size; i++) + { + if(i%8==0) + printf("\n"); + printf("%f\t", (float) data[i] / (1 << dec_bit)); + } + printf("\n"); +} + +static void print_variable_q15(q15_t *data,char*name, int dec_bit, int size) +{ + printf("\n\n"); + printf("%s", name); + for(int i = 0; i < size; i++) + { + if(i%8==0) + printf("\n"); + printf("%f\t", (float) data[i] / (1 << dec_bit)); + } + printf("\n"); +} + + +// Q7 input output +// Q7 weights +// Q15 states and intermediate buffer +nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_rnn_layer_t* cl = (nnom_rnn_layer_t *) layer; + nnom_lstm_cell_t* c = (nnom_lstm_cell_t*) cell; + int act_int_bit = 7 - c->q_dec_z; + + // test + //nnom_memset(cell->in_data, 32, cell->feature_size); + + // state buffer + // low |-- hidden --|-- carry --| high + q15_t* h_tm1 = (q15_t*)cell->in_state; + q15_t* c_tm1 = (q15_t*)cell->in_state + cell->units; + q15_t* o_state[2]; + o_state[0] = (q15_t*)cell->out_state; + o_state[1] = (q15_t*)cell->out_state + cell->units; + + // computing buffer + // low |-- buf0 --|-- buf1 --|-- buf2 --|-- input q15 --| + q15_t* z[4]; + q15_t *buf0, *buf1, *buf2, *in_q15_buf; + buf0 = (q15_t*)layer->comp->mem->blk; + buf1 = (q15_t*)layer->comp->mem->blk + cell->units*4; + buf2 = (q15_t*)layer->comp->mem->blk + cell->units*8; + in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*12; + + // input q7 -> q15 + //local_q7_to_q15_no_shift(cell->in_data, in_q15_buf, cell->feature_size); + local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size); + print_variable_q15(in_q15_buf, "input", layer->in->tensor->q_dec[0] + 8, cell->feature_size); + print_variable_q15(h_tm1, "h_tml", 15, cell->units); + print_variable_q15(c_tm1, "c_tml", c->q_dec_c + 8, cell->units); + + // z1 = K.dot(cell_inputs, kernel) + bias -> buf1 + #ifdef NNOM_USING_CMSIS_NN + arm_fully_connected_mat_q7_vec_q15_opt + #else + local_fully_connected_mat_q7_vec_q15_opt + #endif + (in_q15_buf, c->weights->p_data, cell->feature_size, cell->units*4, c->bias_shift + 8, c->oshift_iw, c->bias->p_data, buf1, NULL); + + // z2 = K.dot(h_tm1, recurrent_kernel) -> buf2 + // arm version must use bias, so we have to use local implementation + local_fully_connected_mat_q7_vec_q15_opt(h_tm1, c->recurrent_weights->p_data, + cell->units, cell->units*4, 0, c->oshift_hw, NULL, buf2, NULL); + + // z = z1 + z2 -> buf0 + local_add_q15(buf1, buf2, buf0, 0, cell->units*4); + + print_variable_q15(buf0, "z", c->q_dec_z + 8, cell->units*4); + print_variable_q15(buf1, "z1", c->q_dec_z + 8, cell->units*4); + print_variable_q15(buf2, "z2", c->q_dec_z + 8, cell->units*4); + + // split the data to each gate + z[0] = buf0; + z[1] = buf0 + cell->units; + z[2] = buf0 + cell->units*2; + z[3] = buf0 + cell->units*3; + + // i = nn.sigmoid(z0) + local_sigmoid_q15(z[0], cell->units, act_int_bit); + // f = nn.sigmoid(z1) + local_sigmoid_q15(z[1], cell->units, act_int_bit); + // o = nn.sigmoid(z3) + local_sigmoid_q15(z[3], cell->units, act_int_bit); + + print_variable_q15(z[0], "z[0] - i", 
15, cell->units); + print_variable_q15(z[1], "z[1] - f", 15, cell->units); + print_variable_q15(z[3], "z[3] - o", 15, cell->units); + + /* c = f * c_tm1 + i * nn.tanh(z2) for the step 1-3. */ + // 1. i * tanh(z2) -> buf1 + local_tanh_q15(z[2], cell->units, act_int_bit); + print_variable_q15(z[2], "z[2] - ?", 15, cell->units); + + local_mult_q15(z[0], z[2], buf1, 30 - (c->q_dec_c+8), cell->units); //q0.15 * q0.15 >> (shift) = (q_c + 8) // i am not very sure + print_variable_q15(buf1, "c2: i * tanh(z2) ", c->q_dec_c+8, cell->units); + + // 2. f * c_tm1 -> o_state[0] + local_mult_q15(z[1], c_tm1, o_state[0], 15, cell->units); + print_variable_q15(o_state[0], "c1: f * c_tm1", c->q_dec_c+8, cell->units); + + // 3. c = i*tanh + f*c_tm1 -> o_state[1] ** fill the upper state (carry) + local_add_q15(buf1, o_state[0], o_state[1], 0, cell->units); + print_variable_q15(o_state[1], "c = c1+c2", c->q_dec_c+8, cell->units); + + /* h = o * nn.tanh(c) -> o_state[0] for the step 1-2 */ + // 1. tanh(c) -> buf2 --- first copy then activate. + nnom_memcpy(buf2, o_state[1], cell->units*2); + local_tanh_q15(buf2, cell->units, 7 - c->q_dec_c); // this int bit is under 8bit + print_variable_q15(buf2, "tanh(c)", 15, cell->units); + + // 2. h = o*tanh(c) -> o_state[0] ** fill the lower state (memory, hidden) + local_mult_q15(z[3], buf2, o_state[0], 15, cell->units); + print_variable_q15(o_state[0], "h = o*tanh(c)", 15, cell->units); + + // copy and shift q15 to q7 ** (copy hidden to output) + local_q15_to_q7(o_state[0], cell->out_data, 8, cell->units); + + print_variable(cell->out_data, "q7 output)", 7, cell->units); + + return NN_SUCCESS; +} +#endif diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_matrix.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_matrix.c new file mode 100644 index 000000000..e011ecc0f --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_matrix.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_matrix.h" + +// TODO, completely change this file to local version +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_status_t matrix_build(nnom_layer_t *layer); + +nnom_layer_t *add_s(const nnom_matrix_config_t * config) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Add(config->output_shift); + if(cl) + cl->super.config = (void*) config; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *sub_s(const nnom_matrix_config_t * config) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Sub(config->output_shift); + if(cl) + cl->super.config = (void*) config; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *mult_s(const nnom_matrix_config_t * config) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Mult(config->output_shift); + if(cl) + cl->super.config = (void*) config; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *Add(int16_t oshift) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer(); + if (cl == NULL) + return NULL; + // set type in layer parent + cl->super.type = NNOM_ADD; + cl->super.run = add_run; + cl->oshift = oshift; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *Sub(int16_t oshift) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer(); 
+ if (cl == NULL) + return NULL; + // set type in layer parent + cl->super.type = NNOM_SUB; + cl->super.run = sub_run; + cl->oshift = oshift; + return (nnom_layer_t *)cl; +} + +nnom_layer_t *Mult(int16_t oshift) +{ + nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer(); + if (cl == NULL) + return NULL; + // set type in layer parent + cl->super.type = NNOM_MULT; + cl->super.run = mult_run; + cl->oshift = oshift; + return (nnom_layer_t *)cl; +} + +// init a base layer instance with same shape 1 in 1 out. More IO can be added later +// mainly used by matrix calculation (add, mult, sub) +nnom_layer_t *_same_shape_matrix_layer() +{ + nnom_matrix_layer_t *layer; + nnom_layer_io_t *in, *out; + //nnom_buf_t *comp; + size_t mem_size; + + // apply a block memory for all the sub handles. + mem_size = sizeof(nnom_matrix_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_matrix_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + //comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.build = matrix_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + //comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + //layer->super.comp = comp; + return (nnom_layer_t*)layer; +} + +nnom_status_t matrix_build(nnom_layer_t *layer) +{ + // get the last layer's output as input shape (if more than one) + nnom_layer_io_t *in = layer->in; + while(in) + { + in->tensor = in->hook.io->tensor; + in = in->aux; + } + // output tensor + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR,layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now this build has passed the input tensors (shapes, formats) to the new tensors. 
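+ // (note: all hooked inputs are expected to share the first input's shape and q-format, since add_run/sub_run/mult_run work element-wise over tensor_size(out) values with a single output shift)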
+ return NN_SUCCESS; +} + + +nnom_status_t add_run(nnom_layer_t *layer) +{ + nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer; + nnom_layer_io_t *in = layer->in;; + size_t t_size = tensor_size(layer->out->tensor); + int32_t oshift = cl->oshift; + size_t num_input = nnom_io_length(layer->in); + q7_t *input_mem_blk[MAX_INPUT_LAYER]; + + // if there is only 2 matrix + if(num_input == 2) + { + #ifdef NNOM_USING_CMSIS_NN + if(oshift == 0) + arm_add_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size); + else + #endif + local_add_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size); + } + else + { + for(int i = 0; i < num_input; i++) + { + input_mem_blk[i] = in->tensor->p_data; + in = in->aux; + } + local_multiple_add_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk); + } + + return NN_SUCCESS; +} + +nnom_status_t sub_run(nnom_layer_t *layer) +{ + nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer; + nnom_layer_io_t *in = layer->in; + size_t t_size = tensor_size(layer->out->tensor); + int32_t oshift = cl->oshift; + size_t num_input = nnom_io_length(layer->in); + q7_t *input_mem_blk[MAX_INPUT_LAYER]; + + // if there is only 2 matrix + if(num_input == 2) + { + // the first 2 matrix + #ifdef NNOM_USING_CMSIS_NN + if(oshift == 0) + arm_sub_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size); + else + #endif + local_sub_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size); + } + else + { + for(int i = 0; i < num_input; i++) + { + input_mem_blk[i] = in->tensor->p_data; + in = in->aux; + } + local_multiple_sub_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk); + } + return NN_SUCCESS; +} + +nnom_status_t mult_run(nnom_layer_t *layer) +{ + nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer; + nnom_layer_io_t *in = layer->in; + size_t t_size = tensor_size(layer->out->tensor); + int32_t oshift = cl->oshift; + size_t num_input = nnom_io_length(layer->in); + q7_t *input_mem_blk[MAX_INPUT_LAYER]; + + // if there is only 2 matrix + if(num_input == 2) + { + // the first 2 matrix + #ifdef NNOM_USING_CMSIS_NN + if(oshift == 0) + arm_mult_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size); + else + #endif + local_mult_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size); + } + else + { + for(int i = 0; i < num_input; i++) + { + input_mem_blk[i] = in->tensor->p_data; + in = in->aux; + } + local_multiple_mult_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk); + } + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_maxpool.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_maxpool.c new file mode 100644 index 000000000..fe904bad8 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_maxpool.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_maxpool.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *maxpool_s(const 
nnom_pool_config_t * config) +{ + nnom_layer_t *layer; + + // test, to accomodate 1d and 2d input + if(config->num_dim == 1) + { + layer = MaxPool(kernel(1, config->kernel_size[0]), + stride(1, config->stride_size[0]), + config->padding_type); + } + else + { + layer = MaxPool(kernel(config->kernel_size[0], config->kernel_size[1]), + stride(config->stride_size[0], config->stride_size[1]), + config->padding_type); + } + + if(layer) + layer->config = (void*) config; + return layer; +} + +nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type) +{ + nnom_maxpool_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_maxpool_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_maxpool_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_MAXPOOL; + layer->super.run = maxpool_run; + layer->super.build = maxpool_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + layer->super.comp = comp; + + // set parameters + layer->kernel = k; + layer->stride = s; + layer->padding_type = pad_type; + + // padding + if (layer->padding_type == PADDING_SAME) + { + layer->pad.h = (k.h - 1) / 2; + layer->pad.w = (k.w - 1) / 2; + layer->pad.c = 1; // no meaning + } + else + { + layer->pad.h = 0; + layer->pad.w = 0; + layer->pad.c = 0; + } + return (nnom_layer_t *)layer; +} + +nnom_status_t maxpool_build(nnom_layer_t *layer) +{ + nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + // copy then change later. + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // now we set up the tensor shape, always HWC format + if (cl->padding_type == PADDING_SAME) + { + layer->out->tensor->dim[0] = NN_CEILIF(layer->in->tensor->dim[0], cl->stride.h); + layer->out->tensor->dim[1] = NN_CEILIF(layer->in->tensor->dim[1], cl->stride.w); + layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; // channel stays the same + } + else + { + layer->out->tensor->dim[0] = NN_CEILIF(layer->in->tensor->dim[0] - cl->kernel.h + 1, cl->stride.h); + layer->out->tensor->dim[1] = NN_CEILIF(layer->in->tensor->dim[1] - cl->kernel.w + 1, cl->stride.w); + layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; + } + + return NN_SUCCESS; +} + +nnom_status_t maxpool_run(nnom_layer_t *layer) +{ + nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)(layer); + + uint16_t out_x, out_y; + + // if global pooling + if(layer->out->tensor->num_dim == 1) + { + out_x = 1; out_y = 1; + } + else // normal pooling. 
+ { + out_x = layer->out->tensor->dim[1]; //W + out_y = layer->out->tensor->dim[0]; //h + } + +#ifdef NNOM_USING_CHW + local_maxpool_q7_CHW(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + NULL, + layer->out->tensor->p_data); +#else //end of CHW + // HWC + #ifdef NNOM_USING_CMSIS_NN + // 2D, square + if (layer->in->tensor->dim[1] == layer->in->tensor->dim[0] && + layer->out->tensor->dim[1] == layer->out->tensor->dim[0]) + { + arm_maxpool_q7_HWC( + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[2], + cl->kernel.w, cl->pad.w, cl->stride.w, + layer->out->tensor->dim[1], + NULL, + layer->out->tensor->p_data); + } + // none square 2D, or 1D + else + #endif + { + // CMSIS-NN does not support none-square pooling, we have to use local implementation + local_maxpool_q7_HWC(layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->kernel.w, cl->kernel.h, + cl->pad.w, cl->pad.h, + cl->stride.w, cl->stride.h, + out_x, out_y, + NULL, + layer->out->tensor->p_data); + } +#endif // CHW/HWC + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_output.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_output.c new file mode 100644 index 000000000..bed1c89cd --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_output.c @@ -0,0 +1,54 @@ + +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_output.h" + +nnom_layer_t *output_s(const nnom_io_config_t* config) +{ + nnom_layer_t *layer = input_s(config); + if(layer) + { + layer->config = (void*) config; + layer->type = NNOM_OUTPUT; + layer->run = output_run; + layer->build = default_build; + } + return layer; +} + +nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf) +{ + // they are acturally the same.. expect the type defined + nnom_layer_t *layer = Input(output_shape, p_buf); + if (layer != NULL) + { + layer->type = NNOM_OUTPUT; + layer->run = output_run; + layer->build = default_build; + } + return layer; +} + +nnom_status_t output_run(nnom_layer_t *layer) +{ + nnom_io_layer_t *cl = (nnom_io_layer_t *)layer; + nnom_memcpy(cl->buf, layer->in->tensor->p_data, tensor_size(layer->out->tensor)); // in->memory -> user memory + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_reshape.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_reshape.c new file mode 100644 index 000000000..1b6ae82f7 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_reshape.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-12-07 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_reshape.h" + + +nnom_layer_t *reshape_s(const nnom_reshape_config_t *config) +{ + nnom_reshape_layer_t *layer; + nnom_layer_io_t *in, *out; + + // allocate a block memory for all the sub handles and shifts. 
+ size_t mem_size = sizeof(nnom_reshape_layer_t) + sizeof(nnom_layer_io_t) * 2 ; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_reshape_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_RESHAPE; + layer->super.run = reshape_run; + layer->super.build = reshape_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_NULL; + + // config + //nnom_memcpy(layer->dim, config->dim, config->num_dim * sizeof(nnom_shape_data_t)); + layer->super.config = config; + layer->dim = config->dim; // temporary use the config directly. (not preferable.) + layer->num_dim = config->num_dim; + + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + + return (nnom_layer_t *)layer; +} + +nnom_status_t reshape_build(nnom_layer_t *layer) +{ + nnom_reshape_layer_t *cl = (nnom_reshape_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // create new tensor for output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_set_attr(layer->out->tensor, layer->in->tensor->q_dec, layer->in->tensor->q_offset, cl->dim, cl->num_dim, 8); + + return NN_SUCCESS; +} + +nnom_status_t reshape_run(nnom_layer_t *layer) +{ + return NN_SUCCESS; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_rnn.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_rnn.c new file mode 100644 index 000000000..6fe9662e0 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_rnn.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_rnn.h" + +nnom_status_t rnn_build(nnom_layer_t *layer); +nnom_status_t rnn_run(nnom_layer_t *layer); +nnom_status_t rnn_free(nnom_layer_t* layer); + +// RNN +nnom_layer_t *rnn_s(nnom_rnn_cell_t *cell, const nnom_rnn_config_t* config) +{ + nnom_rnn_layer_t *layer; + nnom_buf_t *comp; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. + size_t mem_size = sizeof(nnom_rnn_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t); + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_rnn_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->super.type = NNOM_RNN; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + comp->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->super.in = io_init(layer, in); + layer->super.out = io_init(layer, out); + layer->super.comp = comp; + // set run and outshape methods + layer->super.run = rnn_run; + layer->super.build = rnn_build; + layer->super.free = rnn_free; + + // rnn parameters. 
+ layer->return_sequence = config->return_sequence; + layer->stateful = config->stateful; + layer->go_backwards = config->go_backwards; + layer->super.config = (void*)config; + layer->cell = cell; + + // set this layer to the cell + layer->cell->layer = (nnom_layer_t *)layer; + + return (nnom_layer_t *)layer; +} + +nnom_status_t rnn_free(nnom_layer_t* layer) +{ + nnom_rnn_layer_t* cl = (nnom_rnn_layer_t*)layer; + // free cell + if(cl->cell->free) + cl->cell->free(cl->cell); + + // free state buffer + nnom_free(cl->state_buf); + + return NN_SUCCESS; +} + +nnom_status_t rnn_build(nnom_layer_t* layer) +{ + nnom_rnn_layer_t *cl = (nnom_rnn_layer_t *)layer; + + // get the tensor from last layer's output + layer->in->tensor = layer->in->hook.io->tensor; + + // timestamp size + cl->timestamp_size = layer->in->tensor->num_dim > 2 ? layer->in->tensor->dim[1] : layer->in->tensor->dim[0]; + + if(cl->return_sequence) + { + // create new tensor for the output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 2, 0); + // shape: timestamp, units + layer->out->tensor->dim[0] = cl->timestamp_size; + layer->out->tensor->dim[1] = cl->cell->units; + } + else + { + // create new tensor for the output + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, 0); + // shape: timestamp, units + layer->out->tensor->dim[0] = cl->cell->units; + } + + // output q format - the output of the available activations are both q0.7. + layer->out->tensor->q_dec[0] = layer->in->tensor->bitwidth==16? 15: 7; + layer->out->tensor->bitwidth = layer->in->tensor->bitwidth; + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // get feature size from input tensor + cl->cell->feature_size = tensor_get_num_channel(layer->in->tensor); // vector (feature) size + + // call cell builder to build the cell + cl->cell->build(cl->cell); + + // get the size of computation buffer? + cl->super.comp->size = cl->cell->comp_buf_size; // size of intermediate buffer required by the cell. + cl->state_buf = nnom_mem(cl->cell->state_size * 2); // allocate state buf for upper/lower state buffer. + if(!cl->state_buf) + return NN_NO_MEMORY; + + // get the computational cost provided by Cell + layer->stat.macc = cl->cell->macc * cl->timestamp_size; + return NN_SUCCESS; +} + +nnom_status_t rnn_run(nnom_layer_t* layer) +{ + nnom_status_t result; + nnom_rnn_layer_t* cl = (nnom_rnn_layer_t*)(layer); + size_t timestamps_size = layer->in->tensor->dim[layer->in->tensor->num_dim-2]; + size_t feature_size = tensor_get_num_channel(layer->in->tensor); // feature size = last dimension. + size_t state_size = cl->cell->state_size; + size_t output_growth; + void* upper_state = (q7_t*)cl->state_buf + state_size; + void* lower_state = (q7_t*)cl->state_buf; + + // reset state buffer if not in stateful + if (!cl->stateful) + nnom_memset(cl->state_buf, 0, state_size * 2); + + // set output data + output_growth = cl->return_sequence ? 
cl->cell->units : 0; + + // run timestamp by timestamp + for (uint32_t round = 0; round < timestamps_size; round++) + { + if(cl->go_backwards) + { + // set input data + cl->cell->in_data = (q7_t*)layer->in->tensor->p_data + feature_size*(timestamps_size - 1 - round); + // set output data + cl->cell->out_data = (q7_t*)layer->out->tensor->p_data + output_growth*(timestamps_size - 1 - round); + } + else + { + // set input data + cl->cell->in_data = (q7_t*)layer->in->tensor->p_data + feature_size*round; + // set output data + cl->cell->out_data = (q7_t*)layer->out->tensor->p_data + output_growth*round; + } + + // switch upper/lower state buffer + if(cl->cell->in_state != lower_state) + { + cl->cell->in_state = lower_state; + cl->cell->out_state = upper_state; + } + else + { + cl->cell->in_state = upper_state; + cl->cell->out_state = lower_state; + } + + // run it + result = cl->cell->run(cl->cell); + if(result != NN_SUCCESS) + return result; + } + + return NN_SUCCESS; +} + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_simple_cell.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_simple_cell.c new file mode 100644 index 000000000..b61acbef3 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_simple_cell.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2020-08-21 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_simple_cell.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +// Simple RNN +// unit = output shape +// type of activation +nnom_rnn_cell_t *simple_cell_s(const nnom_simple_cell_config_t* config) +{ + nnom_simple_cell_t *cell; + cell = nnom_mem(sizeof(nnom_simple_cell_t)); + if (cell == NULL) + return NULL; + // set methods + cell->super.run = simple_cell_run; + cell->super.build = simple_cell_build; + cell->super.free = simple_cell_free; + cell->super.config = (void*) config; + cell->super.units = config->units; + cell->super.type = NNOM_SIMPLE_CELL; + + // set parameters + cell->bias = config->bias; + cell->weights = config->weights; + cell->recurrent_weights = config->recurrent_weights; + cell->act_type = config->act_type; + // q format for intermediate products + cell->q_dec_iw = config->q_dec_iw; + cell->q_dec_hw = config->q_dec_hw; + cell->q_dec_h = config->q_dec_h; + + return (nnom_rnn_cell_t *)cell; +} + +nnom_status_t simple_cell_free(nnom_rnn_cell_t* cell) +{ + return NN_SUCCESS; +} + +// the state buffer and computational buffer shape of the cell +nnom_status_t simple_cell_build(nnom_rnn_cell_t* cell) +{ + nnom_layer_t *layer = cell->layer; + nnom_simple_cell_t *c = (nnom_simple_cell_t *)cell; + nnom_simple_cell_config_t *config = (nnom_simple_cell_config_t *)cell->config; + int q_hw_iw; + + // activation, check if activation is supported + if(config->act_type != ACT_SIGMOID && config->act_type != ACT_TANH) + return NN_ARGUMENT_ERROR; + + // calculate output shift for the 2 calculation. + // hw = the product of hidden x weight, iw = the product of input x weight + // due to the addition of them, they must have same q format. 
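+ // (worked example: with q_dec_iw = 5 and q_dec_hw = 3, q_hw_iw = 3, so both dot products are shifted down to 3 fractional bits before they are summed in simple_cell_run())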
+ q_hw_iw = MIN(c->q_dec_hw, c->q_dec_iw); + + // for the 2 dot in cell: output shift = input_dec + weight_dec - output_dec + c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - q_hw_iw; + c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - q_hw_iw; + + // bias shift = bias_dec - out_dec + c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0]; + + // state size = one timestamp output size. + cell->state_size = cell->units; + + // comp buffer size: not required + cell->comp_buf_size = 0; + + // finally, calculate the MAC for info + cell->macc = cell->feature_size * cell->units // input: feature * state + + cell->units * cell->units; // recurrent, state * output_unit + + return NN_SUCCESS; +} + +// This Simple Cell replicate the Keras's SimpleCell as blow +/* + def call(self, inputs, states, training=None): + prev_output = states[0] if nest.is_sequence(states) else states + + h = K.dot(inputs, self.kernel) + h = K.bias_add(h, self.bias) + + h2 = K.dot(prev_output, self.recurrent_kernel) + output = h + H2 + output = self.activation(output) + + new_state = [output] if nest.is_sequence(states) else output + return output, new_state +*/ + +nnom_status_t simple_cell_run(nnom_rnn_cell_t* cell) +{ + nnom_simple_cell_t* c = (nnom_simple_cell_t*) cell; + int act_int_bit = 7 - MIN(c->q_dec_hw, c->q_dec_iw); + + // in_state x recurrent_weight -> h2 (output buf) + local_dot_q7_opt(cell->in_state, c->recurrent_weights->p_data, cell->units, cell->units, c->oshift_hw, cell->out_data); + // (input x weight) + bias -> h (in_state buf) + local_fully_connected_q7_opt(cell->in_data, c->weights->p_data, + cell->feature_size, cell->units, c->bias_shift, c->oshift_iw, c->bias->p_data, cell->in_state, NULL); + // h + h2 -> (out_state buf) + local_add_q7(cell->in_state, cell->out_data, cell->out_state, 0, cell->units); + + // active(out_state buf) + if(c->act_type == ACT_TANH) + local_tanh_q7(cell->out_state, cell->units, act_int_bit); + //local_hard_tanh_q7(cell->out_state, cell->units, act_int_bit); + else + local_sigmoid_q7(cell->out_state, cell->units, act_int_bit); + //local_hard_sigmoid_q7(cell->out_state, cell->units, act_int_bit); + + // (out_state buf) --copy--> (output buf) + nnom_memcpy(cell->out_data, cell->out_state, cell->units); + + return NN_SUCCESS; +} + + diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_softmax.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_softmax.c new file mode 100644 index 000000000..04b009b35 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_softmax.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_softmax.h" + +#ifdef NNOM_USING_CMSIS_NN +#include "arm_math.h" +#include "arm_nnfunctions.h" +#endif + +nnom_layer_t *softmax_s(const nnom_softmax_config_t * config) +{ + nnom_layer_t * layer = Softmax(); + if(layer) + layer->config = (void*) config; + return layer; +} + +nnom_layer_t *Softmax(void) +{ + nnom_layer_t *layer; + nnom_layer_io_t *in, *out; + + // apply a block memory for all the sub handles. 
+ size_t mem_size = sizeof(nnom_layer_t) + sizeof(nnom_layer_io_t) * 2; + layer = nnom_mem(mem_size); + if (layer == NULL) + return NULL; + + // distribut the memory to sub handles. + in = (void *)((uint8_t*)layer + sizeof(nnom_layer_t)); + out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t)); + + // set type in layer parent + layer->type = NNOM_SOFTMAX; + layer->run = softmax_run; + layer->build = softmax_build; + // set buf state + in->type = NNOM_TENSOR_BUF_TEMP; + out->type = NNOM_TENSOR_BUF_TEMP; + // put in & out on the layer. + layer->in = io_init(layer, in); + layer->out = io_init(layer, out); + + return layer; +} + +nnom_status_t softmax_build(nnom_layer_t *layer) +{ + // get the last layer's output as input shape + layer->in->tensor = layer->in->hook.io->tensor; + // output tensor + layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor)); + tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + // softmax has fixed output dec bit + layer->out->tensor->q_dec[0] = 7; + return NN_SUCCESS; +} + +nnom_status_t softmax_run(nnom_layer_t *layer) +{ + // looks like the new version cause accuracy drop quite a lot. +// #ifdef NNOM_USING_CMSIS_NN +// // temporary fixed for mutiple dimension input. +// arm_softmax_q7(layer->in->tensor->p_data, tensor_size(layer->out->tensor), layer->out->tensor->p_data); +// #else + local_softmax_q7(layer->in->tensor->p_data, tensor_size(layer->out->tensor), layer->out->tensor->p_data); + //#endif + return NN_SUCCESS; +} diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_sumpool.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_sumpool.c new file mode 100644 index 000000000..82de147c4 --- /dev/null +++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_sumpool.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018-2020 + * Jianjia Ma + * majianjia@live.com + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-07-23 Jianjia Ma The first version + */ + +#include +#include +#include + +#include "nnom.h" +#include "nnom_local.h" +#include "nnom_layers.h" +#include "layers/nnom_sumpool.h" + +nnom_layer_t *sumpool_s(const nnom_pool_config_t * config) +{ + nnom_sumpool_layer_t *cl; + if(config->num_dim == 1) + { + cl = (nnom_sumpool_layer_t *)SumPool(kernel(1, config->kernel_size[0]), + stride(1, config->stride_size[0]), + config->padding_type); + } + else + { + cl = (nnom_sumpool_layer_t *)SumPool(kernel(config->kernel_size[0], config->kernel_size[1]), + stride(config->stride_size[0], config->stride_size[1]), + config->padding_type); + } + if(cl) + { + cl->super.config = (void*) config; + cl->output_shift = config->output_shift; // no idea if we need it + } + return (nnom_layer_t *)cl; +} + + +nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type) +{ + nnom_layer_t *layer = MaxPool(k, s, pad_type); + + if (layer != NULL) + { + layer->type = NNOM_SUMPOOL; + layer->run = sumpool_run; + layer->build = sumpool_build; + } + return (nnom_layer_t *)layer; +} + + +nnom_status_t sumpool_build(nnom_layer_t *layer) +{ + // avg pooling share the same output shape, stride, padding setting. + maxpool_build(layer); + + // however, avg pooling require a computational buffer. 
+    layer->comp->size = 4 * tensor_size(layer->out->tensor);
+
+    return NN_SUCCESS;
+}
+
+
+// sum pooling dynamically changes the Q format; in the current version it must be used as the last layer before softmax
+nnom_status_t sumpool_run(nnom_layer_t *layer)
+{
+    nnom_sumpool_layer_t *cl = (nnom_sumpool_layer_t *)(layer);
+    uint16_t out_x, out_y;
+
+    // if global pooling
+    if(layer->out->tensor->num_dim == 1)
+    {
+        out_x = 1; out_y = 1;
+    }
+    else // normal pooling.
+    {
+        out_x = layer->out->tensor->dim[1]; // W
+        out_y = layer->out->tensor->dim[0]; // H
+    }
+
+#ifdef NNOM_USING_CHW
+    local_sumpool_q7_CHW(
+#else
+    local_sumpool_q7_HWC(
+#endif
+            layer->in->tensor->p_data,
+            layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
+            cl->kernel.w, cl->kernel.h,
+            cl->pad.w, cl->pad.h,
+            cl->stride.w, cl->stride.h,
+            out_x, out_y,
+            layer->comp->mem->blk,
+            layer->out->tensor->p_data);
+
+    return NN_SUCCESS;
+}
diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_upsample.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_upsample.c
new file mode 100644
index 000000000..96472a5ab
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_upsample.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-07-23     Jianjia Ma   The first version
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "nnom.h"
+#include "nnom_local.h"
+#include "nnom_layers.h"
+#include "layers/nnom_upsample.h"
+
+nnom_layer_t *upsample_s(const nnom_upsample_config_t *config)
+{
+    nnom_layer_t *layer = UpSample(kernel(config->kernel[0], config->kernel[1]));
+    if(layer)
+        layer->config = (void*) config;
+    return layer;
+}
+
+// up sampling layer
+nnom_layer_t *UpSample(nnom_3d_shape_t kernel)
+{
+    nnom_upsample_layer_t *layer;
+    nnom_layer_io_t *in, *out;
+
+    // allocate a block of memory for all the sub handles.
+    size_t mem_size = sizeof(nnom_upsample_layer_t) + sizeof(nnom_layer_io_t) * 2;
+    layer = nnom_mem(mem_size);
+    if (layer == NULL)
+        return NULL;
+
+    // distribute the memory to sub handles.
+    in = (void *)((uint8_t*)layer + sizeof(nnom_upsample_layer_t));
+    out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
+
+    // set type in layer parent
+    layer->super.type = NNOM_UPSAMPLE;
+    // set buf state
+    in->type = NNOM_TENSOR_BUF_TEMP;
+    out->type = NNOM_TENSOR_BUF_TEMP;
+    // put in & out on the layer.
+    layer->super.in = io_init(layer, in);
+    layer->super.out = io_init(layer, out);
+    // set run and outshape methods
+    layer->super.run = upsample_run;
+    layer->super.build = upsample_build;
+
+    // set parameters
+    layer->kernel = kernel;
+
+    return (nnom_layer_t*)layer;
+}
+
+nnom_status_t upsample_build(nnom_layer_t *layer)
+{
+    nnom_upsample_layer_t* cl = (nnom_upsample_layer_t*)layer;
+
+    // get the last layer's output as input shape
+    layer->in->tensor = layer->in->hook.io->tensor;
+    // output tensor
+    // 1. allocate a new tensor for output
+    // 2. set the same dim, qfmt to the new tensor.
+    layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
+    tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
+
+    // see if the activation will change the q format
+    if(layer->actail)
+        layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
+
+    // enlarge w and h, c stays the same.
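+    // Illustrative example: a 16 x 16 x 8 HWC input with kernel (2, 2) becomes
+    // 32 x 32 x 8; each input pixel is replicated kernel.h x kernel.w times
+    // (nearest-neighbour upsampling).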
+    layer->out->tensor->dim[0] = layer->in->tensor->dim[0] * cl->kernel.h;
+    layer->out->tensor->dim[1] = layer->in->tensor->dim[1] * cl->kernel.w;
+
+    return NN_SUCCESS;
+}
+
+// up sampling, or so-called unpooling
+nnom_status_t upsample_run(nnom_layer_t *layer)
+{
+    nnom_upsample_layer_t *cl = (nnom_upsample_layer_t *)(layer);
+
+#ifdef NNOM_USING_CHW
+    local_up_sampling_q7_CHW(
+#else
+    local_up_sampling_q7_HWC(
+#endif
+            layer->in->tensor->p_data,
+            layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
+            cl->kernel.w, cl->kernel.h,
+            layer->out->tensor->dim[1], layer->out->tensor->dim[0],
+            NULL,
+            layer->out->tensor->p_data);
+    return NN_SUCCESS;
+}
diff --git a/APP_Framework/Framework/knowing/nnom/src/layers/nnom_zero_padding.c b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_zero_padding.c
new file mode 100644
index 000000000..2352e614e
--- /dev/null
+++ b/APP_Framework/Framework/knowing/nnom/src/layers/nnom_zero_padding.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2020
+ * Jianjia Ma
+ * majianjia@live.com
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2019-07-23     Jianjia Ma   The first version
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "nnom.h"
+#include "nnom_local.h"
+#include "nnom_layers.h"
+#include "layers/nnom_zero_padding.h"
+
+nnom_layer_t * zeropadding_s(const nnom_zero_padding_config_t* config)
+{
+    nnom_layer_t *layer = ZeroPadding(config->pad);
+    if(layer)
+        layer->config = (void*) config;
+    return (nnom_layer_t*)layer;
+}
+
+// Zero padding layer
+nnom_layer_t *ZeroPadding(nnom_border_t pad)
+{
+    nnom_zero_padding_layer_t *layer;
+    nnom_layer_io_t *in, *out;
+
+    // allocate a block of memory for all the sub handles.
+    size_t mem_size = sizeof(nnom_zero_padding_layer_t) + sizeof(nnom_layer_io_t) * 2;
+    layer = nnom_mem(mem_size);
+    if (layer == NULL)
+        return NULL;
+
+    // distribute the memory to sub handles.
+    in = (void *)((uint8_t*)layer + sizeof(nnom_zero_padding_layer_t));
+    out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
+
+    // set type in layer parent
+    layer->super.type = NNOM_ZERO_PADDING;
+    // set buf state
+    in->type = NNOM_TENSOR_BUF_TEMP;
+    out->type = NNOM_TENSOR_BUF_TEMP;
+    // put in & out on the layer.
+    layer->super.in = io_init(layer, in);
+    layer->super.out = io_init(layer, out);
+    // set run and outshape methods
+    layer->super.run = zero_padding_run;
+    layer->super.build = zero_padding_build;
+
+    // set parameters
+    layer->pad = pad;
+
+    return (nnom_layer_t*)layer;
+}
+
+nnom_status_t zero_padding_build(nnom_layer_t* layer)
+{
+    nnom_zero_padding_layer_t *cl = (nnom_zero_padding_layer_t *)layer;
+
+    // get the tensor from last layer's output
+    layer->in->tensor = layer->in->hook.io->tensor;
+
+    // create new tensor for output
+    layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
+    // copy then change later.
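+    // tensor_cpy_attr() copies the input tensor's attributes (q format etc.);
+    // only the spatial dims are then overwritten below to add the requested borders.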
+ tensor_cpy_attr(layer->out->tensor, layer->in->tensor); + + // see if the activation will change the q format + if(layer->actail) + layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]); + + // output shape + layer->out->tensor->dim[1] = layer->in->tensor->dim[1] + cl->pad.left + cl->pad.right; + layer->out->tensor->dim[0] = layer->in->tensor->dim[0] + cl->pad.top + cl->pad.bottom; + layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; + return NN_SUCCESS; +} + +nnom_status_t zero_padding_run(nnom_layer_t * layer) +{ + nnom_zero_padding_layer_t *cl = (nnom_zero_padding_layer_t*)layer; + +#ifdef NNOM_USING_CHW + local_zero_padding_CHW_q7( +#else + local_zero_padding_HWC_q7( +#endif + layer->in->tensor->p_data, + layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2], + cl->pad.top, + cl->pad.bottom, + cl->pad.left, + cl->pad.right, + layer->out->tensor->p_data, + layer->out->tensor->dim[1], layer->out->tensor->dim[0]); + + return NN_SUCCESS; +} +
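+
+/* Usage sketch (illustrative only):
+ *
+ *   nnom_border_t pad = {.top = 1, .bottom = 1, .left = 2, .right = 2};
+ *   nnom_layer_t *pad_layer = ZeroPadding(pad);
+ *
+ * For a 24 x 24 x 3 HWC input, zero_padding_build() above gives a 26 x 28 x 3
+ * output (height + top + bottom, width + left + right), and zero_padding_run()
+ * writes the input into the interior of the zero-filled output buffer.
+ */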