Merge branch 'prepare_for_master' of https://git.trustie.net/xuos/xiuos into develop

Wang_Weigen 2022-02-17 11:10:15 +08:00
commit 691611de34
145 changed files with 4166 additions and 30446 deletions

View File

@@ -1,4 +1,4 @@
-unsigned char mnist_model[] = {
+const unsigned char mnist_model[] = {
0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00,
0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x00, 0x00,
0x18, 0x00, 0x1c, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
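The only change in this hunk is the added const qualifier. On an MCU build this lets the toolchain place the model array in .rodata (flash, read in place) instead of copying it into RAM as initialized data. A minimal C sketch of the effect (the accessor function is hypothetical; only the array name comes from the diff):

extern const unsigned char mnist_model[];

/* const data stays in flash; without const the whole array would be
 * duplicated into RAM by the startup code. */
static unsigned char model_byte(unsigned int i)
{
    return mnist_model[i];
}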

View File

@@ -1,13 +1,13 @@
-menuconfig SUPPORT_KNOWING_FRAMEWORK
-bool "support knowing framework"
-default n
-select TRANSFORM_LAYER_ATTRIUBUTE
-if SUPPORT_KNOWING_FRAMEWORK
-source "$APP_DIR/Framework/knowing/tensorflow-lite/Kconfig"
-source "$APP_DIR/Framework/knowing/filter/Kconfig"
-source "$APP_DIR/Framework/knowing/ota/Kconfig"
-source "$APP_DIR/Framework/knowing/image_processing/Kconfig"
-source "$APP_DIR/Framework/knowing/cmsis_5/Kconfig"
-source "$APP_DIR/Framework/knowing/kpu/Kconfig"
-endif
+menuconfig SUPPORT_KNOWING_FRAMEWORK
+bool "support knowing framework"
+default n
+select TRANSFORM_LAYER_ATTRIUBUTE
+if SUPPORT_KNOWING_FRAMEWORK
+source "$APP_DIR/Framework/knowing/tensorflow-lite/Kconfig"
+source "$APP_DIR/Framework/knowing/filter/Kconfig"
+source "$APP_DIR/Framework/knowing/ota/Kconfig"
+source "$APP_DIR/Framework/knowing/image_processing/Kconfig"
+source "$APP_DIR/Framework/knowing/cmsis_5/Kconfig"
+source "$APP_DIR/Framework/knowing/kpu/Kconfig"
+endif
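The removed and added halves of this Kconfig hunk read identically, so the change is most likely whitespace-only (for example, indentation normalization). For context, application code typically consumes such a symbol through a generated macro; a hedged C sketch follows (the macro spelling is an assumption, since build systems differ in how they export Kconfig symbols):

/* compiled only when the menuconfig option above is enabled; the exact
 * macro name (with or without a CONFIG_ prefix) depends on the build
 * system and is assumed here. */
#ifdef SUPPORT_KNOWING_FRAMEWORK
void knowing_framework_init(void);   /* hypothetical entry point */
#endif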

View File

@@ -27,7 +27,8 @@
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include <inttypes.h>
#include "../../../Core/Include/cmsis_gcc.h"
#include "arm_nnsupportfunctions.h"
/**
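This hunk, repeated in the next two files, redirects the CMSIS include to the in-tree cmsis_5 copy and adds <inttypes.h>, which provides the portable printf macros for fixed-width integers. A self-contained C example of what <inttypes.h> is for:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void print_count(uint32_t n)
{
    /* PRIu32 expands to the correct conversion specifier for uint32_t
     * on any libc, avoiding %u-vs-%lu portability bugs across MCUs. */
    printf("count = %" PRIu32 "\n", n);
}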

View File

@@ -27,7 +27,8 @@
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include <inttypes.h>
#include "../../../Core/Include/cmsis_gcc.h"
#include "arm_nnsupportfunctions.h"
/**

View File

@@ -28,7 +28,8 @@
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include <inttypes.h>
#include "../../../Core/Include/cmsis_gcc.h"
#include "arm_nnsupportfunctions.h"
/**

View File

@@ -46,8 +46,8 @@ void k210_detect(char *json_file_path)
printf("open ov2640 fail !!");
return;
}
-_ioctl_set_dvp_reso set_dvp_reso = {detect_params.sensor_output_size[1], detect_params.sensor_output_size[0]};
-ioctl(g_fd, IOCTRL_CAMERA_SET_DVP_RESO, &set_dvp_reso);
+_ioctl_set_reso set_dvp_reso = {detect_params.sensor_output_size[1], detect_params.sensor_output_size[0]};
+ioctl(g_fd, IOCTRL_CAMERA_OUT_SIZE_RESO, &set_dvp_reso);
showbuffer =
(unsigned char *)rt_malloc_align(detect_params.sensor_output_size[0] * detect_params.sensor_output_size[1] * 2, 64);
if (NULL == showbuffer) {
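The rename suggests the camera driver generalized its DVP-specific resolution command into a generic output-size command. A hedged sketch of the new call pattern (the struct layout and field names are assumptions inferred from the diff, which passes sensor_output_size[1] before sensor_output_size[0]; ioctl() and the command macro come from the platform's camera driver headers):

/* hypothetical field layout of _ioctl_set_reso */
typedef struct {
    int width;
    int height;
} _ioctl_set_reso;

static void set_camera_output_size(int fd, int w, int h)
{
    _ioctl_set_reso reso = {w, h};
    ioctl(fd, IOCTRL_CAMERA_OUT_SIZE_RESO, &reso);  /* renamed command */
}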

View File

@@ -102,75 +102,76 @@ tensorflow-lite-for-mcu/source/tensorflow/lite/micro/kernels/cmsis-nn/svdf.cc
''')
cmsis = Split('''
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q15.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q7.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ActivationFunctions/arm_relu6_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ReshapeFunctions/arm_reshape_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q15.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q7.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_u8.c
-tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_with_batch_q7.c
+../cmsis_5/NN/Source/ActivationFunctions/arm_nn_activations_q15.c
+../cmsis_5/NN/Source/ActivationFunctions/arm_nn_activations_q7.c
+../cmsis_5/NN/Source/ActivationFunctions/arm_relu6_s8.c
+../cmsis_5/NN/Source/ActivationFunctions/arm_relu_q15.c
+../cmsis_5/NN/Source/ActivationFunctions/arm_relu_q7.c
+../cmsis_5/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
+../cmsis_5/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
+../cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c
+../cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c
+../cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c
+../cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
+../cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
+../cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
+../cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
+../cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c
+../cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c
+../cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
+../cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
+../cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_add_q7.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nntables.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c
+../cmsis_5/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c
+../cmsis_5/NN/Source/PoolingFunctions/arm_avgpool_s8.c
+../cmsis_5/NN/Source/PoolingFunctions/arm_max_pool_s8.c
+../cmsis_5/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
+../cmsis_5/NN/Source/ReshapeFunctions/arm_reshape_s8.c
+../cmsis_5/NN/Source/SVDFunctions/arm_svdf_s8.c
+../cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_q15.c
+../cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_q7.c
+../cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_s8.c
+../cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_u8.c
+../cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_with_batch_q7.c
''')
CPPPATH = [
@@ -179,10 +180,10 @@ CPPPATH = [
os.path.join(cwd, 'tensorflow-lite-for-mcu/source/third_party/gemmlowp'),
os.path.join(cwd, 'tensorflow-lite-for-mcu/source/third_party/flatbuffers/include'),
os.path.join(cwd, 'tensorflow-lite-for-mcu/source/third_party/ruy'),
-os.path.join(cwd, 'tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis'),
-os.path.join(cwd, 'tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/DSP/Include'),
-os.path.join(cwd, 'tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Include'),
-os.path.join(cwd, 'tensorflow-lite-for-mcu/source/tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/Core/Include'),
+os.path.join(cwd, '../cmsis_5/'),
+os.path.join(cwd, '../cmsis_5/DSP/Include'),
+os.path.join(cwd, '../cmsis_5/NN/Include'),
+os.path.join(cwd, '../cmsis_5/Core/Include'),
]
# embedded C++ std don't have some math functions, use global math functions instead
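Together with the source-list change above, these CPPPATH entries repoint header resolution from the TFLite-downloaded CMSIS snapshot to the in-tree ../cmsis_5 copy. A hedged C illustration of what the new search paths enable (assuming ../cmsis_5/NN/Include ends up on the compiler's -I list, as the entries above suggest):

/* resolved through the '../cmsis_5/NN/Include' CPPPATH entry rather
 * than the old tensorflow-lite-for-mcu download tree: */
#include "arm_nnfunctions.h"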

View File

@@ -76,6 +76,7 @@
#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
#endif
+/*
#ifndef TF_LITE_STATIC_MEMORY
// TODO(b/162019032): Consider removing these type-aliases.
using int8 = std::int8_t;
@@ -85,6 +86,7 @@ using uint16 = std::uint16_t;
using int32 = std::int32_t;
using uint32 = std::uint32_t;
#endif // !defined(TF_LITE_STATIC_MEMORY)
+*/
// TFLITE_DEPRECATED()
//
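Commenting out this block removes the tflite-namespace int8/.../uint32 aliases, so code that referenced them has to use the standard fixed-width type names directly. A short C illustration:

#include <stdint.h>

static int8_t quantized_value;   /* was: int8   */
static uint32_t element_index;   /* was: uint32 */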

View File

@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/add.h"
-#include "CMSIS/NN/Include/arm_nnfunctions.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nnfunctions.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"

View File

@@ -15,8 +15,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/conv.h"
-#include "CMSIS/NN/Include/arm_nn_types.h"
-#include "CMSIS/NN/Include/arm_nnfunctions.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nn_types.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nnfunctions.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"

View File

@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
-#include "CMSIS/NN/Include/arm_nnfunctions.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nnfunctions.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"

View File

@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
-#include "CMSIS/NN/Include/arm_nnfunctions.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nnfunctions.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"

View File

@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/mul.h"
-#include "CMSIS/NN/Include/arm_nnfunctions.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nnfunctions.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"

View File

@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/pooling.h"
-#include "CMSIS/NN/Include/arm_nnfunctions.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nnfunctions.h"
#include "flatbuffers/base.h" // from @flatbuffers
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"

View File

@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/softmax.h"
-#include "CMSIS/NN/Include/arm_nnfunctions.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nnfunctions.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"

View File

@@ -16,8 +16,8 @@ limitations under the License.
#include <cmath>
#include <cstdint>
-#include "CMSIS/NN/Include/arm_nn_types.h"
-#include "CMSIS/NN/Include/arm_nnfunctions.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nn_types.h"
+#include "../../../../../../../../cmsis_5/NN/Include/arm_nnfunctions.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"

View File

@@ -1,283 +0,0 @@
/**************************************************************************//**
* @file cmsis_compiler.h
* @brief CMSIS compiler generic header file
* @version V5.1.0
* @date 09. October 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __CMSIS_COMPILER_H
#define __CMSIS_COMPILER_H
#include <stdint.h>
/*
* Arm Compiler 4/5
*/
#if defined ( __CC_ARM )
#include "cmsis_armcc.h"
/*
* Arm Compiler 6.6 LTM (armclang)
*/
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) && (__ARMCC_VERSION < 6100100)
#include "cmsis_armclang_ltm.h"
/*
* Arm Compiler above 6.10.1 (armclang)
*/
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100)
#include "cmsis_armclang.h"
/*
* GNU Compiler
*/
#elif defined ( __GNUC__ )
#include "cmsis_gcc.h"
/*
* IAR Compiler
*/
#elif defined ( __ICCARM__ )
#include <cmsis_iccarm.h>
/*
* TI Arm Compiler
*/
#elif defined ( __TI_ARM__ )
#include <cmsis_ccs.h>
#ifndef __ASM
#define __ASM __asm
#endif
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __STATIC_INLINE
#endif
#ifndef __NO_RETURN
#define __NO_RETURN __attribute__((noreturn))
#endif
#ifndef __USED
#define __USED __attribute__((used))
#endif
#ifndef __WEAK
#define __WEAK __attribute__((weak))
#endif
#ifndef __PACKED
#define __PACKED __attribute__((packed))
#endif
#ifndef __PACKED_STRUCT
#define __PACKED_STRUCT struct __attribute__((packed))
#endif
#ifndef __PACKED_UNION
#define __PACKED_UNION union __attribute__((packed))
#endif
#ifndef __UNALIGNED_UINT32 /* deprecated */
struct __attribute__((packed)) T_UINT32 { uint32_t v; };
#define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
#endif
#ifndef __UNALIGNED_UINT16_WRITE
__PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
#define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void*)(addr))->v) = (val))
#endif
#ifndef __UNALIGNED_UINT16_READ
__PACKED_STRUCT T_UINT16_READ { uint16_t v; };
#define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
#endif
#ifndef __UNALIGNED_UINT32_WRITE
__PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
#define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
#endif
#ifndef __UNALIGNED_UINT32_READ
__PACKED_STRUCT T_UINT32_READ { uint32_t v; };
#define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
#endif
#ifndef __ALIGNED
#define __ALIGNED(x) __attribute__((aligned(x)))
#endif
#ifndef __RESTRICT
#define __RESTRICT __restrict
#endif
#ifndef __COMPILER_BARRIER
#warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
#define __COMPILER_BARRIER() (void)0
#endif
/*
* TASKING Compiler
*/
#elif defined ( __TASKING__ )
/*
* The CMSIS functions have been implemented as intrinsics in the compiler.
* Please use "carm -?i" to get an up to date list of all intrinsics,
* Including the CMSIS ones.
*/
#ifndef __ASM
#define __ASM __asm
#endif
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __STATIC_INLINE
#endif
#ifndef __NO_RETURN
#define __NO_RETURN __attribute__((noreturn))
#endif
#ifndef __USED
#define __USED __attribute__((used))
#endif
#ifndef __WEAK
#define __WEAK __attribute__((weak))
#endif
#ifndef __PACKED
#define __PACKED __packed__
#endif
#ifndef __PACKED_STRUCT
#define __PACKED_STRUCT struct __packed__
#endif
#ifndef __PACKED_UNION
#define __PACKED_UNION union __packed__
#endif
#ifndef __UNALIGNED_UINT32 /* deprecated */
struct __packed__ T_UINT32 { uint32_t v; };
#define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
#endif
#ifndef __UNALIGNED_UINT16_WRITE
__PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
#define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
#endif
#ifndef __UNALIGNED_UINT16_READ
__PACKED_STRUCT T_UINT16_READ { uint16_t v; };
#define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
#endif
#ifndef __UNALIGNED_UINT32_WRITE
__PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
#define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
#endif
#ifndef __UNALIGNED_UINT32_READ
__PACKED_STRUCT T_UINT32_READ { uint32_t v; };
#define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
#endif
#ifndef __ALIGNED
#define __ALIGNED(x) __align(x)
#endif
#ifndef __RESTRICT
#warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
#define __RESTRICT
#endif
#ifndef __COMPILER_BARRIER
#warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
#define __COMPILER_BARRIER() (void)0
#endif
/*
* COSMIC Compiler
*/
#elif defined ( __CSMC__ )
#include <cmsis_csm.h>
#ifndef __ASM
#define __ASM _asm
#endif
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __STATIC_INLINE
#endif
#ifndef __NO_RETURN
// NO RETURN is automatically detected hence no warning here
#define __NO_RETURN
#endif
#ifndef __USED
#warning No compiler specific solution for __USED. __USED is ignored.
#define __USED
#endif
#ifndef __WEAK
#define __WEAK __weak
#endif
#ifndef __PACKED
#define __PACKED @packed
#endif
#ifndef __PACKED_STRUCT
#define __PACKED_STRUCT @packed struct
#endif
#ifndef __PACKED_UNION
#define __PACKED_UNION @packed union
#endif
#ifndef __UNALIGNED_UINT32 /* deprecated */
@packed struct T_UINT32 { uint32_t v; };
#define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
#endif
#ifndef __UNALIGNED_UINT16_WRITE
__PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
#define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
#endif
#ifndef __UNALIGNED_UINT16_READ
__PACKED_STRUCT T_UINT16_READ { uint16_t v; };
#define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
#endif
#ifndef __UNALIGNED_UINT32_WRITE
__PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
#define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
#endif
#ifndef __UNALIGNED_UINT32_READ
__PACKED_STRUCT T_UINT32_READ { uint32_t v; };
#define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
#endif
#ifndef __ALIGNED
#warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored.
#define __ALIGNED(x)
#endif
#ifndef __RESTRICT
#warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
#define __RESTRICT
#endif
#ifndef __COMPILER_BARRIER
#warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
#define __COMPILER_BARRIER() (void)0
#endif
#else
#error Unknown compiler.
#endif
#endif /* __CMSIS_COMPILER_H */
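The deleted file above is CMSIS's compiler abstraction layer (presumably superseded by the in-tree ../cmsis_5 copy this commit switches to): it maps one set of portable macros onto each toolchain's attribute syntax. A hedged C example of typical use; with GCC these expand to the __attribute__ forms shown above:

#include "cmsis_compiler.h"   /* also pulls in <stdint.h> */

__PACKED_STRUCT frame {       /* struct __attribute__((packed)) on GCC */
    uint16_t len;
    uint8_t payload[32];
};

__WEAK void board_init(void)  /* weak symbol: a board port can override it */
{
}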

View File

@@ -1,529 +0,0 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_common_tables.h
* Description: Extern declaration for common tables
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_COMMON_TABLES_H
#define _ARM_COMMON_TABLES_H
#include "arm_math_types.h"
#include "dsp/fast_math_functions.h"
#ifdef __cplusplus
extern "C"
{
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
/* Double Precision Float CFFT twiddles */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
extern const uint16_t armBitRevTable[1024];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_16)
extern const uint64_t twiddleCoefF64_16[32];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_32)
extern const uint64_t twiddleCoefF64_32[64];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_64)
extern const uint64_t twiddleCoefF64_64[128];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_128)
extern const uint64_t twiddleCoefF64_128[256];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_256)
extern const uint64_t twiddleCoefF64_256[512];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_512)
extern const uint64_t twiddleCoefF64_512[1024];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_1024)
extern const uint64_t twiddleCoefF64_1024[2048];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_2048)
extern const uint64_t twiddleCoefF64_2048[4096];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_4096)
extern const uint64_t twiddleCoefF64_4096[8192];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_16)
extern const float32_t twiddleCoef_16[32];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_32)
extern const float32_t twiddleCoef_32[64];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_64)
extern const float32_t twiddleCoef_64[128];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_128)
extern const float32_t twiddleCoef_128[256];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_256)
extern const float32_t twiddleCoef_256[512];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_512)
extern const float32_t twiddleCoef_512[1024];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_1024)
extern const float32_t twiddleCoef_1024[2048];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048)
extern const float32_t twiddleCoef_2048[4096];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
extern const float32_t twiddleCoef_4096[8192];
#define twiddleCoef twiddleCoef_4096
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
/* Q31 */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16)
extern const q31_t twiddleCoef_16_q31[24];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_32)
extern const q31_t twiddleCoef_32_q31[48];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_64)
extern const q31_t twiddleCoef_64_q31[96];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_128)
extern const q31_t twiddleCoef_128_q31[192];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_256)
extern const q31_t twiddleCoef_256_q31[384];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_512)
extern const q31_t twiddleCoef_512_q31[768];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_1024)
extern const q31_t twiddleCoef_1024_q31[1536];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_2048)
extern const q31_t twiddleCoef_2048_q31[3072];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)
extern const q31_t twiddleCoef_4096_q31[6144];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_16)
extern const q15_t twiddleCoef_16_q15[24];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_32)
extern const q15_t twiddleCoef_32_q15[48];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_64)
extern const q15_t twiddleCoef_64_q15[96];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_128)
extern const q15_t twiddleCoef_128_q15[192];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_256)
extern const q15_t twiddleCoef_256_q15[384];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_512)
extern const q15_t twiddleCoef_512_q15[768];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_1024)
extern const q15_t twiddleCoef_1024_q15[1536];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_2048)
extern const q15_t twiddleCoef_2048_q15[3072];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)
extern const q15_t twiddleCoef_4096_q15[6144];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
/* Double Precision Float RFFT twiddles */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32)
extern const uint64_t twiddleCoefF64_rfft_32[32];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64)
extern const uint64_t twiddleCoefF64_rfft_64[64];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128)
extern const uint64_t twiddleCoefF64_rfft_128[128];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256)
extern const uint64_t twiddleCoefF64_rfft_256[256];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512)
extern const uint64_t twiddleCoefF64_rfft_512[512];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024)
extern const uint64_t twiddleCoefF64_rfft_1024[1024];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048)
extern const uint64_t twiddleCoefF64_rfft_2048[2048];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096)
extern const uint64_t twiddleCoefF64_rfft_4096[4096];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32)
extern const float32_t twiddleCoef_rfft_32[32];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64)
extern const float32_t twiddleCoef_rfft_64[64];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128)
extern const float32_t twiddleCoef_rfft_128[128];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256)
extern const float32_t twiddleCoef_rfft_256[256];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512)
extern const float32_t twiddleCoef_rfft_512[512];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024)
extern const float32_t twiddleCoef_rfft_1024[1024];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048)
extern const float32_t twiddleCoef_rfft_2048[2048];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096)
extern const float32_t twiddleCoef_rfft_4096[4096];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
/* Double precision floating-point bit reversal tables */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_16)
#define ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH ((uint16_t)12)
extern const uint16_t armBitRevIndexTableF64_16[ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_32)
#define ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH ((uint16_t)24)
extern const uint16_t armBitRevIndexTableF64_32[ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_64)
#define ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH ((uint16_t)56)
extern const uint16_t armBitRevIndexTableF64_64[ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_128)
#define ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH ((uint16_t)112)
extern const uint16_t armBitRevIndexTableF64_128[ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_256)
#define ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH ((uint16_t)240)
extern const uint16_t armBitRevIndexTableF64_256[ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_512)
#define ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH ((uint16_t)480)
extern const uint16_t armBitRevIndexTableF64_512[ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_1024)
#define ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH ((uint16_t)992)
extern const uint16_t armBitRevIndexTableF64_1024[ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_2048)
#define ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH ((uint16_t)1984)
extern const uint16_t armBitRevIndexTableF64_2048[ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_4096)
#define ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH ((uint16_t)4032)
extern const uint16_t armBitRevIndexTableF64_4096[ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
/* floating-point bit reversal tables */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_16)
#define ARMBITREVINDEXTABLE_16_TABLE_LENGTH ((uint16_t)20)
extern const uint16_t armBitRevIndexTable16[ARMBITREVINDEXTABLE_16_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_32)
#define ARMBITREVINDEXTABLE_32_TABLE_LENGTH ((uint16_t)48)
extern const uint16_t armBitRevIndexTable32[ARMBITREVINDEXTABLE_32_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_64)
#define ARMBITREVINDEXTABLE_64_TABLE_LENGTH ((uint16_t)56)
extern const uint16_t armBitRevIndexTable64[ARMBITREVINDEXTABLE_64_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_128)
#define ARMBITREVINDEXTABLE_128_TABLE_LENGTH ((uint16_t)208)
extern const uint16_t armBitRevIndexTable128[ARMBITREVINDEXTABLE_128_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_256)
#define ARMBITREVINDEXTABLE_256_TABLE_LENGTH ((uint16_t)440)
extern const uint16_t armBitRevIndexTable256[ARMBITREVINDEXTABLE_256_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_512)
#define ARMBITREVINDEXTABLE_512_TABLE_LENGTH ((uint16_t)448)
extern const uint16_t armBitRevIndexTable512[ARMBITREVINDEXTABLE_512_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_1024)
#define ARMBITREVINDEXTABLE_1024_TABLE_LENGTH ((uint16_t)1800)
extern const uint16_t armBitRevIndexTable1024[ARMBITREVINDEXTABLE_1024_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_2048)
#define ARMBITREVINDEXTABLE_2048_TABLE_LENGTH ((uint16_t)3808)
extern const uint16_t armBitRevIndexTable2048[ARMBITREVINDEXTABLE_2048_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_4096)
#define ARMBITREVINDEXTABLE_4096_TABLE_LENGTH ((uint16_t)4032)
extern const uint16_t armBitRevIndexTable4096[ARMBITREVINDEXTABLE_4096_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
/* fixed-point bit reversal tables */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16)
#define ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH ((uint16_t)12)
extern const uint16_t armBitRevIndexTable_fixed_16[ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_32)
#define ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH ((uint16_t)24)
extern const uint16_t armBitRevIndexTable_fixed_32[ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64)
#define ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH ((uint16_t)56)
extern const uint16_t armBitRevIndexTable_fixed_64[ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_128)
#define ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH ((uint16_t)112)
extern const uint16_t armBitRevIndexTable_fixed_128[ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256)
#define ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH ((uint16_t)240)
extern const uint16_t armBitRevIndexTable_fixed_256[ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_512)
#define ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH ((uint16_t)480)
extern const uint16_t armBitRevIndexTable_fixed_512[ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024)
#define ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH ((uint16_t)992)
extern const uint16_t armBitRevIndexTable_fixed_1024[ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_2048)
#define ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH ((uint16_t)1984)
extern const uint16_t armBitRevIndexTable_fixed_2048[ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096)
#define ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH ((uint16_t)4032)
extern const uint16_t armBitRevIndexTable_fixed_4096[ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_F32)
extern const float32_t realCoefA[8192];
extern const float32_t realCoefB[8192];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q31)
extern const q31_t realCoefAQ31[8192];
extern const q31_t realCoefBQ31[8192];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q15)
extern const q15_t realCoefAQ15[8192];
extern const q15_t realCoefBQ15[8192];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_128)
extern const float32_t Weights_128[256];
extern const float32_t cos_factors_128[128];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_512)
extern const float32_t Weights_512[1024];
extern const float32_t cos_factors_512[512];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_2048)
extern const float32_t Weights_2048[4096];
extern const float32_t cos_factors_2048[2048];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_8192)
extern const float32_t Weights_8192[16384];
extern const float32_t cos_factors_8192[8192];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_128)
extern const q15_t WeightsQ15_128[256];
extern const q15_t cos_factorsQ15_128[128];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_512)
extern const q15_t WeightsQ15_512[1024];
extern const q15_t cos_factorsQ15_512[512];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_2048)
extern const q15_t WeightsQ15_2048[4096];
extern const q15_t cos_factorsQ15_2048[2048];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_8192)
extern const q15_t WeightsQ15_8192[16384];
extern const q15_t cos_factorsQ15_8192[8192];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_128)
extern const q31_t WeightsQ31_128[256];
extern const q31_t cos_factorsQ31_128[128];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_512)
extern const q31_t WeightsQ31_512[1024];
extern const q31_t cos_factorsQ31_512[512];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_2048)
extern const q31_t WeightsQ31_2048[4096];
extern const q31_t cos_factorsQ31_2048[2048];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_8192)
extern const q31_t WeightsQ31_8192[16384];
extern const q31_t cos_factorsQ31_8192[8192];
#endif
#endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FAST_ALLOW_TABLES)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_RECIP_Q15)
extern const q15_t armRecipTableQ15[64];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_RECIP_Q31)
extern const q31_t armRecipTableQ31[64];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
/* Tables for Fast Math Sine and Cosine */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_SIN_F32)
extern const float32_t sinTable_f32[FAST_MATH_TABLE_SIZE + 1];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_SIN_Q31)
extern const q31_t sinTable_q31[FAST_MATH_TABLE_SIZE + 1];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_SIN_Q15)
extern const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
#if defined(ARM_MATH_MVEI)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
extern const q31_t sqrtTable_Q31[256];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
#endif
#if defined(ARM_MATH_MVEI)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
extern const q15_t sqrtTable_Q15[256];
#endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
#endif
#endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FAST_ALLOW_TABLES) */
#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
extern const float32_t exp_tab[8];
extern const float32_t __logf_lut_f32[8];
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
extern const unsigned char hwLUT[256];
#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
#ifdef __cplusplus
}
#endif
#endif /* ARM_COMMON_TABLES_H */

View File

@ -1,748 +0,0 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_helium_utils.h
* Description: Utility functions for Helium development
*
* $Date: 09. September 2019
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_UTILS_HELIUM_H_
#define _ARM_UTILS_HELIUM_H_
#ifdef __cplusplus
extern "C"
{
#endif
/***************************************
Definitions available for MVEF and MVEI
***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
#define INACTIVELANE 0 /* inactive lane content */
#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) */
/***************************************
Definitions available for MVEF only
***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)
__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
{
float32_t acc;
acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) +
vgetq_lane(in, 2) + vgetq_lane(in, 3);
return acc;
}
__STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in)
{
float16x8_t tmpVec;
_Float16 acc;
tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in);
in = vaddq_f16(tmpVec, in);
tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in);
in = vaddq_f16(tmpVec, in);
acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4);
return acc;
}
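/* Editorial note: the vrev/vadd pairs above fold the eight f16 lanes
 * pairwise (first within each 32-bit half, then within each 64-bit half),
 * so the partial sums end up replicated across the vector; adding lanes 0
 * and 4 then yields the sum of all eight lanes. */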
/* Newton initial guess */
#define INVSQRT_MAGIC_F32 0x5f3759df
#define INV_NEWTON_INIT_F32 0x7EF127EA
#define INVSQRT_NEWTON_MVE_F32(invSqrt, xHalf, xStart)\
{ \
float32x4_t tmp; \
\
/* tmp = xhalf * x * x */ \
tmp = vmulq(xStart, xStart); \
tmp = vmulq(tmp, xHalf); \
/* (1.5f - xhalf * x * x) */ \
tmp = vsubq(vdupq_n_f32(1.5f), tmp); \
/* x = x*(1.5f-xhalf*x*x); */ \
invSqrt = vmulq(tmp, xStart); \
}
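/* Illustrative scalar equivalent (a sketch added for clarity, not part of
 * the original header): one Newton-Raphson refinement step for 1/sqrt(x),
 *     y1 = y0 * (1.5 - 0.5 * x * y0 * y0),
 * which is what INVSQRT_NEWTON_MVE_F32 performs lane-wise, with
 * xHalf == 0.5f * x and xStart holding the current estimate y0. */
static inline float invsqrt_newton_step_sketch(float xHalf, float y0)
{
    return y0 * (1.5f - xHalf * y0 * y0);
}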
#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) */
/***************************************
Definitions available for f16 datatype with HW acceleration only
***************************************/
#if defined (ARM_MATH_MVE_FLOAT16)
__STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
float16x8_t vecIn)
{
float16x8_t vecTmp, vecOut;
uint32_t tmp;
vecTmp = (float16x8_t) vrev64q_s32((int32x4_t) vecIn);
// TO TRACK : using canonical addition leads to inefficient code generation for f16
// vecTmp = vecTmp + vecAccCpx0;
/*
* Compute
* re0+re1 | im0+im1 | re0+re1 | im0+im1
* re2+re3 | im2+im3 | re2+re3 | im2+im3
*/
vecTmp = vaddq(vecTmp, vecIn);
vecOut = vecTmp;
/*
* shift left, random tmp insertion in bottom
*/
vecOut = vreinterpretq_f16_s32(vshlcq_s32(vreinterpretq_s32_f16(vecOut) , &tmp, 32));
/*
* Compute:
* DONTCARE | DONTCARE | re0+re1+re0+re1 |im0+im1+im0+im1
* re0+re1+re2+re3 | im0+im1+im2+im3 | re2+re3+re2+re3 |im2+im3+im2+im3
*/
vecOut = vaddq(vecOut, vecTmp);
/*
* Cmplx sum is in f16 lanes 4 and 5
* return full vector
*/
return vecOut;
}
#define mve_cmplx_sum_intra_r_i_f16(vec, Re, Im) \
{ \
float16x8_t vecOut = __mve_cmplx_sum_intra_vec_f16(vec); \
Re = vgetq_lane(vecOut, 4); \
Im = vgetq_lane(vecOut, 5); \
}
__STATIC_FORCEINLINE void mve_cmplx_sum_intra_vec_f16(
float16x8_t vecIn,
float16_t *pOut)
{
float16x8_t vecOut = __mve_cmplx_sum_intra_vec_f16(vecIn);
/*
* Cmplx sum is in f16 lanes 4 and 5
* use 32-bit extraction
*/
*(float32_t *) pOut = ((float32x4_t) vecOut)[2];
}
#define INVSQRT_MAGIC_F16 0x59ba /* ( 0x1ba = 0x3759df >> 13) */
/* canonical version of INVSQRT_NEWTON_MVE_F16 leads to bad performance */
#define INVSQRT_NEWTON_MVE_F16(invSqrt, xHalf, xStart) \
{ \
float16x8_t tmp; \
\
/* tmp = xhalf * x * x */ \
tmp = vmulq(xStart, xStart); \
tmp = vmulq(tmp, xHalf); \
/* (1.5f - xhalf * x * x) */ \
tmp = vsubq(vdupq_n_f16((float16_t)1.5), tmp); \
/* x = x*(1.5f-xhalf*x*x); */ \
invSqrt = vmulq(tmp, xStart); \
}
#endif
/***************************************
Definitions available for MVEI and MVEF only
***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
/* Following functions are used to transpose matrix in f32 and q31 cases */
__STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
uint32_t * pDataSrc,
uint32_t * pDataDest)
{
static const uint32x4_t vecOffs = { 0, 2, 1, 3 };
/*
*
* | 0 1 | => | 0 2 |
* | 2 3 | | 1 3 |
*
*/
uint32x4_t vecIn = vldrwq_u32((uint32_t const *)pDataSrc);
vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs, vecIn);
return (ARM_MATH_SUCCESS);
}
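/* Usage sketch (illustrative only; assumes an MVE-enabled target and is not
 * part of the original header): */
static inline void mat_trans_2x2_example(void)
{
    uint32_t src[4] = { 0, 1, 2, 3 };   /* | 0 1 |              */
    uint32_t dst[4];                    /* | 2 3 |  (row-major) */
    (void)arm_mat_trans_32bit_2x2_mve(src, dst);
    /* dst now holds { 0, 2, 1, 3 }, i.e. the transpose */
}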
__STATIC_INLINE arm_status arm_mat_trans_32bit_3x3_mve(
uint32_t * pDataSrc,
uint32_t * pDataDest)
{
const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
/*
*
* | 0 1 2 | | 0 3 6 | 4 x 32 flattened version | 0 3 6 1 |
* | 3 4 5 | => | 1 4 7 | => | 4 7 2 5 |
* | 6 7 8 | | 2 5 8 | (row major) | 8 . . . |
*
*/
uint32x4_t vecIn1 = vldrwq_u32((uint32_t const *) pDataSrc);
uint32x4_t vecIn2 = vldrwq_u32((uint32_t const *) &pDataSrc[4]);
vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs1, vecIn1);
vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs2, vecIn2);
pDataDest[8] = pDataSrc[8];
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_trans_32bit_4x4_mve(uint32_t * pDataSrc, uint32_t * pDataDest)
{
/*
* 4x4 Matrix transposition
* is 4 x de-interleave operation
*
* 0 1 2 3 0 4 8 12
* 4 5 6 7 1 5 9 13
* 8 9 10 11 2 6 10 14
* 12 13 14 15 3 7 11 15
*/
uint32x4x4_t vecIn;
vecIn = vld4q((uint32_t const *) pDataSrc);
vstrwq(pDataDest, vecIn.val[0]);
pDataDest += 4;
vstrwq(pDataDest, vecIn.val[1]);
pDataDest += 4;
vstrwq(pDataDest, vecIn.val[2]);
pDataDest += 4;
vstrwq(pDataDest, vecIn.val[3]);
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve(
uint16_t srcRows,
uint16_t srcCols,
uint32_t * pDataSrc,
uint32_t * pDataDest)
{
uint32x4_t vecOffs;
uint32_t i;
uint32_t blkCnt;
uint32_t const *pDataC;
uint32_t *pDataDestR;
uint32x4_t vecIn;
vecOffs = vidupq_u32((uint32_t)0, 1);
vecOffs = vecOffs * srcCols;
i = srcCols;
do
{
pDataC = (uint32_t const *) pDataSrc;
pDataDestR = pDataDest;
blkCnt = srcRows >> 2;
while (blkCnt > 0U)
{
vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
vstrwq(pDataDestR, vecIn);
pDataDestR += 4;
pDataC = pDataC + srcCols * 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = srcRows & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
vstrwq_p(pDataDestR, vecIn, p0);
}
pDataSrc += 1;
pDataDest += srcRows;
}
while (--i);
return (ARM_MATH_SUCCESS);
}
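/* Editorial note: the generic transpose above walks the source one column
 * at a time. vecOffs = {0,1,2,3} * srcCols gathers four vertically adjacent
 * elements of the current column, which are then stored contiguously as one
 * row of the destination; the tail (srcRows % 4) uses a predicated store. */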
__STATIC_INLINE arm_status arm_mat_cmplx_trans_32bit(
uint16_t srcRows,
uint16_t srcCols,
uint32_t *pDataSrc,
uint16_t dstRows,
uint16_t dstCols,
uint32_t *pDataDest)
{
uint32_t i;
uint32_t const *pDataC;
uint32_t *pDataRow;
uint32_t *pDataDestR, *pDataDestRow;
uint32x4_t vecOffsRef, vecOffsCur;
uint32_t blkCnt;
uint32x4_t vecIn;
#ifdef ARM_MATH_MATRIX_CHECK
/*
* Check for matrix mismatch condition
*/
if ((srcRows != dstCols) || (srcCols != dstRows))
{
/*
* Set status as ARM_MATH_SIZE_MISMATCH
*/
return ARM_MATH_SIZE_MISMATCH;
}
#else
(void)dstRows;
(void)dstCols;
#endif
/* 2x2, 3x3 and 4x4 specialization to be added */
vecOffsRef[0] = 0;
vecOffsRef[1] = 1;
vecOffsRef[2] = srcCols << 1;
vecOffsRef[3] = (srcCols << 1) + 1;
pDataRow = pDataSrc;
pDataDestRow = pDataDest;
i = srcCols;
do
{
pDataC = (uint32_t const *) pDataRow;
pDataDestR = pDataDestRow;
vecOffsCur = vecOffsRef;
blkCnt = (srcRows * CMPLX_DIM) >> 2;
while (blkCnt > 0U)
{
vecIn = vldrwq_gather_shifted_offset(pDataC, vecOffsCur);
vstrwq(pDataDestR, vecIn);
pDataDestR += 4;
vecOffsCur = vaddq(vecOffsCur, (srcCols << 2));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (srcRows * CMPLX_DIM) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecIn = vldrwq_gather_shifted_offset(pDataC, vecOffsCur);
vstrwq_p(pDataDestR, vecIn, p0);
}
pDataRow += CMPLX_DIM;
pDataDestRow += (srcRows * CMPLX_DIM);
}
while (--i);
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
{
pDataDest[0] = pDataSrc[0];
pDataDest[3] = pDataSrc[3];
pDataDest[2] = pDataSrc[1];
pDataDest[1] = pDataSrc[2];
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
{
static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
uint16x8_t vecOffs1;
uint16x8_t vecIn1;
/*
*
* | 0 1 2 | | 0 3 6 | 8 x 16 flattened version | 0 3 6 1 4 7 2 5 |
* | 3 4 5 | => | 1 4 7 | => | 8 . . . . . . . |
* | 6 7 8 | | 2 5 8 | (row major)
*
*/
vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
pDataDest[8] = pDataSrc[8];
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
{
static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };
static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };
uint16x8_t vecOffs1, vecOffs2;
uint16x8_t vecIn1, vecIn2;
uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
/*
* 4x4 Matrix transposition
*
* | 0 1 2 3 | | 0 4 8 12 | 8 x 16 flattened version
* | 4 5 6 7 | => | 1 5 9 13 | => [0 4 8 12 1 5 9 13]
* | 8 9 10 11 | | 2 6 10 14 | [2 6 10 14 3 7 11 15]
* | 12 13 14 15 | | 3 7 11 15 |
*/
vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
vecIn1 = vldrhq_u16(pDataSrcVec);
pDataSrcVec += 8;
vecIn2 = vldrhq_u16(pDataSrcVec);
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_trans_16bit_generic(
uint16_t srcRows,
uint16_t srcCols,
uint16_t * pDataSrc,
uint16_t * pDataDest)
{
uint16x8_t vecOffs;
uint32_t i;
uint32_t blkCnt;
uint16_t const *pDataC;
uint16_t *pDataDestR;
uint16x8_t vecIn;
vecOffs = vidupq_u16((uint32_t)0, 1);
vecOffs = vecOffs * srcCols;
i = srcCols;
while(i > 0U)
{
pDataC = (uint16_t const *) pDataSrc;
pDataDestR = pDataDest;
blkCnt = srcRows >> 3;
while (blkCnt > 0U)
{
vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
vstrhq_u16(pDataDestR, vecIn);
pDataDestR += 8;
pDataC = pDataC + srcCols * 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = srcRows & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
vstrhq_p_u16(pDataDestR, vecIn, p0);
}
pDataSrc += 1;
pDataDest += srcRows;
i--;
}
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_cmplx_trans_16bit(
uint16_t srcRows,
uint16_t srcCols,
uint16_t *pDataSrc,
uint16_t dstRows,
uint16_t dstCols,
uint16_t *pDataDest)
{
static const uint16_t loadCmplxCol[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
int i;
uint16x8_t vecOffsRef, vecOffsCur;
uint16_t const *pDataC;
uint16_t *pDataRow;
uint16_t *pDataDestR, *pDataDestRow;
uint32_t blkCnt;
uint16x8_t vecIn;
#ifdef ARM_MATH_MATRIX_CHECK
/*
* Check for matrix mismatch condition
*/
if ((srcRows != dstCols) || (srcCols != dstRows))
{
/*
* Set status as ARM_MATH_SIZE_MISMATCH
*/
return ARM_MATH_SIZE_MISMATCH;
}
#else
(void)dstRows;
(void)dstCols;
#endif
/*
* 2x2, 3x3 and 4x4 specialization to be added
*/
/*
* build [0, 1, 2xcol, 2xcol+1, 4xcol, 4xcol+1, 6xcol, 6xcol+1]
*/
vecOffsRef = vldrhq_u16((uint16_t const *) loadCmplxCol);
vecOffsRef = vmulq(vecOffsRef, (uint16_t) (srcCols * CMPLX_DIM))
+ viwdupq_u16((uint32_t)0, (uint16_t) 2, 1);
pDataRow = pDataSrc;
pDataDestRow = pDataDest;
i = srcCols;
do
{
pDataC = (uint16_t const *) pDataRow;
pDataDestR = pDataDestRow;
vecOffsCur = vecOffsRef;
blkCnt = (srcRows * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
vecIn = vldrhq_gather_shifted_offset(pDataC, vecOffsCur);
vstrhq(pDataDestR, vecIn);
pDataDestR+= 8; // VEC_LANES_U16
vecOffsCur = vaddq(vecOffsCur, (srcCols << 3));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (srcRows * CMPLX_DIM) & 0x7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vldrhq_gather_shifted_offset(pDataC, vecOffsCur);
vstrhq_p(pDataDestR, vecIn, p0);
}
pDataRow += CMPLX_DIM;
pDataDestRow += (srcRows * CMPLX_DIM);
}
while (--i);
return (ARM_MATH_SUCCESS);
}
#endif /* MVEF and MVEI */
/***************************************
Definitions available for MVEI only
***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
#include "arm_common_tables.h"
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
#define MVE_ASRL_SAT32(acc, shift) ((sqrshrl(acc, -(32-shift)) >> 32) & 0xffffffff)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
__STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
{
q63x2_t vecTmpLL;
q31x4_t vecTmp0, vecTmp1;
q31_t scale;
q63_t tmp64;
q31x4_t vecNrm, vecDst, vecIdx, vecSignBits;
vecSignBits = vclsq(vecIn);
vecSignBits = vbicq(vecSignBits, 1);
/*
* in = in << no_of_sign_bits;
*/
vecNrm = vshlq(vecIn, vecSignBits);
/*
* index = in >> 24;
*/
vecIdx = vecNrm >> 24;
vecIdx = vecIdx << 1;
vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, (uint32x4_t)vecIdx);
vecIdx = vecIdx + 1;
vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, (uint32x4_t)vecIdx);
vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
vecTmp0 = vecTmp0 - vecTmp1;
vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
vecTmp1 = vdupq_n_s32(0x18000000) - vecTmp1;
vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
vecTmpLL = vmullbq_int(vecNrm, vecTmp0);
/*
* scale elements 0, 2
*/
scale = 26 + (vecSignBits[0] >> 1);
tmp64 = asrl(vecTmpLL[0], scale);
vecDst[0] = (q31_t) tmp64;
scale = 26 + (vecSignBits[2] >> 1);
tmp64 = asrl(vecTmpLL[1], scale);
vecDst[2] = (q31_t) tmp64;
vecTmpLL = vmulltq_int(vecNrm, vecTmp0);
/*
* scale elements 1, 3
*/
scale = 26 + (vecSignBits[1] >> 1);
tmp64 = asrl(vecTmpLL[0], scale);
vecDst[1] = (q31_t) tmp64;
scale = 26 + (vecSignBits[3] >> 1);
tmp64 = asrl(vecTmpLL[1], scale);
vecDst[3] = (q31_t) tmp64;
/*
* set negative values to 0
*/
vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s32(vecIn, 0));
return vecDst;
}
#endif
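/* Editorial note: FAST_VSQRT_Q31 normalizes each lane by shifting out an
 * even number of sign bits, looks up initial 1/sqrt estimates in
 * sqrtTable_Q31 (indexed by the top bits of the normalized value), refines
 * them with a Newton-Raphson style iteration, multiplies by the normalized
 * input to obtain its square root, then rescales by half the normalization
 * shift. Negative inputs are forced to 0. */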
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
__STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
{
q31x4_t vecTmpLev, vecTmpLodd, vecSignL;
q15x8_t vecTmp0, vecTmp1;
q15x8_t vecNrm, vecDst, vecIdx, vecSignBits;
vecDst = vuninitializedq_s16();
vecSignBits = vclsq(vecIn);
vecSignBits = vbicq(vecSignBits, 1);
/*
* in = in << no_of_sign_bits;
*/
vecNrm = vshlq(vecIn, vecSignBits);
vecIdx = vecNrm >> 8;
vecIdx = vecIdx << 1;
vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, (uint16x8_t)vecIdx);
vecIdx = vecIdx + 1;
vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, (uint16x8_t)vecIdx);
vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
vecTmp0 = vecTmp0 - vecTmp1;
vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
vecTmp1 = vdupq_n_s16(0x1800) - vecTmp1;
vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
vecSignBits = vecSignBits >> 1;
vecTmpLev = vmullbq_int(vecNrm, vecTmp0);
vecTmpLodd = vmulltq_int(vecNrm, vecTmp0);
vecTmp0 = vecSignBits + 10;
/*
* negate sign to apply register based vshl
*/
vecTmp0 = -vecTmp0;
/*
* shift even elements
*/
vecSignL = vmovlbq(vecTmp0);
vecTmpLev = vshlq(vecTmpLev, vecSignL);
/*
* shift odd elements
*/
vecSignL = vmovltq(vecTmp0);
vecTmpLodd = vshlq(vecTmpLodd, vecSignL);
/*
* merge and narrow odd and even parts
*/
vecDst = vmovnbq_s32(vecDst, vecTmpLev);
vecDst = vmovntq_s32(vecDst, vecTmpLodd);
/*
* set negative values to 0
*/
vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s16(vecIn, 0));
return vecDst;
}
#endif
#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI) */
#ifdef __cplusplus
}
#endif
#endif

View File

@ -1,246 +0,0 @@
/******************************************************************************
* @file arm_math.h
* @brief Public header file for CMSIS DSP Library
* @version V1.7.0
* @date 18. March 2019
******************************************************************************/
/*
* Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
\mainpage CMSIS DSP Software Library
*
* \section intro Introduction
*
* This user manual describes the CMSIS DSP software library,
* a suite of common signal processing functions for use on Cortex-M and Cortex-A
* processor-based devices.
*
* The library is divided into a number of functions each covering a specific category:
* - Basic math functions
* - Fast math functions
* - Complex math functions
* - Filtering functions
* - Matrix functions
* - Transform functions
* - Motor control functions
* - Statistical functions
* - Support functions
* - Interpolation functions
* - Support Vector Machine functions (SVM)
* - Bayes classifier functions
* - Distance functions
*
* The library generally has separate functions for operating on 8-bit integer, 16-bit integer,
* 32-bit integer, and 32-bit floating-point values.
*
* \section using Using the Library
*
* The library installer contains prebuilt versions of the libraries in the <code>Lib</code> folder.
*
* Here is the list of pre-built libraries:
* - arm_cortexM7lfdp_math.lib (Cortex-M7, Little endian, Double Precision Floating Point Unit)
* - arm_cortexM7bfdp_math.lib (Cortex-M7, Big endian, Double Precision Floating Point Unit)
* - arm_cortexM7lfsp_math.lib (Cortex-M7, Little endian, Single Precision Floating Point Unit)
* - arm_cortexM7bfsp_math.lib (Cortex-M7, Big endian, Single Precision Floating Point Unit)
* - arm_cortexM7l_math.lib (Cortex-M7, Little endian)
* - arm_cortexM7b_math.lib (Cortex-M7, Big endian)
* - arm_cortexM4lf_math.lib (Cortex-M4, Little endian, Floating Point Unit)
* - arm_cortexM4bf_math.lib (Cortex-M4, Big endian, Floating Point Unit)
* - arm_cortexM4l_math.lib (Cortex-M4, Little endian)
* - arm_cortexM4b_math.lib (Cortex-M4, Big endian)
* - arm_cortexM3l_math.lib (Cortex-M3, Little endian)
* - arm_cortexM3b_math.lib (Cortex-M3, Big endian)
* - arm_cortexM0l_math.lib (Cortex-M0 / Cortex-M0+, Little endian)
* - arm_cortexM0b_math.lib (Cortex-M0 / Cortex-M0+, Big endian)
* - arm_ARMv8MBLl_math.lib (Armv8-M Baseline, Little endian)
* - arm_ARMv8MMLl_math.lib (Armv8-M Mainline, Little endian)
* - arm_ARMv8MMLlfsp_math.lib (Armv8-M Mainline, Little endian, Single Precision Floating Point Unit)
* - arm_ARMv8MMLld_math.lib (Armv8-M Mainline, Little endian, DSP instructions)
* - arm_ARMv8MMLldfsp_math.lib (Armv8-M Mainline, Little endian, DSP instructions, Single Precision Floating Point Unit)
*
* The library functions are declared in the public file <code>arm_math.h</code>, which is placed in the <code>Include</code> folder.
* Simply include this file and link the appropriate library in the application to begin calling the library functions. The library provides a single
* public header file <code>arm_math.h</code> for little endian and big endian Cortex-M cores; the same header file is used for the floating point unit (FPU) variants.
*
*
* \section example Examples
*
* The library ships with a number of examples which demonstrate how to use the library functions.
*
* \section toolchain Toolchain Support
*
* The library is now tested on Fast Models, building with cmake.
* Cores M0, M7, and A5 are tested.
*
*
*
* \section building Building the Library
*
* The library installer contains a project file to rebuild libraries on MDK toolchain in the <code>CMSIS\\DSP\\Projects\\ARM</code> folder.
* - arm_cortexM_math.uvprojx
*
*
* The libraries can be built by opening the arm_cortexM_math.uvprojx project in MDK-ARM, selecting a specific target, and defining the optional preprocessor macros detailed below.
*
* There is also a work-in-progress cmake build; the README file gives more details.
*
* \section preprocessor Preprocessor Macros
*
* Each library project has different preprocessor macros.
*
* - ARM_MATH_BIG_ENDIAN:
*
* Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default library builds for little endian targets.
*
* - ARM_MATH_MATRIX_CHECK:
*
* Define macro ARM_MATH_MATRIX_CHECK to enable checking of the input and output sizes of matrices.
*
* - ARM_MATH_ROUNDING:
*
* Define macro ARM_MATH_ROUNDING to enable rounding in the support functions.
*
* - ARM_MATH_LOOPUNROLL:
*
* Define macro ARM_MATH_LOOPUNROLL to enable manual loop unrolling in DSP functions
*
* - ARM_MATH_NEON:
*
* Define macro ARM_MATH_NEON to enable Neon versions of the DSP functions.
* It is not enabled by default when Neon is available because performance
* depends on the compiler and target architecture.
*
* - ARM_MATH_NEON_EXPERIMENTAL:
*
* Define macro ARM_MATH_NEON_EXPERIMENTAL to enable experimental Neon versions
* of some DSP functions. The experimental Neon versions currently do not
* perform better than the scalar versions.
*
* - ARM_MATH_HELIUM:
*
* It implies the flags ARM_MATH_MVEF, ARM_MATH_MVEI, and ARM_MATH_FLOAT16.
*
* - ARM_MATH_MVEF:
*
* Select Helium versions of the f32 algorithms.
* It implies ARM_MATH_FLOAT16 and ARM_MATH_MVEI.
*
* - ARM_MATH_MVEI:
*
* Select Helium versions of the int and fixed point algorithms.
*
* - ARM_MATH_MVE_FLOAT16:
*
* MVE Float16 implementations of some algorithms (Requires MVE extension).
*
* - DISABLEFLOAT16:
*
* Disable float16 algorithms when __fp16 is not supported for a
* specific compiler / core configuration.
*
* <hr>
* \section pack CMSIS-DSP in ARM::CMSIS Pack
*
* The following files relevant to CMSIS-DSP are present in the <b>ARM::CMSIS</b> Pack directories:
* |File/Folder |Content |
* |---------------------------------|------------------------------------------------------------------------|
* |\b CMSIS\\Documentation\\DSP | This documentation |
* |\b CMSIS\\DSP\\DSP_Lib_TestSuite | DSP_Lib deprecated test suite |
* |\b CMSIS\\DSP\\Examples | Example projects demonstrating the usage of the library functions |
* |\b CMSIS\\DSP\\Include | DSP_Lib include files for using and building the lib |
* |\b CMSIS\\DSP\\PrivateInclude | DSP_Lib private include files for building the lib |
* |\b CMSIS\\DSP\\Lib | DSP_Lib binaries |
* |\b CMSIS\\DSP\\Projects | Projects to rebuild DSP_Lib binaries |
* |\b CMSIS\\DSP\\Source | DSP_Lib source files |
*
* <hr>
* \section rev Revision History of CMSIS-DSP
* Please refer to \ref ChangeLog_pg.
*/
/**
* @defgroup groupExamples Examples
*/
#ifndef _ARM_MATH_H
#define _ARM_MATH_H
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#include "dsp/basic_math_functions.h"
#include "dsp/interpolation_functions.h"
#include "dsp/bayes_functions.h"
#include "dsp/matrix_functions.h"
#include "dsp/complex_math_functions.h"
#include "dsp/statistics_functions.h"
#include "dsp/controller_functions.h"
#include "dsp/support_functions.h"
#include "dsp/distance_functions.h"
#include "dsp/svm_functions.h"
#include "dsp/fast_math_functions.h"
#include "dsp/transform_functions.h"
#include "dsp/filtering_functions.h"
#ifdef __cplusplus
extern "C"
{
#endif
//#define TABLE_SPACING_Q31 0x400000
//#define TABLE_SPACING_Q15 0x80
#ifdef __cplusplus
}
#endif
#endif /* _ARM_MATH_H */
/**
*
* End of file.
*/

View File

@ -1,240 +0,0 @@
/******************************************************************************
* @file arm_math_memory.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_MATH_MEMORY_H_
#define _ARM_MATH_MEMORY_H_
#include "arm_math_types.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
@brief Definitions to read and write two 16-bit values.
@deprecated
*/
#if defined ( __CC_ARM )
#define __SIMD32_TYPE int32_t __packed
#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
#define __SIMD32_TYPE int32_t
#elif defined ( __GNUC__ )
#define __SIMD32_TYPE int32_t
#elif defined ( __ICCARM__ )
#define __SIMD32_TYPE int32_t __packed
#elif defined ( __TI_ARM__ )
#define __SIMD32_TYPE int32_t
#elif defined ( __CSMC__ )
#define __SIMD32_TYPE int32_t
#elif defined ( __TASKING__ )
#define __SIMD32_TYPE __un(aligned) int32_t
#elif defined(_MSC_VER )
#define __SIMD32_TYPE int32_t
#else
#error Unknown compiler
#endif
#define __SIMD32(addr) (*(__SIMD32_TYPE **) & (addr))
#define __SIMD32_CONST(addr) ( (__SIMD32_TYPE * ) (addr))
#define _SIMD32_OFFSET(addr) (*(__SIMD32_TYPE * ) (addr))
#define __SIMD64(addr) (*( int64_t **) & (addr))
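/* Editorial note: __SIMD32(ptr) reinterprets ptr as a pointer to a packed
 * int32_t, so two q15_t (or four q7_t) samples can be moved in one 32-bit
 * access, e.g. *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; these macros are
 * deprecated in favour of the read and write helpers below. */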
/* SIMD replacement */
/**
@brief Read 2 Q15 from Q15 pointer.
@param[in] pQ15 points to input value
@return Q31 value
*/
__STATIC_FORCEINLINE q31_t read_q15x2 (
q15_t * pQ15)
{
q31_t val;
#ifdef __ARM_FEATURE_UNALIGNED
memcpy (&val, pQ15, 4);
#else
val = (pQ15[1] << 16) | (pQ15[0] & 0x0FFFF);
#endif
return (val);
}
/**
@brief Read 2 Q15 from Q15 pointer and increment pointer afterwards.
@param[in] pQ15 points to input value
@return Q31 value
*/
__STATIC_FORCEINLINE q31_t read_q15x2_ia (
q15_t ** pQ15)
{
q31_t val;
#ifdef __ARM_FEATURE_UNALIGNED
memcpy (&val, *pQ15, 4);
#else
val = ((*pQ15)[1] << 16) | ((*pQ15)[0] & 0x0FFFF);
#endif
*pQ15 += 2;
return (val);
}
/**
@brief Read 2 Q15 from Q15 pointer and decrement pointer afterwards.
@param[in] pQ15 points to input value
@return Q31 value
*/
__STATIC_FORCEINLINE q31_t read_q15x2_da (
q15_t ** pQ15)
{
q31_t val;
#ifdef __ARM_FEATURE_UNALIGNED
memcpy (&val, *pQ15, 4);
#else
val = ((*pQ15)[1] << 16) | ((*pQ15)[0] & 0x0FFFF);
#endif
*pQ15 -= 2;
return (val);
}
/**
@brief Write 2 Q15 to Q15 pointer and increment pointer afterwards.
@param[in] pQ15 points to the destination pointer
@param[in] value Q31 value
@return none
*/
__STATIC_FORCEINLINE void write_q15x2_ia (
q15_t ** pQ15,
q31_t value)
{
q31_t val = value;
#ifdef __ARM_FEATURE_UNALIGNED
memcpy (*pQ15, &val, 4);
#else
(*pQ15)[0] = (val & 0x0FFFF);
(*pQ15)[1] = (val >> 16) & 0x0FFFF;
#endif
*pQ15 += 2;
}
/**
@brief Write 2 Q15 to Q15 pointer.
@param[in] pQ15 points to the destination values
@param[in] value Q31 value
@return none
*/
__STATIC_FORCEINLINE void write_q15x2 (
q15_t * pQ15,
q31_t value)
{
q31_t val = value;
#ifdef __ARM_FEATURE_UNALIGNED
memcpy (pQ15, &val, 4);
#else
pQ15[0] = val & 0x0FFFF;
pQ15[1] = val >> 16;
#endif
}
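/* Illustrative round-trip (a sketch added for clarity, not part of the
 * original header): two q15 values packed into one q31 word, with the
 * element at the lower address in the low half-word on little-endian. */
static inline void q15x2_roundtrip_sketch(void)
{
    q15_t pair[2] = { (q15_t)0x1234, (q15_t)0x5678 };
    q31_t packed  = read_q15x2(pair);   /* 0x56781234             */
    write_q15x2(pair, packed);          /* restores the same pair */
}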
/**
@brief Read 4 Q7 from Q7 pointer and increment pointer afterwards.
@param[in] pQ7 points to input value
@return Q31 value
*/
__STATIC_FORCEINLINE q31_t read_q7x4_ia (
q7_t ** pQ7)
{
q31_t val;
#ifdef __ARM_FEATURE_UNALIGNED
memcpy (&val, *pQ7, 4);
#else
val =(((*pQ7)[3] & 0x0FF) << 24) | (((*pQ7)[2] & 0x0FF) << 16) | (((*pQ7)[1] & 0x0FF) << 8) | ((*pQ7)[0] & 0x0FF);
#endif
*pQ7 += 4;
return (val);
}
/**
@brief Read 4 Q7 from Q7 pointer and decrement pointer afterwards.
@param[in] pQ7 points to input value
@return Q31 value
*/
__STATIC_FORCEINLINE q31_t read_q7x4_da (
q7_t ** pQ7)
{
q31_t val;
#ifdef __ARM_FEATURE_UNALIGNED
memcpy (&val, *pQ7, 4);
#else
val = ((((*pQ7)[3]) & 0x0FF) << 24) | ((((*pQ7)[2]) & 0x0FF) << 16) | ((((*pQ7)[1]) & 0x0FF) << 8) | ((*pQ7)[0] & 0x0FF);
#endif
*pQ7 -= 4;
return (val);
}
/**
@brief Write 4 Q7 to Q7 pointer and increment pointer afterwards.
@param[in] pQ7 points to the destination pointer
@param[in] value Q31 value
@return none
*/
__STATIC_FORCEINLINE void write_q7x4_ia (
q7_t ** pQ7,
q31_t value)
{
q31_t val = value;
#ifdef __ARM_FEATURE_UNALIGNED
memcpy (*pQ7, &val, 4);
#else
(*pQ7)[0] = val & 0x0FF;
(*pQ7)[1] = (val >> 8) & 0x0FF;
(*pQ7)[2] = (val >> 16) & 0x0FF;
(*pQ7)[3] = (val >> 24) & 0x0FF;
#endif
*pQ7 += 4;
}
#ifdef __cplusplus
}
#endif
#endif /*ifndef _ARM_MATH_MEMORY_H_ */

View File

@ -1,598 +0,0 @@
/******************************************************************************
* @file arm_math_types.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_MATH_TYPES_H_
#define _ARM_MATH_TYPES_H_
#ifdef __cplusplus
extern "C"
{
#endif
/* Compiler specific diagnostic adjustment */
#if defined ( __CC_ARM )
#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
#elif defined ( __GNUC__ )
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-conversion"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#elif defined ( __ICCARM__ )
#elif defined ( __TI_ARM__ )
#elif defined ( __CSMC__ )
#elif defined ( __TASKING__ )
#elif defined ( _MSC_VER )
#else
#error Unknown compiler
#endif
/* Included for intrinsics definitions */
#if defined (_MSC_VER )
#include <stdint.h>
#define __STATIC_FORCEINLINE static __forceinline
#define __STATIC_INLINE static __inline
#define __ALIGNED(x) __declspec(align(x))
#elif defined (__GNUC_PYTHON__)
#include <stdint.h>
#define __ALIGNED(x) __attribute__((aligned(x)))
#define __STATIC_FORCEINLINE static __attribute__((inline))
#define __STATIC_INLINE static __attribute__((inline))
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wattributes"
#else
#include "cmsis_compiler.h"
#endif
#include <string.h>
#include <math.h>
#include <float.h>
#include <limits.h>
/* evaluate ARM DSP feature */
#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
#define ARM_MATH_DSP 1
#endif
#if defined(ARM_MATH_NEON)
#include <arm_neon.h>
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if !defined(ARM_MATH_NEON_FLOAT16)
#define ARM_MATH_NEON_FLOAT16
#endif
#endif
#endif
#if !defined(ARM_MATH_AUTOVECTORIZE)
#if __ARM_FEATURE_MVE
#if !defined(ARM_MATH_MVEI)
#define ARM_MATH_MVEI
#endif
#endif
#if (__ARM_FEATURE_MVE & 2)
#if !defined(ARM_MATH_MVEF)
#define ARM_MATH_MVEF
#endif
#if !defined(ARM_MATH_MVE_FLOAT16)
/* HW Float16 not yet well supported on gcc for M55 */
#if !defined(__CMSIS_GCC_H)
#define ARM_MATH_MVE_FLOAT16
#endif
#endif
#endif
#endif /*!defined(ARM_MATH_AUTOVECTORIZE)*/
#if defined (ARM_MATH_HELIUM)
#if !defined(ARM_MATH_MVEF)
#define ARM_MATH_MVEF
#endif
#if !defined(ARM_MATH_MVEI)
#define ARM_MATH_MVEI
#endif
#if !defined(ARM_MATH_MVE_FLOAT16)
/* HW Float16 not yet well supported on gcc for M55 */
#if !defined(__CMSIS_GCC_H)
#define ARM_MATH_MVE_FLOAT16
#endif
#endif
#endif
#if defined ( __CC_ARM )
/* Enter low optimization region - place directly above function definition */
#if defined( __ARM_ARCH_7EM__ )
#define LOW_OPTIMIZATION_ENTER \
_Pragma ("push") \
_Pragma ("O1")
#else
#define LOW_OPTIMIZATION_ENTER
#endif
/* Exit low optimization region - place directly after end of function definition */
#if defined ( __ARM_ARCH_7EM__ )
#define LOW_OPTIMIZATION_EXIT \
_Pragma ("pop")
#else
#define LOW_OPTIMIZATION_EXIT
#endif
/* Enter low optimization region - place directly above function definition */
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
/* Exit low optimization region - place directly after end of function definition */
#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
#elif defined (__ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
#define LOW_OPTIMIZATION_ENTER
#define LOW_OPTIMIZATION_EXIT
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
#elif defined ( __GNUC__ )
#define LOW_OPTIMIZATION_ENTER \
__attribute__(( optimize("-O1") ))
#define LOW_OPTIMIZATION_EXIT
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
#elif defined ( __ICCARM__ )
/* Enter low optimization region - place directly above function definition */
#if defined ( __ARM_ARCH_7EM__ )
#define LOW_OPTIMIZATION_ENTER \
_Pragma ("optimize=low")
#else
#define LOW_OPTIMIZATION_ENTER
#endif
/* Exit low optimization region - place directly after end of function definition */
#define LOW_OPTIMIZATION_EXIT
/* Enter low optimization region - place directly above function definition */
#if defined ( __ARM_ARCH_7EM__ )
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER \
_Pragma ("optimize=low")
#else
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
#endif
/* Exit low optimization region - place directly after end of function definition */
#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
#elif defined ( __TI_ARM__ )
#define LOW_OPTIMIZATION_ENTER
#define LOW_OPTIMIZATION_EXIT
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
#elif defined ( __CSMC__ )
#define LOW_OPTIMIZATION_ENTER
#define LOW_OPTIMIZATION_EXIT
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
#elif defined ( __TASKING__ )
#define LOW_OPTIMIZATION_ENTER
#define LOW_OPTIMIZATION_EXIT
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
#elif defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
#define LOW_OPTIMIZATION_ENTER
#define LOW_OPTIMIZATION_EXIT
#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
#endif
/* Compiler specific diagnostic adjustment */
#if defined ( __CC_ARM )
#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
#elif defined ( __GNUC__ )
#pragma GCC diagnostic pop
#elif defined ( __ICCARM__ )
#elif defined ( __TI_ARM__ )
#elif defined ( __CSMC__ )
#elif defined ( __TASKING__ )
#elif defined ( _MSC_VER )
#else
#error Unknown compiler
#endif
#ifdef __cplusplus
}
#endif
#if __ARM_FEATURE_MVE
#include <arm_mve.h>
#endif
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief 8-bit fractional data type in 1.7 format.
*/
typedef int8_t q7_t;
/**
* @brief 16-bit fractional data type in 1.15 format.
*/
typedef int16_t q15_t;
/**
* @brief 32-bit fractional data type in 1.31 format.
*/
typedef int32_t q31_t;
/**
* @brief 64-bit fractional data type in 1.63 format.
*/
typedef int64_t q63_t;
/**
* @brief 32-bit floating-point type definition.
*/
typedef float float32_t;
/**
* @brief 64-bit floating-point type definition.
*/
typedef double float64_t;
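/* Illustrative conversion (a sketch added for clarity, not part of the
 * original header): q15_t stores values in [-1, 1) scaled by 2^15, so a
 * float is converted by scaling and saturating. */
static inline q15_t f32_to_q15_sketch(float32_t x)
{
    float32_t scaled = x * 32768.0f;
    if (scaled >  32767.0f) { scaled =  32767.0f; }  /* saturate high */
    if (scaled < -32768.0f) { scaled = -32768.0f; }  /* saturate low  */
    return (q15_t)scaled;
}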
/**
* @brief vector types
*/
#if defined(ARM_MATH_NEON) || defined (ARM_MATH_MVEI)
/**
* @brief 64-bit fractional 128-bit vector data type in 1.63 format
*/
typedef int64x2_t q63x2_t;
/**
* @brief 32-bit fractional 128-bit vector data type in 1.31 format.
*/
typedef int32x4_t q31x4_t;
/**
* @brief 16-bit fractional 128-bit vector data type with 16-bit alignment in 1.15 format.
*/
typedef __ALIGNED(2) int16x8_t q15x8_t;
/**
* @brief 8-bit fractional 128-bit vector data type with 8-bit alignment in 1.7 format.
*/
typedef __ALIGNED(1) int8x16_t q7x16_t;
/**
* @brief 32-bit fractional 128-bit vector pair data type in 1.31 format.
*/
typedef int32x4x2_t q31x4x2_t;
/**
* @brief 32-bit fractional 128-bit vector quadruplet data type in 1.31 format.
*/
typedef int32x4x4_t q31x4x4_t;
/**
* @brief 16-bit fractional 128-bit vector pair data type in 1.15 format.
*/
typedef int16x8x2_t q15x8x2_t;
/**
* @brief 16-bit fractional 128-bit vector quadruplet data type in 1.15 format.
*/
typedef int16x8x4_t q15x8x4_t;
/**
* @brief 8-bit fractional 128-bit vector pair data type in 1.7 format.
*/
typedef int8x16x2_t q7x16x2_t;
/**
* @brief 8-bit fractional 128-bit vector quadruplet data type in 1.7 format.
*/
typedef int8x16x4_t q7x16x4_t;
/**
* @brief 32-bit fractional data type in 9.23 format.
*/
typedef int32_t q23_t;
/**
* @brief 32-bit fractional 128-bit vector data type in 9.23 format.
*/
typedef int32x4_t q23x4_t;
/**
* @brief 64-bit status 128-bit vector data type.
*/
typedef int64x2_t status64x2_t;
/**
* @brief 32-bit status 128-bit vector data type.
*/
typedef int32x4_t status32x4_t;
/**
* @brief 16-bit status 128-bit vector data type.
*/
typedef int16x8_t status16x8_t;
/**
* @brief 8-bit status 128-bit vector data type.
*/
typedef int8x16_t status8x16_t;
#endif
#if defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF) /* floating point vector*/
/**
* @brief 32-bit floating-point 128-bit vector type
*/
typedef float32x4_t f32x4_t;
/**
* @brief 32-bit floating-point 128-bit vector pair data type
*/
typedef float32x4x2_t f32x4x2_t;
/**
* @brief 32-bit floating-point 128-bit vector quadruplet data type
*/
typedef float32x4x4_t f32x4x4_t;
/**
* @brief 32-bit ubiquitous 128-bit vector data type
*/
typedef union _any32x4_t
{
float32x4_t f;
int32x4_t i;
} any32x4_t;
#endif
#if defined(ARM_MATH_NEON)
/**
* @brief 32-bit fractional 64-bit vector data type in 1.31 format.
*/
typedef int32x2_t q31x2_t;
/**
* @brief 16-bit fractional 64-bit vector data type in 1.15 format.
*/
typedef __ALIGNED(2) int16x4_t q15x4_t;
/**
* @brief 8-bit fractional 64-bit vector data type in 1.7 format.
*/
typedef __ALIGNED(1) int8x8_t q7x8_t;
/**
* @brief 32-bit float 64-bit vector data type.
*/
typedef float32x2_t f32x2_t;
/**
* @brief 32-bit floating-point 128-bit vector triplet data type
*/
typedef float32x4x3_t f32x4x3_t;
/**
* @brief 32-bit fractional 128-bit vector triplet data type in 1.31 format
*/
typedef int32x4x3_t q31x4x3_t;
/**
* @brief 16-bit fractional 128-bit vector triplet data type in 1.15 format
*/
typedef int16x8x3_t q15x8x3_t;
/**
* @brief 8-bit fractional 128-bit vector triplet data type in 1.7 format
*/
typedef int8x16x3_t q7x16x3_t;
/**
* @brief 32-bit floating-point 64-bit vector pair data type
*/
typedef float32x2x2_t f32x2x2_t;
/**
* @brief 32-bit floating-point 64-bit vector triplet data type
*/
typedef float32x2x3_t f32x2x3_t;
/**
* @brief 32-bit floating-point 64-bit vector quadruplet data type
*/
typedef float32x2x4_t f32x2x4_t;
/**
* @brief 32-bit fractional 64-bit vector pair data type in 1.31 format
*/
typedef int32x2x2_t q31x2x2_t;
/**
* @brief 32-bit fractional 64-bit vector triplet data type in 1.31 format
*/
typedef int32x2x3_t q31x2x3_t;
/**
* @brief 32-bit fractional 64-bit vector quadruplet data type in 1.31 format
*/
typedef int32x2x4_t q31x2x4_t;
/**
* @brief 16-bit fractional 64-bit vector pair data type in 1.15 format
*/
typedef int16x4x2_t q15x4x2_t;
/**
* @brief 16-bit fractional 64-bit vector triplet data type in 1.15 format
*/
typedef int16x4x3_t q15x4x3_t;
/**
* @brief 16-bit fractional 64-bit vector quadruplet data type in 1.15 format
*/
typedef int16x4x4_t q15x4x4_t;
/**
* @brief 8-bit fractional 64-bit vector pair data type in 1.7 format
*/
typedef int8x8x2_t q7x8x2_t;
/**
* @brief 8-bit fractional 64-bit vector triplet data type in 1.7 format
*/
typedef int8x8x3_t q7x8x3_t;
/**
* @brief 8-bit fractional 64-bit vector quadruplet data type in 1.7 format
*/
typedef int8x8x4_t q7x8x4_t;
/**
* @brief 32-bit ubiquitous 64-bit vector data type
*/
typedef union _any32x2_t
{
float32x2_t f;
int32x2_t i;
} any32x2_t;
/**
* @brief 32-bit status 64-bit vector data type.
*/
typedef int32x2_t status32x2_t;
/**
* @brief 16-bit status 64-bit vector data type.
*/
typedef int16x4_t status16x4_t;
/**
* @brief 8-bit status 64-bit vector data type.
*/
typedef int8x8_t status8x8_t;
#endif
#define F64_MAX ((float64_t)DBL_MAX)
#define F32_MAX ((float32_t)FLT_MAX)
#define F64_MIN (-DBL_MAX)
#define F32_MIN (-FLT_MAX)
#define F64_ABSMAX ((float64_t)DBL_MAX)
#define F32_ABSMAX ((float32_t)FLT_MAX)
#define F64_ABSMIN ((float64_t)0.0)
#define F32_ABSMIN ((float32_t)0.0)
#define Q31_MAX ((q31_t)(0x7FFFFFFFL))
#define Q15_MAX ((q15_t)(0x7FFF))
#define Q7_MAX ((q7_t)(0x7F))
#define Q31_MIN ((q31_t)(0x80000000L))
#define Q15_MIN ((q15_t)(0x8000))
#define Q7_MIN ((q7_t)(0x80))
#define Q31_ABSMAX ((q31_t)(0x7FFFFFFFL))
#define Q15_ABSMAX ((q15_t)(0x7FFF))
#define Q7_ABSMAX ((q7_t)(0x7F))
#define Q31_ABSMIN ((q31_t)0)
#define Q15_ABSMIN ((q15_t)0)
#define Q7_ABSMIN ((q7_t)0)
/* Dimension C vector space */
#define CMPLX_DIM 2
/**
* @brief Error status returned by some functions in the library.
*/
typedef enum
{
ARM_MATH_SUCCESS = 0, /**< No error */
ARM_MATH_ARGUMENT_ERROR = -1, /**< One or more arguments are incorrect */
ARM_MATH_LENGTH_ERROR = -2, /**< Length of data buffer is incorrect */
ARM_MATH_SIZE_MISMATCH = -3, /**< Size of matrices is not compatible with the operation */
ARM_MATH_NANINF = -4, /**< Not-a-number (NaN) or infinity is generated */
ARM_MATH_SINGULAR = -5, /**< Input matrix is singular and cannot be inverted */
ARM_MATH_TEST_FAILURE = -6 /**< Test Failed */
} arm_status;
#ifdef __cplusplus
}
#endif
#endif /*ifndef _ARM_MATH_TYPES_H_ */

View File

@ -1,699 +0,0 @@
/******************************************************************************
* @file basic_math_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _BASIC_MATH_FUNCTIONS_H_
#define _BASIC_MATH_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @defgroup groupMath Basic Math Functions
*/
/**
* @brief Q7 vector multiplication.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_mult_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Q15 vector multiplication.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_mult_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Q31 vector multiplication.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_mult_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Floating-point vector multiplication.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_mult_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize);
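/* Usage sketch (illustrative, not part of the original header): element-wise
 * product of two float vectors. */
static inline void mult_f32_example(void)
{
    const float32_t a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    const float32_t b[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
    float32_t       c[4];
    arm_mult_f32(a, b, c, 4U);   /* c = { 0.5f, 1.0f, 1.5f, 2.0f } */
}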
/**
* @brief Floating-point vector addition.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_add_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Q7 vector addition.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_add_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Q15 vector addition.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_add_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Q31 vector addition.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_add_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Floating-point vector subtraction.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_sub_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Q7 vector subtraction.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_sub_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Q15 vector subtraction.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_sub_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Q31 vector subtraction.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in each vector
*/
void arm_sub_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Multiplies a floating-point vector by a scalar.
* @param[in] pSrc points to the input vector
* @param[in] scale scale factor to be applied
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_scale_f32(
const float32_t * pSrc,
float32_t scale,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Multiplies a Q7 vector by a scalar.
* @param[in] pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_scale_q7(
const q7_t * pSrc,
q7_t scaleFract,
int8_t shift,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Multiplies a Q15 vector by a scalar.
* @param[in] pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_scale_q15(
const q15_t * pSrc,
q15_t scaleFract,
int8_t shift,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Multiplies a Q31 vector by a scalar.
* @param[in] pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_scale_q31(
const q31_t * pSrc,
q31_t scaleFract,
int8_t shift,
q31_t * pDst,
uint32_t blockSize);
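/* Editorial note: for the fixed-point scale functions the effective scale
 * factor is scaleFract, interpreted in the vector's own fixed-point format,
 * multiplied by 2^shift. Usage sketch (illustrative, not part of the
 * original header): */
static inline void scale_q15_example(void)
{
    const q15_t src[2] = { 16384, -16384 };   /* 0.5 and -0.5 in Q15 */
    q15_t dst[2];
    arm_scale_q15(src, 16384, 0, dst, 2U);    /* scale by 0.5        */
    /* dst is approximately { 8192, -8192 }, i.e. 0.25 and -0.25 */
}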
/**
* @brief Q7 vector absolute value.
* @param[in] pSrc points to the input buffer
* @param[out] pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
*/
void arm_abs_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Floating-point vector absolute value.
* @param[in] pSrc points to the input buffer
* @param[out] pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
*/
void arm_abs_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Q15 vector absolute value.
* @param[in] pSrc points to the input buffer
* @param[out] pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
*/
void arm_abs_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Q31 vector absolute value.
* @param[in] pSrc points to the input buffer
* @param[out] pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
*/
void arm_abs_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Dot product of floating-point vectors.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] result output result returned here
*/
void arm_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t * result);
/**
* @brief Dot product of Q7 vectors.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] result output result returned here
*/
void arm_dot_prod_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
uint32_t blockSize,
q31_t * result);
/**
* @brief Dot product of Q15 vectors.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] result output result returned here
*/
void arm_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t blockSize,
q63_t * result);
/**
* @brief Dot product of Q31 vectors.
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] result output result returned here
*/
void arm_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t blockSize,
q63_t * result);
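/* Usage sketch (illustrative, not part of the original header): */
static inline float32_t dot_prod_f32_example(void)
{
    const float32_t a[3] = { 1.0f, 2.0f, 3.0f };
    const float32_t b[3] = { 4.0f, 5.0f, 6.0f };
    float32_t       result;
    arm_dot_prod_f32(a, b, 3U, &result);   /* result = 32.0f */
    return result;
}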
/**
* @brief Shifts the elements of a Q7 vector a specified number of bits.
* @param[in] pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_shift_q7(
const q7_t * pSrc,
int8_t shiftBits,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Shifts the elements of a Q15 vector a specified number of bits.
* @param[in] pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_shift_q15(
const q15_t * pSrc,
int8_t shiftBits,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Shifts the elements of a Q31 vector a specified number of bits.
* @param[in] pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_shift_q31(
const q31_t * pSrc,
int8_t shiftBits,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Adds a constant offset to a floating-point vector.
* @param[in] pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_offset_f32(
const float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Adds a constant offset to a Q7 vector.
* @param[in] pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_offset_q7(
const q7_t * pSrc,
q7_t offset,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Adds a constant offset to a Q15 vector.
* @param[in] pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_offset_q15(
const q15_t * pSrc,
q15_t offset,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Adds a constant offset to a Q31 vector.
* @param[in] pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_offset_q31(
const q31_t * pSrc,
q31_t offset,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Negates the elements of a floating-point vector.
* @param[in] pSrc points to the input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_negate_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Negates the elements of a Q7 vector.
* @param[in] pSrc points to the input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_negate_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Negates the elements of a Q15 vector.
* @param[in] pSrc points to the input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_negate_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Negates the elements of a Q31 vector.
* @param[in] pSrc points to the input vector
* @param[out] pDst points to the output vector
* @param[in] blockSize number of samples in the vector
*/
void arm_negate_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise AND of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_and_u16(
const uint16_t * pSrcA,
const uint16_t * pSrcB,
uint16_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise AND of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_and_u32(
const uint32_t * pSrcA,
const uint32_t * pSrcB,
uint32_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise AND of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_and_u8(
const uint8_t * pSrcA,
const uint8_t * pSrcB,
uint8_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise OR of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_or_u16(
const uint16_t * pSrcA,
const uint16_t * pSrcB,
uint16_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise OR of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_or_u32(
const uint32_t * pSrcA,
const uint32_t * pSrcB,
uint32_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise OR of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_or_u8(
const uint8_t * pSrcA,
const uint8_t * pSrcB,
uint8_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise NOT of a fixed-point vector.
* @param[in] pSrc points to input vector
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_not_u16(
const uint16_t * pSrc,
uint16_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise NOT of a fixed-point vector.
* @param[in] pSrc points to input vector
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_not_u32(
const uint32_t * pSrc,
uint32_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise NOT of a fixed-point vector.
* @param[in] pSrc points to input vector
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_not_u8(
const uint8_t * pSrc,
uint8_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise XOR of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_xor_u16(
const uint16_t * pSrcA,
const uint16_t * pSrcB,
uint16_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise XOR of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_xor_u32(
const uint32_t * pSrcA,
const uint32_t * pSrcB,
uint32_t * pDst,
uint32_t blockSize);
/**
* @brief Compute the logical bitwise XOR of two fixed-point vectors.
* @param[in] pSrcA points to input vector A
* @param[in] pSrcB points to input vector B
* @param[out] pDst points to output vector
* @param[in] blockSize number of samples in each vector
* @return none
*/
void arm_xor_u8(
const uint8_t * pSrcA,
const uint8_t * pSrcB,
uint8_t * pDst,
uint32_t blockSize);
#ifdef __cplusplus
}
#endif
#endif /* ifndef _BASIC_MATH_FUNCTIONS_H_ */

View File

@ -1,86 +0,0 @@
/******************************************************************************
* @file bayes_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _BAYES_FUNCTIONS_H_
#define _BAYES_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#include "dsp/statistics_functions.h"
/**
* @defgroup groupBayes Bayesian estimators
*
* Implements the naive Gaussian Bayes estimator.
* The training must be done with scikit-learn.
*
* The parameters can be easily
* generated from the scikit-learn object. Some examples are given in
* DSP/Testing/PatternGeneration/Bayes.py
*/
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief Instance structure for Naive Gaussian Bayesian estimator.
*/
typedef struct
{
uint32_t vectorDimension; /**< Dimension of vector space */
uint32_t numberOfClasses; /**< Number of different classes */
const float32_t *theta; /**< Mean values for the Gaussians */
const float32_t *sigma; /**< Variances for the Gaussians */
const float32_t *classPriors; /**< Class prior probabilities */
float32_t epsilon; /**< Additive value to variances */
} arm_gaussian_naive_bayes_instance_f32;
/**
* @brief Naive Gaussian Bayesian Estimator
*
* @param[in] S points to a naive bayes instance structure
* @param[in] in points to the elements of the input vector.
* @param[in] pBuffer points to a buffer of length numberOfClasses
* @return The predicted class
*
*/
uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
const float32_t * in,
float32_t *pBuffer);
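/*
 * Minimal usage sketch (not part of the original header). The parameter
 * arrays are placeholders standing in for values exported from a
 * scikit-learn GaussianNB model, as described above.
 */
static void bayes_example(void)
{
    static const float32_t theta[4]       = { 0.0f, 0.0f, 1.0f, 1.0f }; /* 2 classes x 2 dims */
    static const float32_t sigma[4]       = { 1.0f, 1.0f, 1.0f, 1.0f };
    static const float32_t classPriors[2] = { 0.5f, 0.5f };
    const arm_gaussian_naive_bayes_instance_f32 S = { 2, 2, theta, sigma, classPriors, 1e-6f };
    const float32_t in[2] = { 0.9f, 1.1f };
    float32_t buffer[2];                  /* scratch, length numberOfClasses */
    uint32_t predicted = arm_gaussian_naive_bayes_predict_f32(&S, in, buffer);
    (void)predicted;                      /* index of the most likely class */
}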
#ifdef __cplusplus
}
#endif
#endif /* ifndef _BAYES_FUNCTIONS_H_ */

View File

@ -1,294 +0,0 @@
/******************************************************************************
* @file complex_math_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _COMPLEX_MATH_FUNCTIONS_H_
#define _COMPLEX_MATH_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#include "dsp/fast_math_functions.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @defgroup groupCmplxMath Complex Math Functions
* This set of functions operates on complex data vectors.
* The data in the complex arrays is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* In the API functions, the number of samples in a complex array refers
* to the number of complex values; the array contains twice this number of
* real values.
*/
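/*
 * Illustrative sketch of the interleaved storage convention described
 * above (not part of the original header): numSamples counts complex
 * values, so an array of numSamples complex numbers holds
 * 2 * numSamples real values.
 */
static void cmplx_layout_example(void)
{
    const float32_t src[4] = { 3.0f, 4.0f, 1.0f, -1.0f }; /* (3 + 4i), (1 - 1i) */
    float32_t mag[2];
    arm_cmplx_mag_f32(src, mag, 2);       /* mag = { 5.0f, ~1.414f } */
}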
/**
* @brief Floating-point complex conjugate.
* @param[in] pSrc points to the input vector
* @param[out] pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
*/
void arm_cmplx_conj_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples);
/**
* @brief Q31 complex conjugate.
* @param[in] pSrc points to the input vector
* @param[out] pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
*/
void arm_cmplx_conj_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples);
/**
* @brief Q15 complex conjugate.
* @param[in] pSrc points to the input vector
* @param[out] pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
*/
void arm_cmplx_conj_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples);
/**
* @brief Floating-point complex magnitude squared
* @param[in] pSrc points to the complex input vector
* @param[out] pDst points to the real output vector
* @param[in] numSamples number of complex samples in the input vector
*/
void arm_cmplx_mag_squared_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples);
/**
* @brief Q31 complex magnitude squared
* @param[in] pSrc points to the complex input vector
* @param[out] pDst points to the real output vector
* @param[in] numSamples number of complex samples in the input vector
*/
void arm_cmplx_mag_squared_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples);
/**
* @brief Q15 complex magnitude squared
* @param[in] pSrc points to the complex input vector
* @param[out] pDst points to the real output vector
* @param[in] numSamples number of complex samples in the input vector
*/
void arm_cmplx_mag_squared_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples);
/**
* @brief Floating-point complex magnitude
* @param[in] pSrc points to the complex input vector
* @param[out] pDst points to the real output vector
* @param[in] numSamples number of complex samples in the input vector
*/
void arm_cmplx_mag_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples);
/**
* @brief Q31 complex magnitude
* @param[in] pSrc points to the complex input vector
* @param[out] pDst points to the real output vector
* @param[in] numSamples number of complex samples in the input vector
*/
void arm_cmplx_mag_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples);
/**
* @brief Q15 complex magnitude
* @param[in] pSrc points to the complex input vector
* @param[out] pDst points to the real output vector
* @param[in] numSamples number of complex samples in the input vector
*/
void arm_cmplx_mag_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples);
/**
* @brief Q15 complex dot product
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[in] numSamples number of complex samples in each vector
* @param[out] realResult real part of the result returned here
* @param[out] imagResult imaginary part of the result returned here
*/
void arm_cmplx_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t numSamples,
q31_t * realResult,
q31_t * imagResult);
/**
* @brief Q31 complex dot product
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[in] numSamples number of complex samples in each vector
* @param[out] realResult real part of the result returned here
* @param[out] imagResult imaginary part of the result returned here
*/
void arm_cmplx_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t numSamples,
q63_t * realResult,
q63_t * imagResult);
/**
* @brief Floating-point complex dot product
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[in] numSamples number of complex samples in each vector
* @param[out] realResult real part of the result returned here
* @param[out] imagResult imaginary part of the result returned here
*/
void arm_cmplx_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult);
/**
* @brief Q15 complex-by-real multiplication
* @param[in] pSrcCmplx points to the complex input vector
* @param[in] pSrcReal points to the real input vector
* @param[out] pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
*/
void arm_cmplx_mult_real_q15(
const q15_t * pSrcCmplx,
const q15_t * pSrcReal,
q15_t * pCmplxDst,
uint32_t numSamples);
/**
* @brief Q31 complex-by-real multiplication
* @param[in] pSrcCmplx points to the complex input vector
* @param[in] pSrcReal points to the real input vector
* @param[out] pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
*/
void arm_cmplx_mult_real_q31(
const q31_t * pSrcCmplx,
const q31_t * pSrcReal,
q31_t * pCmplxDst,
uint32_t numSamples);
/**
* @brief Floating-point complex-by-real multiplication
* @param[in] pSrcCmplx points to the complex input vector
* @param[in] pSrcReal points to the real input vector
* @param[out] pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
*/
void arm_cmplx_mult_real_f32(
const float32_t * pSrcCmplx,
const float32_t * pSrcReal,
float32_t * pCmplxDst,
uint32_t numSamples);
/**
* @brief Q15 complex-by-complex multiplication
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
*/
void arm_cmplx_mult_cmplx_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t numSamples);
/**
* @brief Q31 complex-by-complex multiplication
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
*/
void arm_cmplx_mult_cmplx_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t numSamples);
/**
* @brief Floating-point complex-by-complex multiplication
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[out] pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
*/
void arm_cmplx_mult_cmplx_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t numSamples);
#ifdef __cplusplus
}
#endif
#endif /* ifndef _COMPLEX_MATH_FUNCTIONS_H_ */

View File

@ -1,790 +0,0 @@
/******************************************************************************
* @file controller_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _CONTROLLER_FUNCTIONS_H_
#define _CONTROLLER_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief Macros required for SINE and COSINE Controller functions
*/
#define CONTROLLER_Q31_SHIFT (32 - 9)
/* 1.31(q31) Fixed value of 2/360 */
/* -1 to +1 is divided into 360 values so total spacing is (2/360) */
#define INPUT_SPACING 0xB60B61
/**
* @defgroup groupController Controller Functions
*/
/**
* @ingroup groupController
*/
/**
* @addtogroup SinCos
* @{
*/
/**
* @brief Floating-point sin_cos function.
* @param[in] theta input value in degrees
* @param[out] pSinVal points to the processed sine output.
* @param[out] pCosVal points to the processed cosine output.
*/
void arm_sin_cos_f32(
float32_t theta,
float32_t * pSinVal,
float32_t * pCosVal);
/**
* @brief Q31 sin_cos function.
* @param[in] theta scaled input value in degrees
* @param[out] pSinVal points to the processed sine output.
* @param[out] pCosVal points to the processed cosine output.
*/
void arm_sin_cos_q31(
q31_t theta,
q31_t * pSinVal,
q31_t * pCosVal);
/**
* @} end of SinCos group
*/
/**
* @ingroup groupController
*/
/**
* @defgroup PID PID Motor Control
*
* A Proportional Integral Derivative (PID) controller is a generic feedback control
* loop mechanism widely used in industrial control systems.
* A PID controller is the most commonly used type of feedback controller.
*
* This set of functions implements PID controllers
* for Q15, Q31, and floating-point data types. The functions operate on a single sample
* of data and each call to the function returns a single processed value.
* <code>S</code> points to an instance of the PID control data structure. <code>in</code>
* is the input sample value. The functions return the output value.
*
* \par Algorithm:
* <pre>
* y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]
* A0 = Kp + Ki + Kd
* A1 = (-Kp ) - (2 * Kd )
* A2 = Kd
* </pre>
*
* \par
* where \c Kp is the proportional constant, \c Ki is the integral constant, and \c Kd is the derivative constant
*
* \par
* \image html PID.gif "Proportional Integral Derivative Controller"
*
* \par
* The PID controller calculates an "error" value as the difference between
* the measured output and the reference input.
* The controller attempts to minimize the error by adjusting the process control inputs.
* The proportional value determines the reaction to the current error,
* the integral value determines the reaction based on the sum of recent errors,
* and the derivative value determines the reaction based on the rate at which the error has been changing.
*
* \par Instance Structure
* The Gains A0, A1, A2 and state variables for a PID controller are stored together in an instance data structure.
* A separate instance structure must be defined for each PID Controller.
* There are separate instance structure declarations for each of the 3 supported data types.
*
* \par Reset Functions
* There is also an associated reset function for each data type which clears the state array.
*
* \par Initialization Functions
* There is also an associated initialization function for each data type.
* The initialization function performs the following operations:
* - Initializes the Gains A0, A1, A2 from Kp,Ki, Kd gains.
* - Zeros out the values in the state buffer.
*
* \par
* The instance structure cannot be placed into a const data section, so it is recommended to use the initialization function.
*
* \par Fixed-Point Behavior
* Care must be taken when using the fixed-point versions of the PID Controller functions.
* In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
* Refer to the function specific documentation below for usage guidelines.
*/
/**
* @brief Instance structure for the Q15 PID Control.
*/
typedef struct
{
q15_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */
#if !defined (ARM_MATH_DSP)
q15_t A1; /**< The derived gain A1 = -Kp - 2Kd */
q15_t A2; /**< The derived gain A2 = Kd. */
#else
q31_t A1; /**< The derived gains A1 = -Kp - 2Kd and A2 = Kd packed into one 32-bit word (DSP extension). */
#endif
q15_t state[3]; /**< The state array of length 3. */
q15_t Kp; /**< The proportional gain. */
q15_t Ki; /**< The integral gain. */
q15_t Kd; /**< The derivative gain. */
} arm_pid_instance_q15;
/**
* @brief Instance structure for the Q31 PID Control.
*/
typedef struct
{
q31_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */
q31_t A1; /**< The derived gain, A1 = -Kp - 2Kd. */
q31_t A2; /**< The derived gain, A2 = Kd . */
q31_t state[3]; /**< The state array of length 3. */
q31_t Kp; /**< The proportional gain. */
q31_t Ki; /**< The integral gain. */
q31_t Kd; /**< The derivative gain. */
} arm_pid_instance_q31;
/**
* @brief Instance structure for the floating-point PID Control.
*/
typedef struct
{
float32_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */
float32_t A1; /**< The derived gain, A1 = -Kp - 2Kd. */
float32_t A2; /**< The derived gain, A2 = Kd . */
float32_t state[3]; /**< The state array of length 3. */
float32_t Kp; /**< The proportional gain. */
float32_t Ki; /**< The integral gain. */
float32_t Kd; /**< The derivative gain. */
} arm_pid_instance_f32;
/**
* @brief Initialization function for the floating-point PID Control.
* @param[in,out] S points to an instance of the PID structure.
* @param[in] resetStateFlag flag to reset the state. 0 = no change in state; 1 = reset the state.
*/
void arm_pid_init_f32(
arm_pid_instance_f32 * S,
int32_t resetStateFlag);
/**
* @brief Reset function for the floating-point PID Control.
* @param[in,out] S is an instance of the floating-point PID Control structure
*/
void arm_pid_reset_f32(
arm_pid_instance_f32 * S);
/**
* @brief Initialization function for the Q31 PID Control.
* @param[in,out] S points to an instance of the Q31 PID structure.
* @param[in] resetStateFlag flag to reset the state. 0 = no change in state; 1 = reset the state.
*/
void arm_pid_init_q31(
arm_pid_instance_q31 * S,
int32_t resetStateFlag);
/**
* @brief Reset function for the Q31 PID Control.
* @param[in,out] S points to an instance of the Q31 PID Control structure
*/
void arm_pid_reset_q31(
arm_pid_instance_q31 * S);
/**
* @brief Initialization function for the Q15 PID Control.
* @param[in,out] S points to an instance of the Q15 PID structure.
* @param[in] resetStateFlag flag to reset the state. 0 = no change in state; 1 = reset the state.
*/
void arm_pid_init_q15(
arm_pid_instance_q15 * S,
int32_t resetStateFlag);
/**
* @brief Reset function for the Q15 PID Control.
* @param[in,out] S points to an instance of the Q15 PID Control structure
*/
void arm_pid_reset_q15(
arm_pid_instance_q15 * S);
/**
* @addtogroup PID
* @{
*/
/**
* @brief Process function for the floating-point PID Control.
* @param[in,out] S is an instance of the floating-point PID Control structure
* @param[in] in input sample to process
* @return processed output sample.
*/
__STATIC_FORCEINLINE float32_t arm_pid_f32(
arm_pid_instance_f32 * S,
float32_t in)
{
float32_t out;
/* y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2] */
out = (S->A0 * in) +
(S->A1 * S->state[0]) + (S->A2 * S->state[1]) + (S->state[2]);
/* Update state */
S->state[1] = S->state[0];
S->state[0] = in;
S->state[2] = out;
/* return to application */
return (out);
}
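/*
 * Minimal usage sketch for the floating-point PID above (not part of
 * the original header); the gain values are placeholders.
 */
static void pid_example(void)
{
    arm_pid_instance_f32 S;
    S.Kp = 0.8f;                       /* placeholder gains */
    S.Ki = 0.2f;
    S.Kd = 0.1f;
    arm_pid_init_f32(&S, 1);           /* derive A0/A1/A2 and reset state */
    float32_t error = 1.0f - 0.0f;     /* setpoint minus measurement */
    float32_t command = arm_pid_f32(&S, error);
    (void)command;                     /* one processed control output */
}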
/**
@brief Process function for the Q31 PID Control.
@param[in,out] S points to an instance of the Q31 PID Control structure
@param[in] in input sample to process
@return processed output sample.
\par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
Thus, if the accumulator result overflows it wraps around rather than clipping.
In order to avoid overflows completely, the input signal must be scaled down by 2 bits as there are four additions.
After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
*/
__STATIC_FORCEINLINE q31_t arm_pid_q31(
arm_pid_instance_q31 * S,
q31_t in)
{
q63_t acc;
q31_t out;
/* acc = A0 * x[n] */
acc = (q63_t) S->A0 * in;
/* acc += A1 * x[n-1] */
acc += (q63_t) S->A1 * S->state[0];
/* acc += A2 * x[n-2] */
acc += (q63_t) S->A2 * S->state[1];
/* convert output to 1.31 format to add y[n-1] */
out = (q31_t) (acc >> 31U);
/* out += y[n-1] */
out += S->state[2];
/* Update state */
S->state[1] = S->state[0];
S->state[0] = in;
S->state[2] = out;
/* return to application */
return (out);
}
/**
@brief Process function for the Q15 PID Control.
@param[in,out] S points to an instance of the Q15 PID Control structure
@param[in] in input sample to process
@return processed output sample.
\par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
Both Gains and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
After all additions have been performed, the accumulator is truncated to 34.15 format by discarding the low 15 bits.
Lastly, the accumulator is saturated to yield a result in 1.15 format.
*/
__STATIC_FORCEINLINE q15_t arm_pid_q15(
arm_pid_instance_q15 * S,
q15_t in)
{
q63_t acc;
q15_t out;
#if defined (ARM_MATH_DSP)
/* Implementation of PID controller */
/* acc = A0 * x[n] */
acc = (q31_t) __SMUAD((uint32_t)S->A0, (uint32_t)in);
/* acc += A1 * x[n-1] + A2 * x[n-2] */
acc = (q63_t)__SMLALD((uint32_t)S->A1, (uint32_t)read_q15x2 (S->state), (uint64_t)acc);
#else
/* acc = A0 * x[n] */
acc = ((q31_t) S->A0) * in;
/* acc += A1 * x[n-1] + A2 * x[n-2] */
acc += (q31_t) S->A1 * S->state[0];
acc += (q31_t) S->A2 * S->state[1];
#endif
/* acc += y[n-1] */
acc += (q31_t) S->state[2] << 15;
/* saturate the output */
out = (q15_t) (__SSAT((q31_t)(acc >> 15), 16));
/* Update state */
S->state[1] = S->state[0];
S->state[0] = in;
S->state[2] = out;
/* return to application */
return (out);
}
/**
* @} end of PID group
*/
/**
* @ingroup groupController
*/
/**
* @defgroup park Vector Park Transform
*
* Forward Park transform converts the input two-coordinate vector to flux and torque components.
* The Park transform can be used to realize the transformation of the <code>Ialpha</code> and the <code>Ibeta</code> currents
* from the stationary to the moving reference frame and control the spatial relationship between
* the stator vector current and rotor flux vector.
* If we consider the d axis aligned with the rotor flux, the diagram below shows the
* current vector and the relationship between the two reference frames:
* \image html park.gif "Stator current space vector and its component in (a,b) and in the d,q rotating reference frame"
*
* The function operates on a single sample of data and each call to the function returns the processed output.
* The library provides separate functions for Q31 and floating-point data types.
* \par Algorithm
* \image html parkFormula.gif
* where <code>Ialpha</code> and <code>Ibeta</code> are the stator vector components,
* <code>pId</code> and <code>pIq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
* cosine and sine values of theta (rotor flux position).
* \par Fixed-Point Behavior
* Care must be taken when using the Q31 version of the Park transform.
* In particular, the overflow and saturation behavior of the accumulator used must be considered.
* Refer to the function specific documentation below for usage guidelines.
*/
/**
* @addtogroup park
* @{
*/
/**
* @brief Floating-point Park transform
* @param[in] Ialpha input two-phase vector coordinate alpha
* @param[in] Ibeta input two-phase vector coordinate beta
* @param[out] pId points to output rotor reference frame d
* @param[out] pIq points to output rotor reference frame q
* @param[in] sinVal sine value of rotation angle theta
* @param[in] cosVal cosine value of rotation angle theta
* @return none
*
* The function implements the forward Park transform.
*
*/
__STATIC_FORCEINLINE void arm_park_f32(
float32_t Ialpha,
float32_t Ibeta,
float32_t * pId,
float32_t * pIq,
float32_t sinVal,
float32_t cosVal)
{
/* Calculate pId using the equation, pId = Ialpha * cosVal + Ibeta * sinVal */
*pId = Ialpha * cosVal + Ibeta * sinVal;
/* Calculate pIq using the equation, pIq = - Ialpha * sinVal + Ibeta * cosVal */
*pIq = -Ialpha * sinVal + Ibeta * cosVal;
}
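/*
 * Usage sketch (not part of the original header): rotate
 * stationary-frame currents into the rotor reference frame.
 * arm_sin_cos_f32, declared above, takes theta in degrees.
 */
static void park_example(float32_t Ialpha, float32_t Ibeta, float32_t theta_deg)
{
    float32_t sinVal, cosVal, Id, Iq;
    arm_sin_cos_f32(theta_deg, &sinVal, &cosVal);
    arm_park_f32(Ialpha, Ibeta, &Id, &Iq, sinVal, cosVal);
    /* Id is the flux component, Iq the torque component */
    (void)Id; (void)Iq;
}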
/**
@brief Park transform for Q31 version
@param[in] Ialpha input two-phase vector coordinate alpha
@param[in] Ibeta input two-phase vector coordinate beta
@param[out] pId points to output rotor reference frame d
@param[out] pIq points to output rotor reference frame q
@param[in] sinVal sine value of rotation angle theta
@param[in] cosVal cosine value of rotation angle theta
@return none
\par Scaling and Overflow Behavior
The function is implemented using an internal 32-bit accumulator.
The accumulator maintains 1.31 format by truncating the lower 31 bits of the intermediate multiplication in 2.62 format.
There is saturation on the addition and subtraction, hence there is no risk of overflow.
*/
__STATIC_FORCEINLINE void arm_park_q31(
q31_t Ialpha,
q31_t Ibeta,
q31_t * pId,
q31_t * pIq,
q31_t sinVal,
q31_t cosVal)
{
q31_t product1, product2; /* Temporary variables used to store intermediate results */
q31_t product3, product4; /* Temporary variables used to store intermediate results */
/* Intermediate product is calculated by (Ialpha * cosVal) */
product1 = (q31_t) (((q63_t) (Ialpha) * (cosVal)) >> 31);
/* Intermediate product is calculated by (Ibeta * sinVal) */
product2 = (q31_t) (((q63_t) (Ibeta) * (sinVal)) >> 31);
/* Intermediate product is calculated by (Ialpha * sinVal) */
product3 = (q31_t) (((q63_t) (Ialpha) * (sinVal)) >> 31);
/* Intermediate product is calculated by (Ibeta * cosVal) */
product4 = (q31_t) (((q63_t) (Ibeta) * (cosVal)) >> 31);
/* Calculate pId by adding the two intermediate products 1 and 2 */
*pId = __QADD(product1, product2);
/* Calculate pIq by subtracting intermediate product 3 from product 4 */
*pIq = __QSUB(product4, product3);
}
/**
* @} end of park group
*/
/**
* @ingroup groupController
*/
/**
* @defgroup inv_park Vector Inverse Park transform
* Inverse Park transform converts the input flux and torque components to a two-coordinate vector.
*
* The function operates on a single sample of data and each call to the function returns the processed output.
* The library provides separate functions for Q31 and floating-point data types.
* \par Algorithm
* \image html parkInvFormula.gif
* where <code>pIalpha</code> and <code>pIbeta</code> are the stator vector components,
* <code>Id</code> and <code>Iq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
* cosine and sine values of theta (rotor flux position).
* \par Fixed-Point Behavior
* Care must be taken when using the Q31 version of the Park transform.
* In particular, the overflow and saturation behavior of the accumulator used must be considered.
* Refer to the function specific documentation below for usage guidelines.
*/
/**
* @addtogroup inv_park
* @{
*/
/**
* @brief Floating-point Inverse Park transform
* @param[in] Id input coordinate of rotor reference frame d
* @param[in] Iq input coordinate of rotor reference frame q
* @param[out] pIalpha points to output two-phase orthogonal vector axis alpha
* @param[out] pIbeta points to output two-phase orthogonal vector axis beta
* @param[in] sinVal sine value of rotation angle theta
* @param[in] cosVal cosine value of rotation angle theta
* @return none
*/
__STATIC_FORCEINLINE void arm_inv_park_f32(
float32_t Id,
float32_t Iq,
float32_t * pIalpha,
float32_t * pIbeta,
float32_t sinVal,
float32_t cosVal)
{
/* Calculate pIalpha using the equation, pIalpha = Id * cosVal - Iq * sinVal */
*pIalpha = Id * cosVal - Iq * sinVal;
/* Calculate pIbeta using the equation, pIbeta = Id * sinVal + Iq * cosVal */
*pIbeta = Id * sinVal + Iq * cosVal;
}
/**
@brief Inverse Park transform for Q31 version
@param[in] Id input coordinate of rotor reference frame d
@param[in] Iq input coordinate of rotor reference frame q
@param[out] pIalpha points to output two-phase orthogonal vector axis alpha
@param[out] pIbeta points to output two-phase orthogonal vector axis beta
@param[in] sinVal sine value of rotation angle theta
@param[in] cosVal cosine value of rotation angle theta
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 32-bit accumulator.
The accumulator maintains 1.31 format by truncating the lower 31 bits of the intermediate multiplication in 2.62 format.
There is saturation on the addition, hence there is no risk of overflow.
*/
__STATIC_FORCEINLINE void arm_inv_park_q31(
q31_t Id,
q31_t Iq,
q31_t * pIalpha,
q31_t * pIbeta,
q31_t sinVal,
q31_t cosVal)
{
q31_t product1, product2; /* Temporary variables used to store intermediate results */
q31_t product3, product4; /* Temporary variables used to store intermediate results */
/* Intermediate product is calculated by (Id * cosVal) */
product1 = (q31_t) (((q63_t) (Id) * (cosVal)) >> 31);
/* Intermediate product is calculated by (Iq * sinVal) */
product2 = (q31_t) (((q63_t) (Iq) * (sinVal)) >> 31);
/* Intermediate product is calculated by (Id * sinVal) */
product3 = (q31_t) (((q63_t) (Id) * (sinVal)) >> 31);
/* Intermediate product is calculated by (Iq * cosVal) */
product4 = (q31_t) (((q63_t) (Iq) * (cosVal)) >> 31);
/* Calculate pIalpha by using the two intermediate products 1 and 2 */
*pIalpha = __QSUB(product1, product2);
/* Calculate pIbeta by using the two intermediate products 3 and 4 */
*pIbeta = __QADD(product4, product3);
}
/**
* @} end of Inverse park group
*/
/**
* @ingroup groupController
*/
/**
* @defgroup clarke Vector Clarke Transform
* Forward Clarke transform converts the instantaneous stator phases into a two-coordinate time invariant vector.
* Generally the Clarke transform uses three-phase currents <code>Ia, Ib and Ic</code> to calculate currents
* in the two-phase orthogonal stator axis <code>Ialpha</code> and <code>Ibeta</code>.
* When <code>Ialpha</code> is superposed with <code>Ia</code> as shown in the figure below
* \image html clarke.gif Stator current space vector and its components in (a,b).
* and <code>Ia + Ib + Ic = 0</code>, <code>Ialpha</code> and <code>Ibeta</code>
* can be calculated using only <code>Ia</code> and <code>Ib</code>.
*
* The function operates on a single sample of data and each call to the function returns the processed output.
* The library provides separate functions for Q31 and floating-point data types.
* \par Algorithm
* \image html clarkeFormula.gif
* where <code>Ia</code> and <code>Ib</code> are the instantaneous stator phases and
* <code>pIalpha</code> and <code>pIbeta</code> are the two coordinates of time invariant vector.
* \par Fixed-Point Behavior
* Care must be taken when using the Q31 version of the Clarke transform.
* In particular, the overflow and saturation behavior of the accumulator used must be considered.
* Refer to the function specific documentation below for usage guidelines.
*/
/**
* @addtogroup clarke
* @{
*/
/**
*
* @brief Floating-point Clarke transform
* @param[in] Ia input three-phase coordinate <code>a</code>
* @param[in] Ib input three-phase coordinate <code>b</code>
* @param[out] pIalpha points to output two-phase orthogonal vector axis alpha
* @param[out] pIbeta points to output two-phase orthogonal vector axis beta
* @return none
*/
__STATIC_FORCEINLINE void arm_clarke_f32(
float32_t Ia,
float32_t Ib,
float32_t * pIalpha,
float32_t * pIbeta)
{
/* Calculate pIalpha using the equation, pIalpha = Ia */
*pIalpha = Ia;
/* Calculate pIbeta using the equation, pIbeta = (1/sqrt(3)) * Ia + (2/sqrt(3)) * Ib */
*pIbeta = (0.57735026919f * Ia + 1.15470053838f * Ib);
}
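/*
 * Usage sketch (not part of the original header): with balanced phases
 * (Ia + Ib + Ic = 0), only Ia and Ib are needed, as noted above.
 */
static void clarke_example(float32_t Ia, float32_t Ib)
{
    float32_t Ialpha, Ibeta;
    arm_clarke_f32(Ia, Ib, &Ialpha, &Ibeta);
    /* Ialpha/Ibeta typically feed the Park transform in a FOC loop */
    (void)Ialpha; (void)Ibeta;
}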
/**
@brief Clarke transform for Q31 version
@param[in] Ia input three-phase coordinate <code>a</code>
@param[in] Ib input three-phase coordinate <code>b</code>
@param[out] pIalpha points to output two-phase orthogonal vector axis alpha
@param[out] pIbeta points to output two-phase orthogonal vector axis beta
@return none
\par Scaling and Overflow Behavior
The function is implemented using an internal 32-bit accumulator.
The accumulator maintains 1.31 format by truncating the lower 31 bits of the intermediate multiplication in 2.62 format.
There is saturation on the addition, hence there is no risk of overflow.
*/
__STATIC_FORCEINLINE void arm_clarke_q31(
q31_t Ia,
q31_t Ib,
q31_t * pIalpha,
q31_t * pIbeta)
{
q31_t product1, product2; /* Temporary variables used to store intermediate results */
/* Calculating pIalpha from Ia by equation pIalpha = Ia */
*pIalpha = Ia;
/* Intermediate product is calculated by (1/(sqrt(3)) * Ia) */
product1 = (q31_t) (((q63_t) Ia * 0x24F34E8B) >> 30);
/* Intermediate product is calculated by (2/sqrt(3) * Ib) */
product2 = (q31_t) (((q63_t) Ib * 0x49E69D16) >> 30);
/* pIbeta is calculated by adding the intermediate products */
*pIbeta = __QADD(product1, product2);
}
/**
* @} end of clarke group
*/
/**
* @ingroup groupController
*/
/**
* @defgroup inv_clarke Vector Inverse Clarke Transform
* Inverse Clarke transform converts the two-coordinate time invariant vector into instantaneous stator phases.
*
* The function operates on a single sample of data and each call to the function returns the processed output.
* The library provides separate functions for Q31 and floating-point data types.
* \par Algorithm
* \image html clarkeInvFormula.gif
* where <code>pIa</code> and <code>pIb</code> are the instantaneous stator phases and
* <code>Ialpha</code> and <code>Ibeta</code> are the two coordinates of time invariant vector.
* \par Fixed-Point Behavior
* Care must be taken when using the Q31 version of the Clarke transform.
* In particular, the overflow and saturation behavior of the accumulator used must be considered.
* Refer to the function specific documentation below for usage guidelines.
*/
/**
* @addtogroup inv_clarke
* @{
*/
/**
* @brief Floating-point Inverse Clarke transform
* @param[in] Ialpha input two-phase orthogonal vector axis alpha
* @param[in] Ibeta input two-phase orthogonal vector axis beta
* @param[out] pIa points to output three-phase coordinate <code>a</code>
* @param[out] pIb points to output three-phase coordinate <code>b</code>
* @return none
*/
__STATIC_FORCEINLINE void arm_inv_clarke_f32(
float32_t Ialpha,
float32_t Ibeta,
float32_t * pIa,
float32_t * pIb)
{
/* Calculating pIa from Ialpha by equation pIa = Ialpha */
*pIa = Ialpha;
/* Calculating pIb from Ialpha and Ibeta by equation pIb = -(1/2) * Ialpha + (sqrt(3)/2) * Ibeta */
*pIb = -0.5f * Ialpha + 0.8660254039f * Ibeta;
}
/**
@brief Inverse Clarke transform for Q31 version
@param[in] Ialpha input two-phase orthogonal vector axis alpha
@param[in] Ibeta input two-phase orthogonal vector axis beta
@param[out] pIa points to output three-phase coordinate <code>a</code>
@param[out] pIb points to output three-phase coordinate <code>b</code>
@return none
\par Scaling and Overflow Behavior
The function is implemented using an internal 32-bit accumulator.
The accumulator maintains 1.31 format by truncating the lower 31 bits of the intermediate multiplication in 2.62 format.
There is saturation on the subtraction, hence there is no risk of overflow.
*/
__STATIC_FORCEINLINE void arm_inv_clarke_q31(
q31_t Ialpha,
q31_t Ibeta,
q31_t * pIa,
q31_t * pIb)
{
q31_t product1, product2; /* Temporary variables used to store intermediate results */
/* Calculating pIa from Ialpha by equation pIa = Ialpha */
*pIa = Ialpha;
/* Intermediate product is calculated by ((1/2) * Ialpha) */
product1 = (q31_t) (((q63_t) (Ialpha) * (0x40000000)) >> 31);
/* Intermediate product is calculated by ((sqrt(3)/2) * Ibeta) */
product2 = (q31_t) (((q63_t) (Ibeta) * (0x6ED9EBA1)) >> 31);
/* pIb is calculated by subtracting the products */
*pIb = __QSUB(product2, product1);
}
/**
* @} end of inv_clarke group
*/
#ifdef __cplusplus
}
#endif
#endif /* ifndef _CONTROLLER_FUNCTIONS_H_ */

View File

@ -1,296 +0,0 @@
/******************************************************************************
* @file distance_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _DISTANCE_FUNCTIONS_H_
#define _DISTANCE_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#include "dsp/statistics_functions.h"
#include "dsp/basic_math_functions.h"
#include "dsp/fast_math_functions.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @defgroup groupDistance Distance functions
*
* Distance functions for use with clustering algorithms.
* There are distance functions for float vectors and boolean vectors.
*
*/
/* 6.14 bug */
#if defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) && (__ARMCC_VERSION < 6150001)
__attribute__((weak)) float __powisf2(float a, int b);
#endif
/**
* @brief Euclidean distance between two vectors
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_euclidean_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
/**
* @brief Bray-Curtis distance between two vectors
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_braycurtis_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
/**
* @brief Canberra distance between two vectors
*
* This function may divide by zero when the samples pA[i] and pB[i] are both zero.
* The result of the computation will still be correct, so the division by
* zero may be ignored.
*
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
/**
* @brief Chebyshev distance between two vectors
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_chebyshev_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
/**
* @brief Cityblock (Manhattan) distance between two vectors
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_cityblock_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
/**
* @brief Correlation distance between two vectors
*
* The input vectors are modified in place!
*
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_correlation_distance_f32(float32_t *pA,float32_t *pB, uint32_t blockSize);
/**
* @brief Cosine distance between two vectors
*
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_cosine_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
/**
* @brief Jensen-Shannon distance between two vectors
*
* This function assumes that the elements of the second vector are > 0,
* and 0 only when the corresponding element of the first vector is 0.
* Otherwise the result of the computation does not make sense, and for
* speed reasons the cases returning NaN or Infinity are not managed.
*
* When the function computes x log (x / y) with x == 0 and y == 0,
* it will compute the right value (0) but a division by zero will
* occur and should be ignored in client code.
*
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_jensenshannon_distance_f32(const float32_t *pA,const float32_t *pB,uint32_t blockSize);
/**
* @brief Minkowski distance between two vectors
*
* @param[in] pA First vector
* @param[in] pB Second vector
* @param[in] order Norm order (>= 2)
* @param[in] blockSize vector length
* @return distance
*
*/
float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, int32_t order, uint32_t blockSize);
/**
* @brief Dice distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_dice_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
/**
* @brief Hamming distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_hamming_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
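/*
 * Sketch of the packed-boolean convention used by the functions above
 * (not part of the original header): each uint32_t word carries 32
 * booleans, and numberOfBools counts booleans rather than words.
 */
static void hamming_example(void)
{
    const uint32_t a[1] = { 0xF0F0F0F0U };
    const uint32_t b[1] = { 0xFF00FF00U };
    float32_t d = arm_hamming_distance(a, b, 32); /* distance over the 32 packed booleans */
    (void)d;
}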
/**
* @brief Jaccard distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_jaccard_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
/**
* @brief Kulsinski distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_kulsinski_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
/**
* @brief Rogers-Tanimoto distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_rogerstanimoto_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
/**
* @brief Russell-Rao distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_russellrao_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
/**
* @brief Sokal-Michener distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_sokalmichener_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
/**
* @brief Sokal-Sneath distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_sokalsneath_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
/**
* @brief Yule distance between two vectors
*
* @param[in] pA First vector of packed booleans
* @param[in] pB Second vector of packed booleans
* @param[in] numberOfBools Number of booleans
* @return distance
*
*/
float32_t arm_yule_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
#ifdef __cplusplus
}
#endif
#endif /* ifndef _DISTANCE_FUNCTIONS_H_ */

View File

@ -1,287 +0,0 @@
/******************************************************************************
* @file fast_math_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _FAST_MATH_FUNCTIONS_H_
#define _FAST_MATH_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief Macros required for SINE and COSINE Fast math approximations
*/
#define FAST_MATH_TABLE_SIZE 512
#define FAST_MATH_Q31_SHIFT (32 - 10)
#define FAST_MATH_Q15_SHIFT (16 - 10)
#ifndef PI
#define PI 3.14159265358979f
#endif
/**
* @defgroup groupFastMath Fast Math Functions
* This set of functions provides a fast approximation to sine, cosine, and square root.
* As compared to most of the other functions in the CMSIS math library, the fast math functions
* operate on individual values and not arrays.
* There are separate functions for Q15, Q31, and floating-point data.
*
*/
/**
* @ingroup groupFastMath
*/
/**
@addtogroup sin
@{
*/
/**
* @brief Fast approximation to the trigonometric sine function for floating-point data.
* @param[in] x input value in radians.
* @return sin(x).
*/
float32_t arm_sin_f32(
float32_t x);
/**
* @brief Fast approximation to the trigonometric sine function for Q31 data.
* @param[in] x Scaled input value in radians.
* @return sin(x).
*/
q31_t arm_sin_q31(
q31_t x);
/**
* @brief Fast approximation to the trigonometric sine function for Q15 data.
* @param[in] x Scaled input value in radians.
* @return sin(x).
*/
q15_t arm_sin_q15(
q15_t x);
/**
@} end of sin group
*/
/**
@addtogroup cos
@{
*/
/**
* @brief Fast approximation to the trigonometric cosine function for floating-point data.
* @param[in] x input value in radians.
* @return cos(x).
*/
float32_t arm_cos_f32(
float32_t x);
/**
* @brief Fast approximation to the trigonometric cosine function for Q31 data.
* @param[in] x Scaled input value in radians.
* @return cos(x).
*/
q31_t arm_cos_q31(
q31_t x);
/**
* @brief Fast approximation to the trigonometric cosine function for Q15 data.
* @param[in] x Scaled input value in radians.
* @return cos(x).
*/
q15_t arm_cos_q15(
q15_t x);
/**
@} end of cos group
*/
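/*
 * Usage sketch (not part of the original header): the floating-point
 * variants take plain radians.
 */
static void fast_trig_example(void)
{
    float32_t s = arm_sin_f32(PI / 6.0f);  /* ~0.5 */
    float32_t c = arm_cos_f32(PI / 3.0f);  /* ~0.5 */
    (void)s; (void)c;
}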
/**
@brief Floating-point vector of log values.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_vlog_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
@brief Floating-point vector of exp values.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_vexp_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @defgroup SQRT Square Root
*
* Computes the square root of a number.
* There are separate functions for Q15, Q31, and floating-point data types.
* The square root function is computed using the Newton-Raphson algorithm.
* This is an iterative algorithm of the form:
* <pre>
* x1 = x0 - f(x0)/f'(x0)
* </pre>
* where <code>x1</code> is the current estimate,
* <code>x0</code> is the previous estimate, and
* <code>f'(x0)</code> is the derivative of <code>f()</code> evaluated at <code>x0</code>.
* For the square root function, the algorithm reduces to:
* <pre>
* x0 = in/2 [initial guess]
* x1 = 1/2 * ( x0 + in / x0) [each iteration]
* </pre>
*/
/**
* @addtogroup SQRT
* @{
*/
/**
@brief Floating-point square root function.
@param[in] in input value
@param[out] pOut square root of input value
@return execution status
- \ref ARM_MATH_SUCCESS : input value is positive
- \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
*/
__STATIC_FORCEINLINE arm_status arm_sqrt_f32(
float32_t in,
float32_t * pOut)
{
if (in >= 0.0f)
{
#if defined ( __CC_ARM )
#if defined __TARGET_FPU_VFP
*pOut = __sqrtf(in);
#else
*pOut = sqrtf(in);
#endif
#elif defined ( __ICCARM__ )
#if defined __ARMVFP__
__ASM("VSQRT.F32 %0,%1" : "=t"(*pOut) : "t"(in));
#else
*pOut = sqrtf(in);
#endif
#else
*pOut = sqrtf(in);
#endif
return (ARM_MATH_SUCCESS);
}
else
{
*pOut = 0.0f;
return (ARM_MATH_ARGUMENT_ERROR);
}
}
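/*
 * Usage sketch (not part of the original header): check the status to
 * catch negative inputs, which also force *pOut to 0.
 */
static void sqrt_example(void)
{
    float32_t out;
    if (arm_sqrt_f32(2.0f, &out) == ARM_MATH_SUCCESS)
    {
        /* out ~= 1.41421f */
    }
}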
/**
@brief Q31 square root function.
@param[in] in input value. The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF
@param[out] pOut points to square root of input value
@return execution status
- \ref ARM_MATH_SUCCESS : input value is positive
- \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
*/
arm_status arm_sqrt_q31(
q31_t in,
q31_t * pOut);
/**
@brief Q15 square root function.
@param[in] in input value. The range of the input value is [0 +1) or 0x0000 to 0x7FFF
@param[out] pOut points to square root of input value
@return execution status
- \ref ARM_MATH_SUCCESS : input value is positive
- \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
*/
arm_status arm_sqrt_q15(
q15_t in,
q15_t * pOut);
/**
* @brief Vector Floating-point square root function.
* @param[in] pIn input vector.
* @param[out] pOut vector of square roots of input elements.
* @param[in] len length of input vector.
* @return none. A negative input element yields a zero in the
* corresponding output element.
*/
void arm_vsqrt_f32(
float32_t * pIn,
float32_t * pOut,
uint16_t len);
void arm_vsqrt_q31(
q31_t * pIn,
q31_t * pOut,
uint16_t len);
void arm_vsqrt_q15(
q15_t * pIn,
q15_t * pOut,
uint16_t len);
/**
* @} end of SQRT group
*/
#ifdef __cplusplus
}
#endif
#endif /* ifndef _FAST_MATH_FUNCTIONS_H_ */

View File

@ -1,318 +0,0 @@
/******************************************************************************
* @file interpolation_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _INTERPOLATION_FUNCTIONS_H_
#define _INTERPOLATION_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @defgroup groupInterpolation Interpolation Functions
* These functions perform 1- and 2-dimensional interpolation of data.
* Linear interpolation is used for 1-dimensional data and
* bilinear interpolation is used for 2-dimensional data.
*/
/**
* @brief Instance structure for the floating-point Linear Interpolate function.
*/
typedef struct
{
uint32_t nValues; /**< number of table values */
float32_t x1; /**< first x value */
float32_t xSpacing; /**< spacing between x values */
float32_t *pYData; /**< pointer to the table of Y values */
} arm_linear_interp_instance_f32;
/**
* @brief Instance structure for the floating-point bilinear interpolation function.
*/
typedef struct
{
uint16_t numRows; /**< number of rows in the data table. */
uint16_t numCols; /**< number of columns in the data table. */
float32_t *pData; /**< points to the data table. */
} arm_bilinear_interp_instance_f32;
/**
* @brief Instance structure for the Q31 bilinear interpolation function.
*/
typedef struct
{
uint16_t numRows; /**< number of rows in the data table. */
uint16_t numCols; /**< number of columns in the data table. */
q31_t *pData; /**< points to the data table. */
} arm_bilinear_interp_instance_q31;
/**
* @brief Instance structure for the Q15 bilinear interpolation function.
*/
typedef struct
{
uint16_t numRows; /**< number of rows in the data table. */
uint16_t numCols; /**< number of columns in the data table. */
q15_t *pData; /**< points to the data table. */
} arm_bilinear_interp_instance_q15;
/**
* @brief Instance structure for the Q7 bilinear interpolation function.
*/
typedef struct
{
uint16_t numRows; /**< number of rows in the data table. */
uint16_t numCols; /**< number of columns in the data table. */
q7_t *pData; /**< points to the data table. */
} arm_bilinear_interp_instance_q7;
/**
* @brief Enum for specifying the cubic spline type
*/
typedef enum
{
ARM_SPLINE_NATURAL = 0, /**< Natural spline */
ARM_SPLINE_PARABOLIC_RUNOUT = 1 /**< Parabolic runout spline */
} arm_spline_type;
/**
* @brief Instance structure for the floating-point cubic spline interpolation.
*/
typedef struct
{
arm_spline_type type; /**< Type (boundary conditions) */
const float32_t * x; /**< x values */
const float32_t * y; /**< y values */
uint32_t n_x; /**< Number of known data points */
float32_t * coeffs; /**< Coefficients buffer (b,c, and d) */
} arm_spline_instance_f32;
/**
* @ingroup groupInterpolation
*/
/**
* @addtogroup SplineInterpolate
* @{
*/
/**
* @brief Processing function for the floating-point cubic spline interpolation.
* @param[in] S points to an instance of the floating-point spline structure.
* @param[in] xq points to the x values of the interpolated data points.
* @param[out] pDst points to the block of output data.
* @param[in] blockSize number of samples of output data.
*/
void arm_spline_f32(
arm_spline_instance_f32 * S,
const float32_t * xq,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Initialization function for the floating-point cubic spline interpolation.
* @param[in,out] S points to an instance of the floating-point spline structure.
* @param[in] type type of cubic spline interpolation (boundary conditions)
* @param[in] x points to the x values of the known data points.
* @param[in] y points to the y values of the known data points.
* @param[in] n number of known data points.
* @param[in] coeffs coefficients array for b, c, and d
* @param[in] tempBuffer buffer array for internal computations
*/
void arm_spline_init_f32(
arm_spline_instance_f32 * S,
arm_spline_type type,
const float32_t * x,
const float32_t * y,
uint32_t n,
float32_t * coeffs,
float32_t * tempBuffer);
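/**
* \par Example
* A minimal init/process sketch (editorial illustration). The buffer sizes
* follow the CMSIS-DSP documentation: coeffs holds 3*(n-1) values and
* tempBuffer holds 2*n-1 values; treat them as assumptions to verify.
* <pre>
*     #define N 5
*     float32_t x[N] = {0, 1, 2, 3, 4};
*     float32_t y[N] = {0, 1, 4, 9, 16};          // samples of y = x^2
*     float32_t coeffs[3 * (N - 1)];
*     float32_t temp[2 * N - 1];
*     arm_spline_instance_f32 S;
*     arm_spline_init_f32(&S, ARM_SPLINE_NATURAL, x, y, N, coeffs, temp);
*     float32_t xq[2] = {0.5f, 2.5f}, out[2];
*     arm_spline_f32(&S, xq, out, 2);             // interpolated values
* </pre>
*/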
/**
* @} end of SplineInterpolate group
*/
/**
* @addtogroup LinearInterpolate
* @{
*/
/**
* @brief Process function for the floating-point Linear Interpolation Function.
* @param[in,out] S is an instance of the floating-point Linear Interpolation structure
* @param[in] x input sample to process
* @return y processed output sample.
*
*/
float32_t arm_linear_interp_f32(
arm_linear_interp_instance_f32 * S,
float32_t x);
/**
*
* @brief Process function for the Q31 Linear Interpolation Function.
* @param[in] pYData pointer to Q31 Linear Interpolation table
* @param[in] x input sample to process
* @param[in] nValues number of table values
* @return y processed output sample.
*
* \par
* Input sample <code>x</code> is in 12.20 format, i.e. 12 bits for the table index and 20 bits for the fractional part.
* This function supports a maximum table size of 2^12.
*
*/
q31_t arm_linear_interp_q31(
q31_t * pYData,
q31_t x,
uint32_t nValues);
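/**
* \par Example
* Constructing the 12.20 input (editorial sketch): to interpolate between
* table entries 3 and 4 at a fraction of 0.25, shift the index left by 20
* bits and add the fraction scaled by 2^20. pYData and nValues are assumed
* to describe an existing table.
* <pre>
*     uint32_t index = 3;
*     q31_t frac = (q31_t)(0.25f * 1048576.0f);   // 0.25 * 2^20
*     q31_t x = (q31_t)(index << 20) + frac;      // 12.20 format
*     q31_t y = arm_linear_interp_q31(pYData, x, nValues);
* </pre>
*/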
/**
*
* @brief Process function for the Q15 Linear Interpolation Function.
* @param[in] pYData pointer to Q15 Linear Interpolation table
* @param[in] x input sample to process
* @param[in] nValues number of table values
* @return y processed output sample.
*
* \par
* Input sample <code>x</code> is in 12.20 format, i.e. 12 bits for the table index and 20 bits for the fractional part.
* This function supports a maximum table size of 2^12.
*
*/
q15_t arm_linear_interp_q15(
q15_t * pYData,
q31_t x,
uint32_t nValues);
/**
*
* @brief Process function for the Q7 Linear Interpolation Function.
* @param[in] pYData pointer to Q7 Linear Interpolation table
* @param[in] x input sample to process
* @param[in] nValues number of table values
* @return y processed output sample.
*
* \par
* Input sample <code>x</code> is in 12.20 format, i.e. 12 bits for the table index and 20 bits for the fractional part.
* This function supports a maximum table size of 2^12.
*/
q7_t arm_linear_interp_q7(
q7_t * pYData,
q31_t x,
uint32_t nValues);
/**
* @} end of LinearInterpolate group
*/
/**
* @ingroup groupInterpolation
*/
/**
* @addtogroup BilinearInterpolate
* @{
*/
/**
* @brief Floating-point bilinear interpolation.
* @param[in,out] S points to an instance of the interpolation structure.
* @param[in] X interpolation coordinate.
* @param[in] Y interpolation coordinate.
* @return out interpolated value.
*/
float32_t arm_bilinear_interp_f32(
const arm_bilinear_interp_instance_f32 * S,
float32_t X,
float32_t Y);
/**
* @brief Q31 bilinear interpolation.
* @param[in,out] S points to an instance of the interpolation structure.
* @param[in] X interpolation coordinate in 12.20 format.
* @param[in] Y interpolation coordinate in 12.20 format.
* @return out interpolated value.
*/
q31_t arm_bilinear_interp_q31(
arm_bilinear_interp_instance_q31 * S,
q31_t X,
q31_t Y);
/**
* @brief Q15 bilinear interpolation.
* @param[in,out] S points to an instance of the interpolation structure.
* @param[in] X interpolation coordinate in 12.20 format.
* @param[in] Y interpolation coordinate in 12.20 format.
* @return out interpolated value.
*/
q15_t arm_bilinear_interp_q15(
arm_bilinear_interp_instance_q15 * S,
q31_t X,
q31_t Y);
/**
* @brief Q7 bilinear interpolation.
* @param[in,out] S points to an instance of the interpolation structure.
* @param[in] X interpolation coordinate in 12.20 format.
* @param[in] Y interpolation coordinate in 12.20 format.
* @return out interpolated value.
*/
q7_t arm_bilinear_interp_q7(
arm_bilinear_interp_instance_q7 * S,
q31_t X,
q31_t Y);
/**
* @} end of BilinearInterpolate group
*/
#ifdef __cplusplus
}
#endif
#endif /* ifndef _INTERPOLATION_FUNCTIONS_H_ */

View File

@@ -1,597 +0,0 @@
/******************************************************************************
* @file matrix_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _MATRIX_FUNCTIONS_H_
#define _MATRIX_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @defgroup groupMatrix Matrix Functions
*
* This set of functions provides basic matrix math operations.
* The functions operate on matrix data structures. For example,
* the type
* definition for the floating-point matrix structure is shown
* below:
* <pre>
* typedef struct
* {
* uint16_t numRows; // number of rows of the matrix.
* uint16_t numCols; // number of columns of the matrix.
* float32_t *pData; // points to the data of the matrix.
* } arm_matrix_instance_f32;
* </pre>
* There are similar definitions for Q15 and Q31 data types.
*
* The structure specifies the size of the matrix and then points to
* an array of data. The array is of size <code>numRows X numCols</code>
* and the values are arranged in row order. That is, the
* matrix element (i, j) is stored at:
* <pre>
* pData[i*numCols + j]
* </pre>
*
* \par Init Functions
* There is an associated initialization function for each type of matrix
* data structure.
* The initialization function sets the values of the internal structure fields.
* Refer to \ref arm_mat_init_f32(), \ref arm_mat_init_q31() and \ref arm_mat_init_q15()
* for floating-point, Q31 and Q15 types, respectively.
*
* \par
* Use of the initialization function is optional. However, if the initialization function is used,
* the instance structure cannot be placed into a const data section.
* To place the instance structure in a const data
* section, manually initialize the data structure. For example:
* <pre>
* <code>arm_matrix_instance_f32 S = {nRows, nColumns, pData};</code>
* <code>arm_matrix_instance_q31 S = {nRows, nColumns, pData};</code>
* <code>arm_matrix_instance_q15 S = {nRows, nColumns, pData};</code>
* </pre>
* where <code>nRows</code> specifies the number of rows, <code>nColumns</code>
* specifies the number of columns, and <code>pData</code> points to the
* data array.
*
* \par Size Checking
* By default all of the matrix functions perform size checking on the input and
* output matrices. For example, the matrix addition function verifies that the
* two input matrices and the output matrix all have the same number of rows and
* columns. If the size check fails the functions return:
* <pre>
* ARM_MATH_SIZE_MISMATCH
* </pre>
* Otherwise the functions return
* <pre>
* ARM_MATH_SUCCESS
* </pre>
* There is some overhead associated with this matrix size checking.
* The matrix size checking is enabled via the \#define
* <pre>
* ARM_MATH_MATRIX_CHECK
* </pre>
* within the library project settings. By default this macro is defined
* and size checking is enabled. Undefining this macro in the project
* settings eliminates the size checking, and the functions
* run a bit faster. With size checking disabled, the functions always
* return <code>ARM_MATH_SUCCESS</code>.
*/
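/**
* \par Example
* A minimal sketch of the row-major layout and manual initialization
* described above (editorial illustration):
* <pre>
*     float32_t data[2 * 3] = {1, 2, 3,
*                              4, 5, 6};          // element (i, j) at data[i*3 + j]
*     arm_matrix_instance_f32 A = {2, 3, data};   // usable in a const data section
*     // data[1*3 + 2] == 6 is element (1, 2) of A
* </pre>
*/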
/**
* @brief Instance structure for the floating-point matrix structure.
*/
typedef struct
{
uint16_t numRows; /**< number of rows of the matrix. */
uint16_t numCols; /**< number of columns of the matrix. */
float32_t *pData; /**< points to the data of the matrix. */
} arm_matrix_instance_f32;
/**
* @brief Instance structure for the double-precision floating-point matrix structure.
*/
typedef struct
{
uint16_t numRows; /**< number of rows of the matrix. */
uint16_t numCols; /**< number of columns of the matrix. */
float64_t *pData; /**< points to the data of the matrix. */
} arm_matrix_instance_f64;
/**
* @brief Instance structure for the Q7 matrix structure.
*/
typedef struct
{
uint16_t numRows; /**< number of rows of the matrix. */
uint16_t numCols; /**< number of columns of the matrix. */
q7_t *pData; /**< points to the data of the matrix. */
} arm_matrix_instance_q7;
/**
* @brief Instance structure for the Q15 matrix structure.
*/
typedef struct
{
uint16_t numRows; /**< number of rows of the matrix. */
uint16_t numCols; /**< number of columns of the matrix. */
q15_t *pData; /**< points to the data of the matrix. */
} arm_matrix_instance_q15;
/**
* @brief Instance structure for the Q31 matrix structure.
*/
typedef struct
{
uint16_t numRows; /**< number of rows of the matrix. */
uint16_t numCols; /**< number of columns of the matrix. */
q31_t *pData; /**< points to the data of the matrix. */
} arm_matrix_instance_q31;
/**
* @brief Floating-point matrix addition.
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst);
/**
* @brief Q15 matrix addition.
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst);
/**
* @brief Q31 matrix addition.
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_add_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst);
/**
* @brief Floating-point, complex, matrix multiplication.
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst);
/**
* @brief Q15, complex, matrix multiplication.
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch);
/**
* @brief Q31, complex, matrix multiplication.
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_cmplx_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst);
/**
* @brief Floating-point matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst);
/**
* @brief Floating-point complex matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_cmplx_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst);
/**
* @brief Q15 matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst);
/**
* @brief Q15 complex matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_cmplx_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst);
/**
* @brief Q7 matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_trans_q7(
const arm_matrix_instance_q7 * pSrc,
arm_matrix_instance_q7 * pDst);
/**
* @brief Q31 matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst);
/**
* @brief Q31 complex matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_cmplx_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst);
/**
* @brief Floating-point matrix multiplication
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst);
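/**
* \par Example
* A minimal multiplication sketch (editorial illustration) showing the
* size-checked return value described above:
* <pre>
*     float32_t a[2 * 3] = {1, 2, 3, 4, 5, 6};
*     float32_t b[3 * 2] = {1, 0, 0, 1, 1, 1};
*     float32_t c[2 * 2];
*     arm_matrix_instance_f32 A = {2, 3, a}, B = {3, 2, b}, C = {2, 2, c};
*     arm_status s = arm_mat_mult_f32(&A, &B, &C);
*     // s == ARM_MATH_SUCCESS; with incompatible dimensions (and
*     // ARM_MATH_MATRIX_CHECK defined) it would be ARM_MATH_SIZE_MISMATCH
* </pre>
*/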
/**
* @brief Floating-point matrix and vector multiplication
* @param[in] pSrcMat points to the input matrix structure
* @param[in] pVec points to vector
* @param[out] pDst points to output vector
*/
void arm_mat_vec_mult_f32(
const arm_matrix_instance_f32 *pSrcMat,
const float32_t *pVec,
float32_t *pDst);
/**
* @brief Q7 matrix multiplication
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @param[in] pState points to the array for storing intermediate results
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_q7(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst,
q7_t * pState);
/**
* @brief Q7 matrix and vector multiplication
* @param[in] pSrcMat points to the input matrix structure
* @param[in] pVec points to vector
* @param[out] pDst points to output vector
*/
void arm_mat_vec_mult_q7(
const arm_matrix_instance_q7 *pSrcMat,
const q7_t *pVec,
q7_t *pDst);
/**
* @brief Q15 matrix multiplication
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @param[in] pState points to the array for storing intermediate results
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState);
/**
* @brief Q15 matrix and vector multiplication
* @param[in] pSrcMat points to the input matrix structure
* @param[in] pVec points to vector
* @param[out] pDst points to output vector
*/
void arm_mat_vec_mult_q15(
const arm_matrix_instance_q15 *pSrcMat,
const q15_t *pVec,
q15_t *pDst);
/**
* @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @param[in] pState points to the array for storing intermediate results
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_fast_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState);
/**
* @brief Q31 matrix multiplication
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst);
/**
* @brief Q31 matrix and vector multiplication
* @param[in] pSrcMat points to the input matrix structure
* @param[in] pVec points to vector
* @param[out] pDst points to output vector
*/
void arm_mat_vec_mult_q31(
const arm_matrix_instance_q31 *pSrcMat,
const q31_t *pVec,
q31_t *pDst);
/**
* @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_fast_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst);
/**
* @brief Floating-point matrix subtraction
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst);
/**
* @brief Q15 matrix subtraction
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst);
/**
* @brief Q31 matrix subtraction
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_sub_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst);
/**
* @brief Floating-point matrix scaling.
* @param[in] pSrc points to the input matrix
* @param[in] scale scale factor
* @param[out] pDst points to the output matrix
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst);
/**
* @brief Q15 matrix scaling.
* @param[in] pSrc points to input matrix
* @param[in] scaleFract fractional portion of the scale factor
* @param[in] shift number of bits to shift the result by
* @param[out] pDst points to output matrix
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst);
/**
* @brief Q31 matrix scaling.
* @param[in] pSrc points to input matrix
* @param[in] scaleFract fractional portion of the scale factor
* @param[in] shift number of bits to shift the result by
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst);
/**
* @brief Q31 matrix initialization.
* @param[in,out] S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] pData points to the matrix data array.
*/
void arm_mat_init_q31(
arm_matrix_instance_q31 * S,
uint16_t nRows,
uint16_t nColumns,
q31_t * pData);
/**
* @brief Q15 matrix initialization.
* @param[in,out] S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] pData points to the matrix data array.
*/
void arm_mat_init_q15(
arm_matrix_instance_q15 * S,
uint16_t nRows,
uint16_t nColumns,
q15_t * pData);
/**
* @brief Floating-point matrix initialization.
* @param[in,out] S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] pData points to the matrix data array.
*/
void arm_mat_init_f32(
arm_matrix_instance_f32 * S,
uint16_t nRows,
uint16_t nColumns,
float32_t * pData);
/**
* @brief Floating-point matrix inverse.
* @param[in] src points to the instance of the input floating-point matrix structure.
* @param[out] dst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH if the dimensions do not match.
* If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
*/
arm_status arm_mat_inverse_f32(
const arm_matrix_instance_f32 * src,
arm_matrix_instance_f32 * dst);
/**
* @brief Floating-point matrix inverse.
* @param[in] src points to the instance of the input floating-point matrix structure.
* @param[out] dst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH if the dimensions do not match.
* If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
*/
arm_status arm_mat_inverse_f64(
const arm_matrix_instance_f64 * src,
arm_matrix_instance_f64 * dst);
#ifdef __cplusplus
}
#endif
#endif /* ifndef _MATRIX_FUNCTIONS_H_ */

View File

@@ -1,576 +0,0 @@
/******************************************************************************
* @file none.h
* @brief Intrinsics when no DSP extension is available
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
The definitions in this file make it possible to build some versions of
CMSIS-DSP on a core (an M0, for instance) or a host where the
DSP extension is not available.
Ideally, pure C versions would be used instead,
but those are not always available, or they use a restricted set
of intrinsics.
*/
#ifndef _NONE_H_
#define _NONE_H_
#include "arm_math_types.h"
#ifdef __cplusplus
extern "C"
{
#endif
/*
Normally these kinds of definitions live in a compiler file
in Core or Core_A.
The MSVC compiler is a special case. The goal here is very specific
to CMSIS-DSP: only to allow the use of this library from other
systems such as Python or MATLAB.
MSVC is not used to cross-compile to Arm, so having an MSVC
compiler file in Core or Core_A would not make sense.
*/
#if defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data)
{
if (data == 0U) { return 32U; }
uint32_t count = 0U;
uint32_t mask = 0x80000000U;
while ((data & mask) == 0U)
{
count += 1U;
mask = mask >> 1U;
}
return count;
}
__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
{
if ((sat >= 1U) && (sat <= 32U))
{
const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
const int32_t min = -1 - max ;
if (val > max)
{
return max;
}
else if (val < min)
{
return min;
}
}
return val;
}
__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
{
if (sat <= 31U)
{
const uint32_t max = ((1U << sat) - 1U);
if (val > (int32_t)max)
{
return max;
}
else if (val < 0)
{
return 0U;
}
}
return (uint32_t)val;
}
/**
\brief Rotate Right in unsigned value (32 bit)
\details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
\param [in] op1 Value to rotate
\param [in] op2 Number of Bits to rotate
\return Rotated value
*/
__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
{
op2 %= 32U;
if (op2 == 0U)
{
return op1;
}
return (op1 >> op2) | (op1 << (32U - op2));
}
#endif
/**
* @brief Clips Q63 to Q31 values.
*/
__STATIC_FORCEINLINE q31_t clip_q63_to_q31(
q63_t x)
{
return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
((0x7FFFFFFF ^ ((q31_t) (x >> 63)))) : (q31_t) x;
}
/**
* @brief Clips Q63 to Q15 values.
*/
__STATIC_FORCEINLINE q15_t clip_q63_to_q15(
q63_t x)
{
return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
((0x7FFF ^ ((q15_t) (x >> 63)))) : (q15_t) (x >> 15);
}
/**
* @brief Clips Q31 to Q7 values.
*/
__STATIC_FORCEINLINE q7_t clip_q31_to_q7(
q31_t x)
{
return ((q31_t) (x >> 24) != ((q31_t) x >> 23)) ?
((0x7F ^ ((q7_t) (x >> 31)))) : (q7_t) x;
}
/**
* @brief Clips Q31 to Q15 values.
*/
__STATIC_FORCEINLINE q15_t clip_q31_to_q15(
q31_t x)
{
return ((q31_t) (x >> 16) != ((q31_t) x >> 15)) ?
((0x7FFF ^ ((q15_t) (x >> 31)))) : (q15_t) x;
}
/**
* @brief Multiplies 32 X 64 and returns 32 bit result in 2.30 format.
*/
__STATIC_FORCEINLINE q63_t mult32x64(
q63_t x,
q31_t y)
{
return ((((q63_t) (x & 0x00000000FFFFFFFF) * y) >> 32) +
(((q63_t) (x >> 32) * y) ) );
}
/* SMMLAR */
#define multAcc_32x32_keep32_R(a, x, y) \
a = (q31_t) (((((q63_t) a) << 32) + ((q63_t) x * y) + 0x80000000LL ) >> 32)
/* SMMLSR */
#define multSub_32x32_keep32_R(a, x, y) \
a = (q31_t) (((((q63_t) a) << 32) - ((q63_t) x * y) + 0x80000000LL ) >> 32)
/* SMMULR */
#define mult_32x32_keep32_R(a, x, y) \
a = (q31_t) (((q63_t) x * y + 0x80000000LL ) >> 32)
/* SMMLA */
#define multAcc_32x32_keep32(a, x, y) \
a += (q31_t) (((q63_t) x * y) >> 32)
/* SMMLS */
#define multSub_32x32_keep32(a, x, y) \
a -= (q31_t) (((q63_t) x * y) >> 32)
/* SMMUL */
#define mult_32x32_keep32(a, x, y) \
a = (q31_t) (((q63_t) x * y ) >> 32)
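/*
  Editorial sketch: these macros emulate the Cortex-M SMMLA/SMMUL instruction
  family in plain C. For example, mult_32x32_keep32_R keeps the rounded high
  32 bits of the 64-bit product:

      q31_t a;
      mult_32x32_keep32_R(a, x, y);   // a = (q31_t)(((q63_t)x * y + 0x80000000LL) >> 32)
*/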
#ifndef ARM_MATH_DSP
/**
* @brief definition to pack two 16 bit values.
*/
#define __PKHBT(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) << 0) & (int32_t)0x0000FFFF) | \
(((int32_t)(ARG2) << ARG3) & (int32_t)0xFFFF0000) )
#define __PKHTB(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) << 0) & (int32_t)0xFFFF0000) | \
(((int32_t)(ARG2) >> ARG3) & (int32_t)0x0000FFFF) )
#endif
/**
* @brief definition to pack four 8 bit values.
*/
#ifndef ARM_MATH_BIG_ENDIAN
#define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) << 0) & (int32_t)0x000000FF) | \
(((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \
(((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \
(((int32_t)(v3) << 24) & (int32_t)0xFF000000) )
#else
#define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v3) << 0) & (int32_t)0x000000FF) | \
(((int32_t)(v2) << 8) & (int32_t)0x0000FF00) | \
(((int32_t)(v1) << 16) & (int32_t)0x00FF0000) | \
(((int32_t)(v0) << 24) & (int32_t)0xFF000000) )
#endif
/*
* @brief C custom defined intrinsic functions
*/
#if !defined (ARM_MATH_DSP)
/*
* @brief C custom defined QADD8
*/
__STATIC_FORCEINLINE uint32_t __QADD8(
uint32_t x,
uint32_t y)
{
q31_t r, s, t, u;
r = __SSAT(((((q31_t)x << 24) >> 24) + (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF;
s = __SSAT(((((q31_t)x << 16) >> 24) + (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF;
t = __SSAT(((((q31_t)x << 8) >> 24) + (((q31_t)y << 8) >> 24)), 8) & (int32_t)0x000000FF;
u = __SSAT(((((q31_t)x ) >> 24) + (((q31_t)y ) >> 24)), 8) & (int32_t)0x000000FF;
return ((uint32_t)((u << 24) | (t << 16) | (s << 8) | (r )));
}
/*
* @brief C custom defined QSUB8
*/
__STATIC_FORCEINLINE uint32_t __QSUB8(
uint32_t x,
uint32_t y)
{
q31_t r, s, t, u;
r = __SSAT(((((q31_t)x << 24) >> 24) - (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF;
s = __SSAT(((((q31_t)x << 16) >> 24) - (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF;
t = __SSAT(((((q31_t)x << 8) >> 24) - (((q31_t)y << 8) >> 24)), 8) & (int32_t)0x000000FF;
u = __SSAT(((((q31_t)x ) >> 24) - (((q31_t)y ) >> 24)), 8) & (int32_t)0x000000FF;
return ((uint32_t)((u << 24) | (t << 16) | (s << 8) | (r )));
}
/*
* @brief C custom defined QADD16
*/
__STATIC_FORCEINLINE uint32_t __QADD16(
uint32_t x,
uint32_t y)
{
/* q31_t r, s; without initialisation 'arm_offset_q15 test' fails but 'intrinsic' tests pass! for armCC */
q31_t r = 0, s = 0;
r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
s = __SSAT(((((q31_t)x ) >> 16) + (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF;
return ((uint32_t)((s << 16) | (r )));
}
/*
* @brief C custom defined SHADD16
*/
__STATIC_FORCEINLINE uint32_t __SHADD16(
uint32_t x,
uint32_t y)
{
q31_t r, s;
r = (((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
s = (((((q31_t)x ) >> 16) + (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
return ((uint32_t)((s << 16) | (r )));
}
/*
* @brief C custom defined QSUB16
*/
__STATIC_FORCEINLINE uint32_t __QSUB16(
uint32_t x,
uint32_t y)
{
q31_t r, s;
r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
s = __SSAT(((((q31_t)x ) >> 16) - (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF;
return ((uint32_t)((s << 16) | (r )));
}
/*
* @brief C custom defined SHSUB16
*/
__STATIC_FORCEINLINE uint32_t __SHSUB16(
uint32_t x,
uint32_t y)
{
q31_t r, s;
r = (((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
s = (((((q31_t)x ) >> 16) - (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
return ((uint32_t)((s << 16) | (r )));
}
/*
* @brief C custom defined QASX
*/
__STATIC_FORCEINLINE uint32_t __QASX(
uint32_t x,
uint32_t y)
{
q31_t r, s;
r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF;
s = __SSAT(((((q31_t)x ) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
return ((uint32_t)((s << 16) | (r )));
}
/*
* @brief C custom defined SHASX
*/
__STATIC_FORCEINLINE uint32_t __SHASX(
uint32_t x,
uint32_t y)
{
q31_t r, s;
r = (((((q31_t)x << 16) >> 16) - (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
s = (((((q31_t)x ) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
return ((uint32_t)((s << 16) | (r )));
}
/*
* @brief C custom defined QSAX
*/
__STATIC_FORCEINLINE uint32_t __QSAX(
uint32_t x,
uint32_t y)
{
q31_t r, s;
r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF;
s = __SSAT(((((q31_t)x ) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
return ((uint32_t)((s << 16) | (r )));
}
/*
* @brief C custom defined SHSAX
*/
__STATIC_FORCEINLINE uint32_t __SHSAX(
uint32_t x,
uint32_t y)
{
q31_t r, s;
r = (((((q31_t)x << 16) >> 16) + (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
s = (((((q31_t)x ) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
return ((uint32_t)((s << 16) | (r )));
}
/*
* @brief C custom defined SMUSDX
*/
__STATIC_FORCEINLINE uint32_t __SMUSDX(
uint32_t x,
uint32_t y)
{
return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) -
((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) ));
}
/*
* @brief C custom defined SMUADX
*/
__STATIC_FORCEINLINE uint32_t __SMUADX(
uint32_t x,
uint32_t y)
{
return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) +
((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) ));
}
/*
* @brief C custom defined QADD
*/
__STATIC_FORCEINLINE int32_t __QADD(
int32_t x,
int32_t y)
{
return ((int32_t)(clip_q63_to_q31((q63_t)x + (q31_t)y)));
}
/*
* @brief C custom defined QSUB
*/
__STATIC_FORCEINLINE int32_t __QSUB(
int32_t x,
int32_t y)
{
return ((int32_t)(clip_q63_to_q31((q63_t)x - (q31_t)y)));
}
/*
* @brief C custom defined SMLAD
*/
__STATIC_FORCEINLINE uint32_t __SMLAD(
uint32_t x,
uint32_t y,
uint32_t sum)
{
return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
((((q31_t)x ) >> 16) * (((q31_t)y ) >> 16)) +
( ((q31_t)sum ) ) ));
}
/*
* @brief C custom defined SMLADX
*/
__STATIC_FORCEINLINE uint32_t __SMLADX(
uint32_t x,
uint32_t y,
uint32_t sum)
{
return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) +
((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) +
( ((q31_t)sum ) ) ));
}
/*
* @brief C custom defined SMLSDX
*/
__STATIC_FORCEINLINE uint32_t __SMLSDX(
uint32_t x,
uint32_t y,
uint32_t sum)
{
return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) -
((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) +
( ((q31_t)sum ) ) ));
}
/*
* @brief C custom defined SMLALD
*/
__STATIC_FORCEINLINE uint64_t __SMLALD(
uint32_t x,
uint32_t y,
uint64_t sum)
{
/* return (sum + ((q15_t) (x >> 16) * (q15_t) (y >> 16)) + ((q15_t) x * (q15_t) y)); */
return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
((((q31_t)x ) >> 16) * (((q31_t)y ) >> 16)) +
( ((q63_t)sum ) ) ));
}
/*
* @brief C custom defined SMLALDX
*/
__STATIC_FORCEINLINE uint64_t __SMLALDX(
uint32_t x,
uint32_t y,
uint64_t sum)
{
/* return (sum + ((q15_t) (x >> 16) * (q15_t) y)) + ((q15_t) x * (q15_t) (y >> 16)); */
return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) +
((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) +
( ((q63_t)sum ) ) ));
}
/*
* @brief C custom defined SMUAD
*/
__STATIC_FORCEINLINE uint32_t __SMUAD(
uint32_t x,
uint32_t y)
{
return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
((((q31_t)x ) >> 16) * (((q31_t)y ) >> 16)) ));
}
/*
* @brief C custom defined SMUSD
*/
__STATIC_FORCEINLINE uint32_t __SMUSD(
uint32_t x,
uint32_t y)
{
return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) -
((((q31_t)x ) >> 16) * (((q31_t)y ) >> 16)) ));
}
/*
* @brief C custom defined SXTB16
*/
__STATIC_FORCEINLINE uint32_t __SXTB16(
uint32_t x)
{
return ((uint32_t)(((((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF) |
((((q31_t)x << 8) >> 8) & (q31_t)0xFFFF0000) ));
}
/*
* @brief C custom defined SMMLA
*/
__STATIC_FORCEINLINE int32_t __SMMLA(
int32_t x,
int32_t y,
int32_t sum)
{
return (sum + (int32_t) (((int64_t) x * y) >> 32));
}
#endif /* !defined (ARM_MATH_DSP) */
#ifdef __cplusplus
}
#endif
#endif /* ifndef _NONE_H_ */

View File

@@ -1,483 +0,0 @@
/******************************************************************************
* @file statistics_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _STATISTICS_FUNCTIONS_H_
#define _STATISTICS_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#include "dsp/basic_math_functions.h"
#include "dsp/fast_math_functions.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @defgroup groupStats Statistics Functions
*/
/**
* @brief Computation of the LogSumExp
*
* In probabilistic computations, the dynamic range of the probability values can be very
* wide because they come from Gaussian functions.
* To avoid underflow and overflow issues, the values are represented by their logs.
* In this representation, multiplying the original values is easy: their logs are added.
* Adding the original values, however, requires some special handling, and that is the
* goal of the LogSumExp function.
*
* If the values are x1...xn, the function computes:
*
* ln(exp(x1) + ... + exp(xn)), and the computation is done in such a way that
* rounding issues are minimised.
*
* The maximum xm of the values is extracted, and the function computes:
* xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
*
* @param[in] *in Pointer to an array of input values.
* @param[in] blockSize Number of samples in the input array.
* @return LogSumExp
*
*/
float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize);
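/**
* \par
* A non-vectorized reference sketch of the computation described above
* (editorial illustration, not the library implementation; assumes expf and
* logf from <math.h>):
* <pre>
*     float32_t logsumexp_ref(const float32_t *in, uint32_t n)
*     {
*         float32_t xm = in[0], acc = 0.0f;
*         for (uint32_t i = 1; i < n; i++)        // extract the max xm
*             if (in[i] > xm) xm = in[i];
*         for (uint32_t i = 0; i < n; i++)        // sum shifted exponentials
*             acc += expf(in[i] - xm);
*         return xm + logf(acc);                  // xm + ln(sum exp(xi - xm))
*     }
* </pre>
*/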
/**
* @brief Dot product with log arithmetic
*
* The vectors contain the logs of the samples.
*
* @param[in] pSrcA points to the first input vector
* @param[in] pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[in] pTmpBuffer temporary buffer of length blockSize
* @return The log of the dot product.
*
*/
float32_t arm_logsumexp_dot_prod_f32(const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t *pTmpBuffer);
/**
* @brief Entropy
*
* @param[in] pSrcA Array of input values.
* @param[in] blockSize Number of samples in the input array.
* @return Entropy -Sum(p ln p)
*
*/
float32_t arm_entropy_f32(const float32_t * pSrcA, uint32_t blockSize);
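/**
* \par
* Worked example (editorial): for a uniform distribution over four outcomes,
* p = {0.25, 0.25, 0.25, 0.25}, the entropy is
* -4 * 0.25 * ln(0.25) = ln(4), approximately 1.386 nats.
*/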
/**
* @brief Entropy
*
* @param[in] pSrcA Array of input values.
* @param[in] blockSize Number of samples in the input array.
* @return Entropy -Sum(p ln p)
*
*/
float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize);
/**
* @brief Kullback-Leibler
*
* @param[in] pSrcA Pointer to an array of input values for probability distribution A.
* @param[in] pSrcB Pointer to an array of input values for probability distribution B.
* @param[in] blockSize Number of samples in the input array.
* @return Kullback-Leibler Divergence D(A || B)
*
*/
float32_t arm_kullback_leibler_f32(const float32_t * pSrcA
,const float32_t * pSrcB
,uint32_t blockSize);
/**
* @brief Kullback-Leibler
*
* @param[in] pSrcA Pointer to an array of input values for probability distribution A.
* @param[in] pSrcB Pointer to an array of input values for probability distribution B.
* @param[in] blockSize Number of samples in the input array.
* @return Kullback-Leibler Divergence D(A || B)
*
*/
float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize);
/**
* @brief Sum of the squares of the elements of a Q31 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_power_q31(
const q31_t * pSrc,
uint32_t blockSize,
q63_t * pResult);
/**
* @brief Sum of the squares of the elements of a floating-point vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_power_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult);
/**
* @brief Sum of the squares of the elements of a Q15 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_power_q15(
const q15_t * pSrc,
uint32_t blockSize,
q63_t * pResult);
/**
* @brief Sum of the squares of the elements of a Q7 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_power_q7(
const q7_t * pSrc,
uint32_t blockSize,
q31_t * pResult);
/**
* @brief Mean value of a Q7 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_mean_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult);
/**
* @brief Mean value of a Q15 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_mean_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult);
/**
* @brief Mean value of a Q31 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_mean_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult);
/**
* @brief Mean value of a floating-point vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_mean_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult);
/**
* @brief Variance of the elements of a floating-point vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_var_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult);
/**
* @brief Variance of the elements of a Q31 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_var_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult);
/**
* @brief Variance of the elements of a Q15 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_var_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult);
/**
* @brief Root Mean Square of the elements of a floating-point vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_rms_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult);
/**
* @brief Root Mean Square of the elements of a Q31 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_rms_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult);
/**
* @brief Root Mean Square of the elements of a Q15 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_rms_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult);
/**
* @brief Standard deviation of the elements of a floating-point vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_std_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult);
/**
* @brief Standard deviation of the elements of a Q31 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_std_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult);
/**
* @brief Standard deviation of the elements of a Q15 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output value.
*/
void arm_std_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult);
/**
* @brief Minimum value of a Q7 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] result is output pointer
* @param[out] index is the array index of the minimum value in the input buffer.
*/
void arm_min_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * result,
uint32_t * index);
/**
* @brief Minimum value of a Q15 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output pointer
* @param[out] pIndex is the array index of the minimum value in the input buffer.
*/
void arm_min_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex);
/**
* @brief Minimum value of a Q31 vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output pointer
* @param[out] pIndex is the array index of the minimum value in the input buffer.
*/
void arm_min_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex);
/**
* @brief Minimum value of a floating-point vector.
* @param[in] pSrc is input pointer
* @param[in] blockSize is the number of samples to process
* @param[out] pResult is output pointer
* @param[out] pIndex is the array index of the minimum value in the input buffer.
*/
void arm_min_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex);
/**
* @brief Maximum value of a Q7 vector.
* @param[in] pSrc points to the input buffer
* @param[in] blockSize length of the input vector
* @param[out] pResult maximum value returned here
* @param[out] pIndex index of maximum value returned here
*/
void arm_max_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex);
/**
* @brief Maximum value of a Q15 vector.
* @param[in] pSrc points to the input buffer
* @param[in] blockSize length of the input vector
* @param[out] pResult maximum value returned here
* @param[out] pIndex index of maximum value returned here
*/
void arm_max_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex);
/**
* @brief Maximum value of a Q31 vector.
* @param[in] pSrc points to the input buffer
* @param[in] blockSize length of the input vector
* @param[out] pResult maximum value returned here
* @param[out] pIndex index of maximum value returned here
*/
void arm_max_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex);
/**
* @brief Maximum value of a floating-point vector.
* @param[in] pSrc points to the input buffer
* @param[in] blockSize length of the input vector
* @param[out] pResult maximum value returned here
* @param[out] pIndex index of maximum value returned here
*/
void arm_max_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex);
/**
@brief Maximum value of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@return none
*/
void arm_max_no_idx_f32(
const float32_t *pSrc,
uint32_t blockSize,
float32_t *pResult);
#ifdef __cplusplus
}
#endif
#endif /* ifndef _STATISTICS_FUNCTIONS_H_ */

View File

@@ -1,426 +0,0 @@
/******************************************************************************
* @file support_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _SUPPORT_FUNCTIONS_H_
#define _SUPPORT_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @defgroup groupSupport Support Functions
*/
/**
* @brief Converts the elements of the floating-point vector to Q31 vector.
* @param[in] pSrc points to the floating-point input vector
* @param[out] pDst points to the Q31 output vector
* @param[in] blockSize length of the input vector
*/
void arm_float_to_q31(
const float32_t * pSrc,
q31_t * pDst,
uint32_t blockSize);
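/**
* \par
* Editorial note: values in [-1.0, +1.0) map onto the full Q31 range, so
* 0.5f converts to 0x40000000 (0.5 * 2^31); inputs outside that range are
* assumed to saturate.
*/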
/**
* @brief Converts the elements of the floating-point vector to Q15 vector.
* @param[in] pSrc points to the floating-point input vector
* @param[out] pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
*/
void arm_float_to_q15(
const float32_t * pSrc,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the floating-point vector to Q7 vector.
* @param[in] pSrc points to the floating-point input vector
* @param[out] pDst points to the Q7 output vector
* @param[in] blockSize length of the input vector
*/
void arm_float_to_q7(
const float32_t * pSrc,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q31 vector to floating-point vector.
* @param[in] pSrc is input pointer
* @param[out] pDst is output pointer
* @param[in] blockSize is the number of samples to process
*/
void arm_q31_to_float(
const q31_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q31 vector to Q15 vector.
* @param[in] pSrc is input pointer
* @param[out] pDst is output pointer
* @param[in] blockSize is the number of samples to process
*/
void arm_q31_to_q15(
const q31_t * pSrc,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q31 vector to Q7 vector.
* @param[in] pSrc is input pointer
* @param[out] pDst is output pointer
* @param[in] blockSize is the number of samples to process
*/
void arm_q31_to_q7(
const q31_t * pSrc,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q15 vector to floating-point vector.
* @param[in] pSrc is input pointer
* @param[out] pDst is output pointer
* @param[in] blockSize is the number of samples to process
*/
void arm_q15_to_float(
const q15_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q15 vector to Q31 vector.
* @param[in] pSrc is input pointer
* @param[out] pDst is output pointer
* @param[in] blockSize is the number of samples to process
*/
void arm_q15_to_q31(
const q15_t * pSrc,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q15 vector to Q7 vector.
* @param[in] pSrc is input pointer
* @param[out] pDst is output pointer
* @param[in] blockSize is the number of samples to process
*/
void arm_q15_to_q7(
const q15_t * pSrc,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q7 vector to floating-point vector.
* @param[in] pSrc is input pointer
* @param[out] pDst is output pointer
* @param[in] blockSize is the number of samples to process
*/
void arm_q7_to_float(
const q7_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q7 vector to Q31 vector.
* @param[in] pSrc input pointer
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_q7_to_q31(
const q7_t * pSrc,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Converts the elements of the Q7 vector to Q15 vector.
* @param[in] pSrc input pointer
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_q7_to_q15(
const q7_t * pSrc,
q15_t * pDst,
uint32_t blockSize);
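/*
 * Example (illustrative sketch): a float -> Q15 -> float round trip using the
 * conversion functions declared above. Buffer names and the length of 64 are
 * placeholders chosen for illustration.
 */
static void conversion_round_trip_example(const float32_t *in_f32,
                                          q15_t *work_q15,
                                          float32_t *out_f32)
{
    /* Quantize: floats in [-1, 1) map onto the full Q15 range; values
       outside that range saturate */
    arm_float_to_q15(in_f32, work_q15, 64);
    /* Dequantize back to floating point */
    arm_q15_to_float(work_q15, out_f32, 64);
}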
/**
 * @brief Enum for selecting the sorting algorithm
 */
typedef enum
{
ARM_SORT_BITONIC = 0, /**< Bitonic sort */
ARM_SORT_BUBBLE = 1, /**< Bubble sort */
ARM_SORT_HEAP = 2, /**< Heap sort */
ARM_SORT_INSERTION = 3, /**< Insertion sort */
ARM_SORT_QUICK = 4, /**< Quick sort */
ARM_SORT_SELECTION = 5 /**< Selection sort */
} arm_sort_alg;
/**
 * @brief Enum for specifying the sorting order (direction)
 */
typedef enum
{
ARM_SORT_DESCENDING = 0, /**< Descending order (9 to 0) */
ARM_SORT_ASCENDING = 1 /**< Ascending order (0 to 9) */
} arm_sort_dir;
/**
* @brief Instance structure for the sorting algorithms.
*/
typedef struct
{
arm_sort_alg alg; /**< Sorting algorithm selected */
arm_sort_dir dir; /**< Sorting order (direction) */
} arm_sort_instance_f32;
/**
* @param[in] S points to an instance of the sorting structure.
* @param[in] pSrc points to the block of input data.
* @param[out] pDst points to the block of output data.
* @param[in] blockSize number of samples to process.
*/
void arm_sort_f32(
const arm_sort_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @param[in,out] S points to an instance of the sorting structure.
* @param[in] alg Selected algorithm.
* @param[in] dir Sorting order.
*/
void arm_sort_init_f32(
arm_sort_instance_f32 * S,
arm_sort_alg alg,
arm_sort_dir dir);
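/*
 * Example (illustrative sketch): the instance/init/process pattern used by
 * the sorting API above. The algorithm and direction choices are arbitrary
 * placeholders.
 */
static void sort_usage_example(float32_t *src, float32_t *dst, uint32_t n)
{
    arm_sort_instance_f32 S;
    /* Select quick sort, ascending order */
    arm_sort_init_f32(&S, ARM_SORT_QUICK, ARM_SORT_ASCENDING);
    /* Sort n samples from src into dst */
    arm_sort_f32(&S, src, dst, n);
}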
/**
* @brief Instance structure for the sorting algorithms.
*/
typedef struct
{
arm_sort_dir dir; /**< Sorting order (direction) */
float32_t * buffer; /**< Working buffer */
} arm_merge_sort_instance_f32;
/**
* @param[in] S points to an instance of the sorting structure.
* @param[in,out] pSrc points to the block of input data.
* @param[out] pDst points to the block of output data
* @param[in] blockSize number of samples to process.
*/
void arm_merge_sort_f32(
const arm_merge_sort_instance_f32 * S,
float32_t *pSrc,
float32_t *pDst,
uint32_t blockSize);
/**
* @param[in,out] S points to an instance of the sorting structure.
* @param[in] dir Sorting order.
* @param[in] buffer Working buffer.
*/
void arm_merge_sort_init_f32(
arm_merge_sort_instance_f32 * S,
arm_sort_dir dir,
float32_t * buffer);
/**
* @brief Copies the elements of a floating-point vector.
* @param[in] pSrc input pointer
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_copy_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Copies the elements of a Q7 vector.
* @param[in] pSrc input pointer
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_copy_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Copies the elements of a Q15 vector.
* @param[in] pSrc input pointer
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_copy_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Copies the elements of a Q31 vector.
* @param[in] pSrc input pointer
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_copy_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Fills a constant value into a floating-point vector.
* @param[in] value input value to be filled
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_fill_f32(
float32_t value,
float32_t * pDst,
uint32_t blockSize);
/**
* @brief Fills a constant value into a Q7 vector.
* @param[in] value input value to be filled
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_fill_q7(
q7_t value,
q7_t * pDst,
uint32_t blockSize);
/**
* @brief Fills a constant value into a Q15 vector.
* @param[in] value input value to be filled
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_fill_q15(
q15_t value,
q15_t * pDst,
uint32_t blockSize);
/**
* @brief Fills a constant value into a Q31 vector.
* @param[in] value input value to be filled
* @param[out] pDst output pointer
* @param[in] blockSize number of samples to process
*/
void arm_fill_q31(
q31_t value,
q31_t * pDst,
uint32_t blockSize);
/**
* @brief Weighted sum
*
*
* @param[in] *in Array of input values.
 * @param[in] *weights Weights
* @param[in] blockSize Number of samples in the input array.
* @return Weighted sum
*
*/
float32_t arm_weighted_sum_f32(const float32_t *in
, const float32_t *weights
, uint32_t blockSize);
/**
* @brief Barycenter
*
*
* @param[in] in List of vectors
* @param[in] weights Weights of the vectors
* @param[out] out Barycenter
* @param[in] nbVectors Number of vectors
* @param[in] vecDim Dimension of space (vector dimension)
* @return None
*
*/
void arm_barycenter_f32(const float32_t *in
, const float32_t *weights
, float32_t *out
, uint32_t nbVectors
, uint32_t vecDim);
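/*
 * Example (illustrative sketch): combining the two statistics declared above.
 * All numeric values are placeholders. arm_weighted_sum_f32 reduces one
 * vector; arm_barycenter_f32 averages nbVectors weighted vectors of
 * dimension vecDim.
 */
static void weighted_stats_example(void)
{
    const float32_t samples[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    const float32_t weights[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    float32_t ws = arm_weighted_sum_f32(samples, weights, 4);

    /* Two 2-D vectors with per-vector weights; center receives their barycenter */
    const float32_t vectors[4] = {0.0f, 0.0f, 2.0f, 2.0f};
    const float32_t vec_weights[2] = {1.0f, 3.0f};
    float32_t center[2];
    arm_barycenter_f32(vectors, vec_weights, center, 2, 2);
    (void)ws;
}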
#ifdef __cplusplus
}
#endif
#endif /* ifndef _SUPPORT_FUNCTIONS_H_ */

View File

@ -1,42 +0,0 @@
/******************************************************************************
* @file svm_defines.h
* @brief Public header file for CMSIS DSP Library
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _SVM_DEFINES_H_
#define _SVM_DEFINES_H_
/**
* @brief Struct for specifying SVM Kernel
*/
typedef enum
{
ARM_ML_KERNEL_LINEAR = 0, /**< Linear kernel */
ARM_ML_KERNEL_POLYNOMIAL = 1, /**< Polynomial kernel */
ARM_ML_KERNEL_RBF = 2, /**< Radial Basis Function kernel */
ARM_ML_KERNEL_SIGMOID = 3 /**< Sigmoid kernel */
} arm_ml_kernel_type;
#endif /* ifndef _SVM_DEFINES_H_ */

View File

@ -1,298 +0,0 @@
/******************************************************************************
* @file svm_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _SVM_FUNCTIONS_H_
#define _SVM_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#include "dsp/svm_defines.h"
#ifdef __cplusplus
extern "C"
{
#endif
#define STEP(x) (x) <= 0 ? 0 : 1
/**
* @defgroup groupSVM SVM Functions
 * This set of functions implements SVM classification for 2 classes.
 * Training must be done with scikit-learn; the parameters can easily be
 * generated from the trained scikit-learn object. Some examples are given in
 * DSP/Testing/PatternGeneration/SVM.py
*
 * If more than 2 classes are needed, the functions in this folder
 * will have to be used as building blocks to do multi-class classification.
*
* No multi-class classification is provided in this SVM folder.
*
*/
/**
* @brief Integer exponentiation
* @param[in] x value
* @param[in] nb integer exponent >= 1
* @return x^nb
*
*/
__STATIC_INLINE float32_t arm_exponent_f32(float32_t x, int32_t nb)
{
float32_t r = x;
nb --;
while(nb > 0)
{
r = r * x;
nb--;
}
return(r);
}
/**
* @brief Instance structure for linear SVM prediction function.
*/
typedef struct
{
uint32_t nbOfSupportVectors; /**< Number of support vectors */
uint32_t vectorDimension; /**< Dimension of vector space */
float32_t intercept; /**< Intercept */
const float32_t *dualCoefficients; /**< Dual coefficients */
const float32_t *supportVectors; /**< Support vectors */
const int32_t *classes; /**< The two SVM classes */
} arm_svm_linear_instance_f32;
/**
* @brief Instance structure for polynomial SVM prediction function.
*/
typedef struct
{
uint32_t nbOfSupportVectors; /**< Number of support vectors */
uint32_t vectorDimension; /**< Dimension of vector space */
float32_t intercept; /**< Intercept */
const float32_t *dualCoefficients; /**< Dual coefficients */
const float32_t *supportVectors; /**< Support vectors */
const int32_t *classes; /**< The two SVM classes */
int32_t degree; /**< Polynomial degree */
float32_t coef0; /**< Polynomial constant */
float32_t gamma; /**< Gamma factor */
} arm_svm_polynomial_instance_f32;
/**
* @brief Instance structure for rbf SVM prediction function.
*/
typedef struct
{
uint32_t nbOfSupportVectors; /**< Number of support vectors */
uint32_t vectorDimension; /**< Dimension of vector space */
float32_t intercept; /**< Intercept */
const float32_t *dualCoefficients; /**< Dual coefficients */
const float32_t *supportVectors; /**< Support vectors */
const int32_t *classes; /**< The two SVM classes */
float32_t gamma; /**< Gamma factor */
} arm_svm_rbf_instance_f32;
/**
* @brief Instance structure for sigmoid SVM prediction function.
*/
typedef struct
{
uint32_t nbOfSupportVectors; /**< Number of support vectors */
uint32_t vectorDimension; /**< Dimension of vector space */
float32_t intercept; /**< Intercept */
const float32_t *dualCoefficients; /**< Dual coefficients */
const float32_t *supportVectors; /**< Support vectors */
const int32_t *classes; /**< The two SVM classes */
float32_t coef0; /**< Independent constant */
float32_t gamma; /**< Gamma factor */
} arm_svm_sigmoid_instance_f32;
/**
* @brief SVM linear instance init function
* @param[in] S Parameters for SVM functions
* @param[in] nbOfSupportVectors Number of support vectors
* @param[in] vectorDimension Dimension of vector space
* @param[in] intercept Intercept
* @param[in] dualCoefficients Array of dual coefficients
* @param[in] supportVectors Array of support vectors
* @param[in] classes Array of 2 classes ID
* @return none.
*
*/
void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S,
uint32_t nbOfSupportVectors,
uint32_t vectorDimension,
float32_t intercept,
const float32_t *dualCoefficients,
const float32_t *supportVectors,
const int32_t *classes);
/**
* @brief SVM linear prediction
* @param[in] S Pointer to an instance of the linear SVM structure.
* @param[in] in Pointer to input vector
* @param[out] pResult Decision value
* @return none.
*
*/
void arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 *S,
const float32_t * in,
int32_t * pResult);
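/*
 * Example (illustrative sketch): wiring trained parameters into the linear
 * SVM instance declared above. Every numeric value here is a placeholder;
 * in practice they are exported from a trained scikit-learn
 * SVC(kernel='linear') object as described at the top of this file.
 */
static void svm_linear_usage_example(const float32_t *sample /* 3 features */)
{
    static const float32_t dual_coefs[2] = {0.5f, -0.5f};
    static const float32_t support_vecs[6] = {0.1f, 0.2f, 0.3f,
                                              0.4f, 0.5f, 0.6f};
    static const int32_t classes[2] = {0, 1};
    arm_svm_linear_instance_f32 S;
    int32_t predicted;

    /* 2 support vectors in a 3-dimensional feature space, intercept -0.25 */
    arm_svm_linear_init_f32(&S, 2, 3, -0.25f, dual_coefs, support_vecs, classes);
    /* predicted receives one of the two class IDs */
    arm_svm_linear_predict_f32(&S, sample, &predicted);
}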
/**
* @brief SVM polynomial instance init function
* @param[in] S points to an instance of the polynomial SVM structure.
* @param[in] nbOfSupportVectors Number of support vectors
* @param[in] vectorDimension Dimension of vector space
* @param[in] intercept Intercept
* @param[in] dualCoefficients Array of dual coefficients
* @param[in] supportVectors Array of support vectors
* @param[in] classes Array of 2 classes ID
* @param[in] degree Polynomial degree
* @param[in] coef0 coeff0 (scikit-learn terminology)
* @param[in] gamma gamma (scikit-learn terminology)
* @return none.
*
*/
void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S,
uint32_t nbOfSupportVectors,
uint32_t vectorDimension,
float32_t intercept,
const float32_t *dualCoefficients,
const float32_t *supportVectors,
const int32_t *classes,
int32_t degree,
float32_t coef0,
float32_t gamma
);
/**
* @brief SVM polynomial prediction
* @param[in] S Pointer to an instance of the polynomial SVM structure.
* @param[in] in Pointer to input vector
* @param[out] pResult Decision value
* @return none.
*
*/
void arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 *S,
const float32_t * in,
int32_t * pResult);
/**
* @brief SVM radial basis function instance init function
 * @param[in] S points to an instance of the rbf SVM structure.
* @param[in] nbOfSupportVectors Number of support vectors
* @param[in] vectorDimension Dimension of vector space
* @param[in] intercept Intercept
* @param[in] dualCoefficients Array of dual coefficients
* @param[in] supportVectors Array of support vectors
* @param[in] classes Array of 2 classes ID
* @param[in] gamma gamma (scikit-learn terminology)
* @return none.
*
*/
void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S,
uint32_t nbOfSupportVectors,
uint32_t vectorDimension,
float32_t intercept,
const float32_t *dualCoefficients,
const float32_t *supportVectors,
const int32_t *classes,
float32_t gamma
);
/**
* @brief SVM rbf prediction
* @param[in] S Pointer to an instance of the rbf SVM structure.
* @param[in] in Pointer to input vector
* @param[out] pResult decision value
* @return none.
*
*/
void arm_svm_rbf_predict_f32(const arm_svm_rbf_instance_f32 *S,
const float32_t * in,
int32_t * pResult);
/**
* @brief SVM sigmoid instance init function
 * @param[in] S points to an instance of the sigmoid SVM structure.
* @param[in] nbOfSupportVectors Number of support vectors
* @param[in] vectorDimension Dimension of vector space
* @param[in] intercept Intercept
* @param[in] dualCoefficients Array of dual coefficients
* @param[in] supportVectors Array of support vectors
* @param[in] classes Array of 2 classes ID
* @param[in] coef0 coeff0 (scikit-learn terminology)
* @param[in] gamma gamma (scikit-learn terminology)
* @return none.
*
*/
void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S,
uint32_t nbOfSupportVectors,
uint32_t vectorDimension,
float32_t intercept,
const float32_t *dualCoefficients,
const float32_t *supportVectors,
const int32_t *classes,
float32_t coef0,
float32_t gamma
);
/**
* @brief SVM sigmoid prediction
 * @param[in] S Pointer to an instance of the sigmoid SVM structure.
* @param[in] in Pointer to input vector
* @param[out] pResult Decision value
* @return none.
*
*/
void arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 *S,
const float32_t * in,
int32_t * pResult);
#ifdef __cplusplus
}
#endif
#endif /* ifndef _SVM_FUNCTIONS_H_ */

View File

@ -1,591 +0,0 @@
/******************************************************************************
* @file transform_functions.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _TRANSFORM_FUNCTIONS_H_
#define _TRANSFORM_FUNCTIONS_H_
#include "arm_math_types.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#include "dsp/basic_math_functions.h"
#include "dsp/complex_math_functions.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @defgroup groupTransforms Transform Functions
*/
/**
* @brief Instance structure for the Q15 CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
const q15_t *pTwiddle; /**< points to the Sin twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
} arm_cfft_radix2_instance_q15;
/* Deprecated */
arm_status arm_cfft_radix2_init_q15(
arm_cfft_radix2_instance_q15 * S,
uint16_t fftLen,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/* Deprecated */
void arm_cfft_radix2_q15(
const arm_cfft_radix2_instance_q15 * S,
q15_t * pSrc);
/**
* @brief Instance structure for the Q15 CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
const q15_t *pTwiddle; /**< points to the twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
} arm_cfft_radix4_instance_q15;
/* Deprecated */
arm_status arm_cfft_radix4_init_q15(
arm_cfft_radix4_instance_q15 * S,
uint16_t fftLen,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/* Deprecated */
void arm_cfft_radix4_q15(
const arm_cfft_radix4_instance_q15 * S,
q15_t * pSrc);
/**
* @brief Instance structure for the Radix-2 Q31 CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
const q31_t *pTwiddle; /**< points to the Twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
} arm_cfft_radix2_instance_q31;
/* Deprecated */
arm_status arm_cfft_radix2_init_q31(
arm_cfft_radix2_instance_q31 * S,
uint16_t fftLen,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/* Deprecated */
void arm_cfft_radix2_q31(
const arm_cfft_radix2_instance_q31 * S,
q31_t * pSrc);
/**
* @brief Instance structure for the Q31 CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
const q31_t *pTwiddle; /**< points to the twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
} arm_cfft_radix4_instance_q31;
/* Deprecated */
void arm_cfft_radix4_q31(
const arm_cfft_radix4_instance_q31 * S,
q31_t * pSrc);
/* Deprecated */
arm_status arm_cfft_radix4_init_q31(
arm_cfft_radix4_instance_q31 * S,
uint16_t fftLen,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/**
* @brief Instance structure for the floating-point CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
const float32_t *pTwiddle; /**< points to the Twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
float32_t onebyfftLen; /**< value of 1/fftLen. */
} arm_cfft_radix2_instance_f32;
/* Deprecated */
arm_status arm_cfft_radix2_init_f32(
arm_cfft_radix2_instance_f32 * S,
uint16_t fftLen,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/* Deprecated */
void arm_cfft_radix2_f32(
const arm_cfft_radix2_instance_f32 * S,
float32_t * pSrc);
/**
* @brief Instance structure for the floating-point CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
const float32_t *pTwiddle; /**< points to the Twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
float32_t onebyfftLen; /**< value of 1/fftLen. */
} arm_cfft_radix4_instance_f32;
/* Deprecated */
arm_status arm_cfft_radix4_init_f32(
arm_cfft_radix4_instance_f32 * S,
uint16_t fftLen,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/* Deprecated */
void arm_cfft_radix4_f32(
const arm_cfft_radix4_instance_f32 * S,
float32_t * pSrc);
/**
* @brief Instance structure for the fixed-point CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
const q15_t *pTwiddle; /**< points to the Twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t bitRevLength; /**< bit reversal table length. */
#if defined(ARM_MATH_MVEI)
const uint32_t *rearranged_twiddle_tab_stride1_arr; /**< Per stage reordered twiddle pointer (offset 1) */ \
const uint32_t *rearranged_twiddle_tab_stride2_arr; /**< Per stage reordered twiddle pointer (offset 2) */ \
const uint32_t *rearranged_twiddle_tab_stride3_arr; /**< Per stage reordered twiddle pointer (offset 3) */ \
const q15_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */ \
const q15_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */ \
const q15_t *rearranged_twiddle_stride3;
#endif
} arm_cfft_instance_q15;
arm_status arm_cfft_init_q15(
arm_cfft_instance_q15 * S,
uint16_t fftLen);
void arm_cfft_q15(
const arm_cfft_instance_q15 * S,
q15_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/**
* @brief Instance structure for the fixed-point CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
const q31_t *pTwiddle; /**< points to the Twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t bitRevLength; /**< bit reversal table length. */
#if defined(ARM_MATH_MVEI)
const uint32_t *rearranged_twiddle_tab_stride1_arr; /**< Per stage reordered twiddle pointer (offset 1) */ \
const uint32_t *rearranged_twiddle_tab_stride2_arr; /**< Per stage reordered twiddle pointer (offset 2) */ \
const uint32_t *rearranged_twiddle_tab_stride3_arr; /**< Per stage reordered twiddle pointer (offset 3) */ \
const q31_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */ \
const q31_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */ \
const q31_t *rearranged_twiddle_stride3;
#endif
} arm_cfft_instance_q31;
arm_status arm_cfft_init_q31(
arm_cfft_instance_q31 * S,
uint16_t fftLen);
void arm_cfft_q31(
const arm_cfft_instance_q31 * S,
q31_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/**
* @brief Instance structure for the floating-point CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
const float32_t *pTwiddle; /**< points to the Twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t bitRevLength; /**< bit reversal table length. */
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
const uint32_t *rearranged_twiddle_tab_stride1_arr; /**< Per stage reordered twiddle pointer (offset 1) */ \
const uint32_t *rearranged_twiddle_tab_stride2_arr; /**< Per stage reordered twiddle pointer (offset 2) */ \
const uint32_t *rearranged_twiddle_tab_stride3_arr; /**< Per stage reordered twiddle pointer (offset 3) */ \
const float32_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */ \
const float32_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */ \
const float32_t *rearranged_twiddle_stride3;
#endif
} arm_cfft_instance_f32;
arm_status arm_cfft_init_f32(
arm_cfft_instance_f32 * S,
uint16_t fftLen);
void arm_cfft_f32(
const arm_cfft_instance_f32 * S,
float32_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
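/*
 * Example (illustrative sketch): a forward 256-point complex FFT using the
 * init/process pair above. buf holds 256 interleaved real/imaginary pairs
 * (512 floats) and is transformed in place; the length is a placeholder.
 */
static void cfft_f32_usage_example(float32_t *buf)
{
    arm_cfft_instance_f32 S;
    if (arm_cfft_init_f32(&S, 256) != ARM_MATH_SUCCESS)
        return; /* unsupported transform length */
    /* ifftFlag = 0 selects the forward transform,
       bitReverseFlag = 1 produces normal-order output */
    arm_cfft_f32(&S, buf, 0, 1);
}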
/**
* @brief Instance structure for the Double Precision Floating-point CFFT/CIFFT function.
*/
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
const float64_t *pTwiddle; /**< points to the Twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t bitRevLength; /**< bit reversal table length. */
} arm_cfft_instance_f64;
arm_status arm_cfft_init_f64(
arm_cfft_instance_f64 * S,
uint16_t fftLen);
void arm_cfft_f64(
const arm_cfft_instance_f64 * S,
float64_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
/**
* @brief Instance structure for the Q15 RFFT/RIFFT function.
*/
typedef struct
{
uint32_t fftLenReal; /**< length of the real FFT. */
uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
const q15_t *pTwiddleAReal; /**< points to the real twiddle factor table. */
const q15_t *pTwiddleBReal; /**< points to the imag twiddle factor table. */
#if defined(ARM_MATH_MVEI)
arm_cfft_instance_q15 cfftInst;
#else
const arm_cfft_instance_q15 *pCfft; /**< points to the complex FFT instance. */
#endif
} arm_rfft_instance_q15;
arm_status arm_rfft_init_q15(
arm_rfft_instance_q15 * S,
uint32_t fftLenReal,
uint32_t ifftFlagR,
uint32_t bitReverseFlag);
void arm_rfft_q15(
const arm_rfft_instance_q15 * S,
q15_t * pSrc,
q15_t * pDst);
/**
* @brief Instance structure for the Q31 RFFT/RIFFT function.
*/
typedef struct
{
uint32_t fftLenReal; /**< length of the real FFT. */
uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
const q31_t *pTwiddleAReal; /**< points to the real twiddle factor table. */
const q31_t *pTwiddleBReal; /**< points to the imag twiddle factor table. */
#if defined(ARM_MATH_MVEI)
arm_cfft_instance_q31 cfftInst;
#else
const arm_cfft_instance_q31 *pCfft; /**< points to the complex FFT instance. */
#endif
} arm_rfft_instance_q31;
arm_status arm_rfft_init_q31(
arm_rfft_instance_q31 * S,
uint32_t fftLenReal,
uint32_t ifftFlagR,
uint32_t bitReverseFlag);
void arm_rfft_q31(
const arm_rfft_instance_q31 * S,
q31_t * pSrc,
q31_t * pDst);
/**
* @brief Instance structure for the floating-point RFFT/RIFFT function.
*/
typedef struct
{
uint32_t fftLenReal; /**< length of the real FFT. */
uint16_t fftLenBy2; /**< length of the complex FFT. */
uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
const float32_t *pTwiddleAReal; /**< points to the real twiddle factor table. */
const float32_t *pTwiddleBReal; /**< points to the imag twiddle factor table. */
arm_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */
} arm_rfft_instance_f32;
arm_status arm_rfft_init_f32(
arm_rfft_instance_f32 * S,
arm_cfft_radix4_instance_f32 * S_CFFT,
uint32_t fftLenReal,
uint32_t ifftFlagR,
uint32_t bitReverseFlag);
void arm_rfft_f32(
const arm_rfft_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst);
/**
* @brief Instance structure for the Double Precision Floating-point RFFT/RIFFT function.
*/
typedef struct
{
arm_cfft_instance_f64 Sint; /**< Internal CFFT structure. */
uint16_t fftLenRFFT; /**< length of the real sequence */
const float64_t * pTwiddleRFFT; /**< Twiddle factors real stage */
} arm_rfft_fast_instance_f64 ;
arm_status arm_rfft_fast_init_f64 (
arm_rfft_fast_instance_f64 * S,
uint16_t fftLen);
void arm_rfft_fast_f64(
arm_rfft_fast_instance_f64 * S,
float64_t * p, float64_t * pOut,
uint8_t ifftFlag);
/**
* @brief Instance structure for the floating-point RFFT/RIFFT function.
*/
typedef struct
{
arm_cfft_instance_f32 Sint; /**< Internal CFFT structure. */
uint16_t fftLenRFFT; /**< length of the real sequence */
const float32_t * pTwiddleRFFT; /**< Twiddle factors real stage */
} arm_rfft_fast_instance_f32 ;
arm_status arm_rfft_fast_init_f32 (
arm_rfft_fast_instance_f32 * S,
uint16_t fftLen);
void arm_rfft_fast_f32(
const arm_rfft_fast_instance_f32 * S,
float32_t * p, float32_t * pOut,
uint8_t ifftFlag);
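/*
 * Example (illustrative sketch): a forward 512-point real FFT with the fast
 * variant declared above. The length is a placeholder; note that the input
 * buffer is used as scratch and is modified by the call.
 */
static void rfft_fast_f32_usage_example(float32_t *in /* 512 */, float32_t *out /* 512 */)
{
    arm_rfft_fast_instance_f32 S;
    if (arm_rfft_fast_init_f32(&S, 512) != ARM_MATH_SUCCESS)
        return; /* unsupported transform length */
    arm_rfft_fast_f32(&S, in, out, 0); /* ifftFlag = 0: forward transform */
}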
/**
* @brief Instance structure for the floating-point DCT4/IDCT4 function.
*/
typedef struct
{
uint16_t N; /**< length of the DCT4. */
uint16_t Nby2; /**< half of the length of the DCT4. */
float32_t normalize; /**< normalizing factor. */
const float32_t *pTwiddle; /**< points to the twiddle factor table. */
const float32_t *pCosFactor; /**< points to the cosFactor table. */
arm_rfft_instance_f32 *pRfft; /**< points to the real FFT instance. */
arm_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */
} arm_dct4_instance_f32;
/**
* @brief Initialization function for the floating-point DCT4/IDCT4.
* @param[in,out] S points to an instance of floating-point DCT4/IDCT4 structure.
* @param[in] S_RFFT points to an instance of floating-point RFFT/RIFFT structure.
* @param[in] S_CFFT points to an instance of floating-point CFFT/CIFFT structure.
* @param[in] N length of the DCT4.
* @param[in] Nby2 half of the length of the DCT4.
* @param[in] normalize normalizing factor.
* @return arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>fftLenReal</code> is not a supported transform length.
*/
arm_status arm_dct4_init_f32(
arm_dct4_instance_f32 * S,
arm_rfft_instance_f32 * S_RFFT,
arm_cfft_radix4_instance_f32 * S_CFFT,
uint16_t N,
uint16_t Nby2,
float32_t normalize);
/**
* @brief Processing function for the floating-point DCT4/IDCT4.
* @param[in] S points to an instance of the floating-point DCT4/IDCT4 structure.
* @param[in] pState points to state buffer.
* @param[in,out] pInlineBuffer points to the in-place input and output buffer.
*/
void arm_dct4_f32(
const arm_dct4_instance_f32 * S,
float32_t * pState,
float32_t * pInlineBuffer);
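/*
 * Example (illustrative sketch): the initialization chain for a 128-point
 * floating-point DCT4. The instance borrows an RFFT and a CFFT instance,
 * and normalize = sqrt(2/N), which is 0.125 for N = 128. The state buffer
 * sizing follows the CMSIS-DSP documentation and is an assumption here.
 */
static void dct4_f32_usage_example(float32_t *inout /* 128 */, float32_t *state /* 128 */)
{
    arm_dct4_instance_f32 S;
    arm_rfft_instance_f32 S_RFFT;
    arm_cfft_radix4_instance_f32 S_CFFT;

    if (arm_dct4_init_f32(&S, &S_RFFT, &S_CFFT, 128, 64, 0.125f) != ARM_MATH_SUCCESS)
        return; /* unsupported length */
    /* In-place transform; state is scratch memory */
    arm_dct4_f32(&S, state, inout);
}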
/**
* @brief Instance structure for the Q31 DCT4/IDCT4 function.
*/
typedef struct
{
uint16_t N; /**< length of the DCT4. */
uint16_t Nby2; /**< half of the length of the DCT4. */
q31_t normalize; /**< normalizing factor. */
const q31_t *pTwiddle; /**< points to the twiddle factor table. */
const q31_t *pCosFactor; /**< points to the cosFactor table. */
arm_rfft_instance_q31 *pRfft; /**< points to the real FFT instance. */
arm_cfft_radix4_instance_q31 *pCfft; /**< points to the complex FFT instance. */
} arm_dct4_instance_q31;
/**
* @brief Initialization function for the Q31 DCT4/IDCT4.
* @param[in,out] S points to an instance of Q31 DCT4/IDCT4 structure.
* @param[in] S_RFFT points to an instance of Q31 RFFT/RIFFT structure
* @param[in] S_CFFT points to an instance of Q31 CFFT/CIFFT structure
* @param[in] N length of the DCT4.
* @param[in] Nby2 half of the length of the DCT4.
* @param[in] normalize normalizing factor.
* @return arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
*/
arm_status arm_dct4_init_q31(
arm_dct4_instance_q31 * S,
arm_rfft_instance_q31 * S_RFFT,
arm_cfft_radix4_instance_q31 * S_CFFT,
uint16_t N,
uint16_t Nby2,
q31_t normalize);
/**
* @brief Processing function for the Q31 DCT4/IDCT4.
* @param[in] S points to an instance of the Q31 DCT4 structure.
* @param[in] pState points to state buffer.
* @param[in,out] pInlineBuffer points to the in-place input and output buffer.
*/
void arm_dct4_q31(
const arm_dct4_instance_q31 * S,
q31_t * pState,
q31_t * pInlineBuffer);
/**
* @brief Instance structure for the Q15 DCT4/IDCT4 function.
*/
typedef struct
{
uint16_t N; /**< length of the DCT4. */
uint16_t Nby2; /**< half of the length of the DCT4. */
q15_t normalize; /**< normalizing factor. */
const q15_t *pTwiddle; /**< points to the twiddle factor table. */
const q15_t *pCosFactor; /**< points to the cosFactor table. */
arm_rfft_instance_q15 *pRfft; /**< points to the real FFT instance. */
arm_cfft_radix4_instance_q15 *pCfft; /**< points to the complex FFT instance. */
} arm_dct4_instance_q15;
/**
* @brief Initialization function for the Q15 DCT4/IDCT4.
* @param[in,out] S points to an instance of Q15 DCT4/IDCT4 structure.
* @param[in] S_RFFT points to an instance of Q15 RFFT/RIFFT structure.
* @param[in] S_CFFT points to an instance of Q15 CFFT/CIFFT structure.
* @param[in] N length of the DCT4.
* @param[in] Nby2 half of the length of the DCT4.
* @param[in] normalize normalizing factor.
* @return arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
*/
arm_status arm_dct4_init_q15(
arm_dct4_instance_q15 * S,
arm_rfft_instance_q15 * S_RFFT,
arm_cfft_radix4_instance_q15 * S_CFFT,
uint16_t N,
uint16_t Nby2,
q15_t normalize);
/**
* @brief Processing function for the Q15 DCT4/IDCT4.
* @param[in] S points to an instance of the Q15 DCT4 structure.
* @param[in] pState points to state buffer.
* @param[in,out] pInlineBuffer points to the in-place input and output buffer.
*/
void arm_dct4_q15(
const arm_dct4_instance_q15 * S,
q15_t * pState,
q15_t * pInlineBuffer);
#ifdef __cplusplus
}
#endif
#endif /* ifndef _TRANSFORM_FUNCTIONS_H_ */

View File

@ -1,239 +0,0 @@
/******************************************************************************
* @file arm_math_utils.h
* @brief Public header file for CMSIS DSP Library
* @version V1.9.0
* @date 20. July 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_MATH_UTILS_H_
#define _ARM_MATH_UTILS_H_
#include "arm_math_types.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief Macros required for reciprocal calculation in Normalized LMS
*/
#define INDEX_MASK 0x0000003F
#define SQ(x) ((x) * (x))
/**
 * @brief Function to calculate the reciprocal (1/in) of a Q31 value.
*/
__STATIC_FORCEINLINE uint32_t arm_recip_q31(
q31_t in,
q31_t * dst,
const q31_t * pRecipTable)
{
q31_t out;
uint32_t tempVal;
uint32_t index, i;
uint32_t signBits;
if (in > 0)
{
signBits = ((uint32_t) (__CLZ( in) - 1));
}
else
{
signBits = ((uint32_t) (__CLZ(-in) - 1));
}
/* Convert input sample to 1.31 format */
in = (in << signBits);
/* calculation of index for initial approximated Val */
index = (uint32_t)(in >> 24);
index = (index & INDEX_MASK);
/* 1.31 with exp 1 */
out = pRecipTable[index];
/* calculation of reciprocal value */
/* running approximation for two iterations */
for (i = 0U; i < 2U; i++)
{
tempVal = (uint32_t) (((q63_t) in * out) >> 31);
tempVal = 0x7FFFFFFFu - tempVal;
/* 1.31 with exp 1 */
/* out = (q31_t) (((q63_t) out * tempVal) >> 30); */
out = clip_q63_to_q31(((q63_t) out * tempVal) >> 30);
}
/* write output */
*dst = out;
/* return num of signbits of out = 1/in value */
return (signBits + 1U);
}
/**
 * @brief Function to calculate the reciprocal (1/in) of a Q15 value.
*/
__STATIC_FORCEINLINE uint32_t arm_recip_q15(
q15_t in,
q15_t * dst,
const q15_t * pRecipTable)
{
q15_t out = 0;
uint32_t tempVal = 0;
uint32_t index = 0, i = 0;
uint32_t signBits = 0;
if (in > 0)
{
signBits = ((uint32_t)(__CLZ( in) - 17));
}
else
{
signBits = ((uint32_t)(__CLZ(-in) - 17));
}
/* Convert input sample to 1.15 format */
in = (in << signBits);
/* calculation of index for initial approximated Val */
index = (uint32_t)(in >> 8);
index = (index & INDEX_MASK);
/* 1.15 with exp 1 */
out = pRecipTable[index];
/* calculation of reciprocal value */
/* running approximation for two iterations */
for (i = 0U; i < 2U; i++)
{
tempVal = (uint32_t) (((q31_t) in * out) >> 15);
tempVal = 0x7FFFu - tempVal;
/* 1.15 with exp 1 */
out = (q15_t) (((q31_t) out * tempVal) >> 14);
/* out = clip_q31_to_q15(((q31_t) out * tempVal) >> 14); */
}
/* write output */
*dst = out;
/* return num of signbits of out = 1/in value */
return (signBits + 1);
}
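/*
 * Example (illustrative sketch): calling the Q15 reciprocal helper above.
 * armRecipTableQ15 is assumed to be the standard 64-entry table from
 * arm_common_tables.h; the returned sign-bit count is the scaling exponent
 * the caller must fold back into any subsequent arithmetic.
 */
static void recip_q15_usage_example(q15_t denom)
{
    extern const q15_t armRecipTableQ15[64];
    q15_t recip;
    /* recip ~= 1/denom in 1.15 format, scaled by the returned sign bits */
    uint32_t sign_bits = arm_recip_q15(denom, &recip, armRecipTableQ15);
    (void)sign_bits;
}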
/**
* @brief 64-bit to 32-bit unsigned normalization
* @param[in] in is input unsigned long long value
* @param[out] normalized is the 32-bit normalized value
* @param[out] norm is norm scale
*/
__STATIC_INLINE void arm_norm_64_to_32u(uint64_t in, int32_t * normalized, int32_t *norm)
{
int32_t n1;
int32_t hi = (int32_t) (in >> 32);
int32_t lo = (int32_t) ((in << 32) >> 32);
n1 = __CLZ(hi) - 32;
if (!n1)
{
/*
* input fits in 32-bit
*/
n1 = __CLZ(lo);
if (!n1)
{
/*
* MSB set, need to scale down by 1
*/
*norm = -1;
*normalized = (((uint32_t) lo) >> 1);
} else
{
if (n1 == 32)
{
/*
* input is zero
*/
*norm = 0;
*normalized = 0;
} else
{
/*
* 32-bit normalization
*/
*norm = n1 - 1;
*normalized = lo << *norm;
}
}
} else
{
/*
* input fits in 64-bit
*/
n1 = 1 - n1;
*norm = -n1;
/*
* 64 bit normalization
*/
*normalized = (((uint32_t) lo) >> n1) | (hi << (32 - n1));
}
}
__STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
{
q31_t result;
uint64_t absNum;
int32_t normalized;
int32_t norm;
/*
 * if the value fits in 32 bits,
 * avoid the costly 64-bit division
 */
absNum = num > 0 ? num : -num;
arm_norm_64_to_32u(absNum, &normalized, &norm);
if (norm > 0)
/*
* 32-bit division
*/
result = (q31_t) num / den;
else
/*
* 64-bit division
*/
result = (q31_t) (num / den);
return result;
}
#ifdef __cplusplus
}
#endif
#endif /*ifndef _ARM_MATH_UTILS_H_ */

View File

@ -1,56 +0,0 @@
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_tables.h
* Description: Extern declaration for NN tables
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_NN_TABLES_H
#define _ARM_NN_TABLES_H
#include "arm_math.h"
/**
* @brief tables for various activation functions
*
*/
extern const q15_t sigmoidTable_q15[256];
extern const q7_t sigmoidTable_q7[256];
extern const q7_t tanhTable_q7[256];
extern const q15_t tanhTable_q15[256];
/**
* @brief 2-way tables for various activation functions
*
* 2-way table, H table for value larger than 1/4
* L table for value smaller than 1/4, H table for remaining
* We have this only for the q15_t version. It does not make
* sense to have it for q7_t type
*/
extern const q15_t sigmoidHTable_q15[192];
extern const q15_t sigmoidLTable_q15[128];
#endif /* ARM_NN_TABLES_H */

View File

@ -1,130 +0,0 @@
/*
* Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_types.h
* Description: Public header file to contain the CMSIS-NN structs for the
* TensorFlowLite micro compliant functions
*
* $Date: April 23, 2020
* $Revision: V.0.5.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#ifndef _ARM_NN_TYPES_H
#define _ARM_NN_TYPES_H
/** CMSIS-NN object to contain the width and height of a tile */
typedef struct
{
int32_t w; /**< Width */
int32_t h; /**< Height */
} cmsis_nn_tile;
/** CMSIS-NN object used for the function context. */
typedef struct
{
void *buf; /**< Pointer to a buffer needed for the optimization */
int32_t size; /**< Buffer size */
} cmsis_nn_context;
/** CMSIS-NN object to contain the dimensions of the tensors */
typedef struct
{
int32_t n; /**< Generic dimension to contain either the batch size or output channels. Please refer to the function documentation for more information */
int32_t h; /**< Height */
int32_t w; /**< Width */
int32_t c; /**< Input channels */
} cmsis_nn_dims;
/** CMSIS-NN object for the per-channel quantization parameters */
typedef struct
{
int32_t *multiplier; /**< Multiplier values */
int32_t *shift; /**< Shift values */
} cmsis_nn_per_channel_quant_params;
/** CMSIS-NN object for the per-tensor quantization parameters */
typedef struct
{
int32_t multiplier; /**< Multiplier value */
int32_t shift; /**< Shift value */
} cmsis_nn_per_tensor_quant_params;
/** CMSIS-NN object for the quantized Relu activation */
typedef struct
{
int32_t min; /**< Min value used to clamp the result */
int32_t max; /**< Max value used to clamp the result */
} cmsis_nn_activation;
/** CMSIS-NN object for the convolution layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_tile dilation;
cmsis_nn_activation activation;
} cmsis_nn_conv_params;
/** CMSIS-NN object for Depthwise convolution layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
int32_t ch_mult; /**< Channel Multiplier. ch_mult * in_ch = out_ch */
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_tile dilation;
cmsis_nn_activation activation;
} cmsis_nn_dw_conv_params;
/** CMSIS-NN object for pooling layer parameters */
typedef struct
{
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_activation activation;
} cmsis_nn_pool_params;
/** CMSIS-NN object for Fully Connected layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t filter_offset; /**< Zero value for the filter tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_activation activation;
} cmsis_nn_fc_params;
/** CMSIS-NN object for SVDF layer parameters */
typedef struct
{
int32_t rank;
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_activation input_activation;
cmsis_nn_activation output_activation;
} cmsis_nn_svdf_params;
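/*
 * Example (illustrative sketch): populating the parameter structs above for
 * a 3x3, stride-1, pad-1 int8 convolution clamped to the full int8 range.
 * The offset values are placeholders; in TensorFlow Lite micro they are
 * derived from the tensors' quantization zero points.
 */
static cmsis_nn_conv_params example_conv_params(void)
{
    cmsis_nn_conv_params p;
    p.input_offset = 128;    /* placeholder input tensor zero value */
    p.output_offset = -128;  /* placeholder output tensor zero value */
    p.stride.w = 1;   p.stride.h = 1;
    p.padding.w = 1;  p.padding.h = 1;
    p.dilation.w = 1; p.dilation.h = 1;
    p.activation.min = -128; /* clamp results to int8 */
    p.activation.max = 127;
    return p;
}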
#endif // _ARM_NN_TYPES_H

View File

@ -1,954 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: July 31, 2020
* $Revision: V.4.5.4
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
#ifndef _ARM_NNSUPPORTFUNCTIONS_H_
#define _ARM_NNSUPPORTFUNCTIONS_H_
#include "arm_math.h"
#include "arm_common_tables.h"
#ifdef __cplusplus
extern "C"
{
#endif
#define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0)
#define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift)
#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0
#define MASK_IF_NON_ZERO(x) (x) != 0 ? ~0 : 0
#define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b))
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))
/**
* @brief Union for SIMD access of q31/q15/q7 types
*/
union arm_nnword
{
q31_t word; /**< q31 type */
q15_t half_words[2]; /**< q15 type */
q7_t bytes[4]; /**< q7 type */
};
/**
 * @brief Struct and union for accessing a 64-bit (long long) value as two 32-bit words
 */
struct arm_nn_double
{
uint32_t low;
int32_t high;
};
union arm_nn_long_long
{
int64_t long_long;
struct arm_nn_double word;
};
/**
* @brief Struct for specifying activation function types
*
*/
typedef enum
{
ARM_SIGMOID = 0, /**< Sigmoid activation function */
ARM_TANH = 1, /**< Tanh activation function */
} arm_nn_activation_type;
/**
* @defgroup nndata_convert Neural Network Data Conversion Functions
*
* Perform data type conversion in-between neural network operations
*
*/
/**
* @brief Converts the elements of the q7 vector to q15 vector without left-shift
* @param[in] *pSrc points to the q7 input vector
* @param[out] *pDst points to the q15 output vector
* @param[in] blockSize length of the input vector
*
*/
void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);
/**
* @brief Non-saturating addition of elements of a q7 vector
* @param[in] *input Pointer to the q7 input vector
* @param[out] *output Pointer to the q31 output variable.
* @param[in] block_size length of the input vector
* \par Description:
*
* 2^24 samples can be added without saturating the result.
*
 * The equation used for the summation is:
*
* <pre>
* sum = input[0] + input[1] + .. + input[block_size -1]
* </pre>
*
* */
void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size);
/**
* @brief Converts the elements of the q7 vector to reordered q15 vector without left-shift
* @param[in] *pSrc points to the q7 input vector
* @param[out] *pDst points to the q15 output vector
* @param[in] blockSize length of the input vector
* @return none.
*
*/
void arm_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t blockSize);
/**
* @brief Converts the elements from a q7 vector to a q15 vector with an added offset
* @param[in] src pointer to the q7 input vector
* @param[out] dst pointer to the q15 output vector
* @param[in] block_size length of the input vector
* @param[in] offset q7 offset to be added to each input vector element.
*
* \par Description:
*
* The equation used for the conversion process is:
*
* <pre>
* dst[n] = (q15_t) src[n] + offset; 0 <= n < block_size.
* </pre>
*
*/
void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
/**
* @brief Converts the elements of the q7 vector to reordered q15 vector with an added offset
* @param[in] src pointer to the q7 input vector
* @param[out] dst pointer to the q15 output vector
* @param[in] block_size length of the input vector
* @param[in] offset offset to be added to each input vector element.
* @return none.
*
 * @details This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
 * the sign-extension intrinsic (DSP extension). The tail (i.e., the last (N % 4) elements) retains its original
 * order.
*
*/
void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
/**
* @brief Converts the elements from a q7 vector and accumulate to a q15 vector
* @param[in] *src points to the q7 input vector
* @param[out] *dst points to the q15 output vector
* @param[in] block_size length of the input vector
*
* \par Description:
*
 * The equation used for the accumulation is:
*
* <pre>
* dst[n] += (q15_t) src[n] ; 0 <= n < block_size.
* </pre>
*
*/
void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);
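/*
 * Example (illustrative sketch): widening an int8 activation buffer to q15
 * while folding in the input offset, as a convolution inner loop would do
 * before its multiply-accumulate stage. The size and offset are placeholders.
 */
static void q7_widen_example(const q7_t *src, q15_t *dst)
{
    /* dst[n] = (q15_t)src[n] + 128 for 0 <= n < 32 */
    arm_q7_to_q15_with_offset(src, dst, 32, 128);
}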
/**
* @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
* @param[in] row pointer to row
* @param[in] col pointer to im2col buffer, always consists of 2 columns.
* @param[in] num_ch number of channels
* @param[in] out_shift pointer to per output channel requantization shift parameter.
* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
* @param[in] out_offset output tensor offset.
* @param[in] activation_min minimum value to clamp the output to. Range : int8
* @param[in] activation_max maximum value to clamp the output to. Range : int8
* @param[in] kernel_size number of elements in one column.
* @param[in] output_bias per output channel bias. Range : int32
* @param[out] out pointer to output
* @return The function returns one of the two
* 1. The incremented output pointer for a successful operation or
* 2. NULL if implementation is not available.
*
* @details Supported framework: TensorFlow Lite micro.
*/
q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
const q15_t *col,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t kernel_size,
const int32_t *const output_bias,
q7_t *out);
/**
* @brief General Matrix-multiplication function with per-channel requantization.
* @param[in] input_row pointer to row operand
* @param[in] input_col pointer to col operand
* @param[in] output_ch number of rows of input_row
* @param[in] col_batches number of column batches. Range: 1 to 4
* @param[in] output_shift pointer to per output channel requantization shift parameter.
* @param[in] output_mult pointer to per output channel requantization multiplier parameter.
* @param[in] out_offset output tensor offset.
* @param[in] col_offset input tensor(col) offset.
* @param[in] row_offset kernel offset(row). Not used.
* @param[in] out_activation_min minimum value to clamp the output to. Range : int8
* @param[in] out_activation_max maximum value to clamp the output to. Range : int8
* @param[in] row_len number of elements in each row
* @param[in] bias per output channel bias. Range : int32
* @param[in,out] out pointer to output
* @return The function returns one of the two
* 1. The incremented output pointer for a successful operation or
* 2. NULL if implementation is not available.
*
* @details Supported framework: TensorFlow Lite
*/
q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
const q7_t *input_col,
const uint16_t output_ch,
const uint16_t col_batches,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t out_offset,
const int32_t col_offset,
const int32_t row_offset,
const int16_t out_activation_min,
const int16_t out_activation_max,
const uint16_t row_len,
const int32_t *const bias,
q7_t *out);
/**
* @brief General Matrix-multiplication without requantization for one row & one column
* @param[in] row_elements number of row elements
* @param[in] row_base pointer to row operand
* @param[in] col_base pointer to col operand
* @param[out] sum_col pointer to store sum of column elements
* @param[out] output pointer to store result of multiply-accumulate
* @return The function returns the multiply-accumulated result of the row by column.
*
* @details Pseudo-code
* *output = 0
* sum_col = 0
* for (i = 0; i < row_elements; i++)
* *output += row_base[i] * col_base[i]
* sum_col += col_base[i]
*
*/
arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
const int8_t *row_base,
const int8_t *col_base,
int32_t *const sum_col,
int32_t *const output);
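/*
 * Example (illustrative sketch): a portable C reference for the pseudo-code
 * above, useful when validating an optimized implementation. This is not the
 * library's own kernel, just the same arithmetic written out.
 */
static void mat_mul_core_1x_s8_ref(int32_t row_elements,
                                   const int8_t *row_base,
                                   const int8_t *col_base,
                                   int32_t *sum_col,
                                   int32_t *output)
{
    int32_t acc = 0;
    int32_t col_sum = 0;
    for (int32_t i = 0; i < row_elements; i++)
    {
        acc += (int32_t)row_base[i] * col_base[i]; /* multiply-accumulate */
        col_sum += col_base[i];                    /* running column sum */
    }
    *output = acc;
    *sum_col = col_sum;
}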
/**
* @brief General Matrix-multiplication without requantization for four rows and one column
* @param[in] row_elements number of row elements
* @param[in] offset offset between rows. Can be the same as row_elements.
 * e.g., in a 1x1 convolution scenario with stride 1.
* @param[in] row_base pointer to row operand
* @param[in] col_base pointer to col operand
* @param[out] sum_col pointer to store sum of column elements
* @param[out] output pointer to store result(4 int32's) of multiply-accumulate
* @return The function returns the multiply-accumulated result of the row by column
*
* @details Pseudo-code
* output[0] = 0
* ..
* output[3] = 0
* sum_col = 0
* for (i = 0; i < row_elements; i++)
* output[0] += row_base[i] * col_base[i]
* ..
* output[3] += row_base[i + (row_elements * 3)] * col_base[i]
* sum_col += col_base[i]
*/
arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
const int32_t offset,
const int8_t *row_base,
const int8_t *col_base,
int32_t *const sum_col,
int32_t *const output);
/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to
* the number of output columns (or RHS input rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to
* the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns
* @param[in] lhs_offset Offset to be applied to the LHS input value
 * @param[in]  lhs_offset     Offset to be applied to the LHS input value
 * @param[in]  dst_offset     Offset to be applied to the output result
* @param[in] activation_min Minimum value to clamp down the output. Range : int8
* @param[in] activation_max Maximum value to clamp up the output. Range : int8
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
q7_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t activation_min,
const int32_t activation_max);
/**
* @brief s8 Vector by Matrix (transposed) multiplication
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] bias Input bias
* @param[out] dst Output vector
* @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector. Range: -127 to 128
* @param[in] rhs_offset Offset to be added to the input values of the right-hand side matrix. Range: -127 to 128
* @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] dst_multiplier Output multiplier
* @param[in] dst_shift Output shift
* @param[in] rhs_cols Number of columns in the right-hand side input matrix
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
q7_t *dst,
const int32_t lhs_offset,
const int32_t rhs_offset,
const int32_t dst_offset,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max);
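/*
 * A usage sketch for a small fully-connected layer; all dimensions, offsets
 * and requantization parameters below are illustrative assumptions, not
 * values prescribed by this API.
 *
 * @code
 * #define IN_DIM  8
 * #define OUT_DIM 4
 * const q7_t  input[IN_DIM] = {0};             // activation vector (lhs)
 * const q7_t  weights[OUT_DIM * IN_DIM] = {0}; // row-major == transposed rhs
 * const q31_t bias[OUT_DIM] = {0};
 * q7_t        output[OUT_DIM];
 * arm_nn_vec_mat_mult_t_s8(input, weights, bias, output,
 *                          0, 0, 0,        // lhs/rhs/dst offsets (assumed)
 *                          1073741824, -1, // dst multiplier/shift (assumed)
 *                          IN_DIM, OUT_DIM,
 *                          -128, 127);     // int8 activation clamp
 * @endcode
 */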
/**
* @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where
* the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs.
*
* @param[in] lhs Input left-hand side matrix
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128
* @param[in] num_ch Number of channels in LHS/RHS
* @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels
* @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels
* @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
* @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
* @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels
* @param[in] out Output pointer
*
 * @return The function returns one of the following
 * - Updated output pointer if an implementation is available
 * - NULL if no implementation is available.
*
 * @note If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read out
* for the following.
* - Output shift
* - Output multiplier
* - Output bias
* - rhs
*/
q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs,
const q7_t *rhs,
const int32_t lhs_offset,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
q7_t *out);
/**
* @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases.
* Dimensions are the same for lhs and rhs.
*
* @param[in] lhs Input left-hand side matrix
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128
* @param[in] num_ch Number of channels in LHS/RHS
* @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels.
* @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels.
* @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
* @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
* @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels.
* @param[in] out Output pointer
*
 * @return The function returns one of the following
 * - Updated output pointer if an implementation is available
 * - NULL if no implementation is available.
*
 * @note If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read out
* for the following.
* - Output shift
* - Output multiplier
* - Output bias
* - rhs
*/
q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const int32_t lhs_offset,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
q7_t *out);
/**
@brief Read 2 q15 elements and post increment pointer.
@param[in] in_q15 Pointer to pointer that holds address of input.
@return q31 value
*/
__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15)
{
q31_t val;
memcpy(&val, *in_q15, 4);
*in_q15 += 2;
return (val);
}
/**
@brief Read 4 q7 from q7 pointer and post increment pointer.
@param[in] in_q7 Pointer to pointer that holds address of input.
@return q31 value
*/
__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
{
q31_t val;
memcpy(&val, *in_q7, 4);
*in_q7 += 4;
return (val);
}
/**
@brief Read 2 q15 from q15 pointer.
@param[in] in_q15 pointer to address of input.
@return q31 value
*/
__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
{
q31_t val;
memcpy(&val, in_q15, 4);
return (val);
}
/**
@brief Read 4 q7 values.
@param[in] in_q7 pointer to address of input.
@return q31 value
*/
__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
{
q31_t val;
memcpy(&val, in_q7, 4);
return (val);
}
/**
* @brief memset optimized for MVE
* @param[in, out] dst Destination pointer
* @param[in] val Value to set
 * @param[in]     block_size   Number of bytes to set.
*
*/
__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst,
const q7_t val,
uint32_t block_size)
{
#if defined(ARM_MATH_MVEI)
__asm volatile (
" vdup.8 q0, %[set_val] \n"
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
" vstrb.8 q0, [%[in]], 16 \n"
" letp lr, 2b \n"
"1: \n"
:[in] "+r"(dst)
:[cnt] "r"(block_size), [set_val] "r"(val)
:"q0", "memory", "r14");
#else
memset(dst, val, block_size);
#endif
}
#if defined (ARM_MATH_DSP)
/**
* @brief read and expand one q7 word into two q15 words
*/
__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t * out1, q31_t * out2)
{
q31_t inA = arm_nn_read_q7x4_ia(&source);
q31_t inAbuf1 = __SXTB16(__ROR((uint32_t)inA, 8));
q31_t inAbuf2 = __SXTB16(inA);
#ifndef ARM_MATH_BIG_ENDIAN
*out2 = (int32_t) (__PKHTB (inAbuf1, inAbuf2, 16));
*out1 = (int32_t) (__PKHBT (inAbuf2, inAbuf1, 16));
#else
*out1 = (int32_t) (__PKHTB(inAbuf1, inAbuf2, 16));
*out2 = (int32_t) (__PKHBT(inAbuf2, inAbuf1, 16));
#endif
return source;
}
/**
* @brief read and expand one q7 word into two q15 words with reordering
*/
__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t * out1, q31_t * out2)
{
q31_t inA = arm_nn_read_q7x4_ia(&source);
#ifndef ARM_MATH_BIG_ENDIAN
*out2 = __SXTB16(__ROR((uint32_t)inA, 8));
*out1 = __SXTB16(inA);
#else
*out1 = __SXTB16(__ROR((uint32_t)inA, 8));
*out2 = __SXTB16(inA);
#endif
return source;
}
/**
* @brief read and expand one q7 word into two q15 words with reordering and add an offset
*/
__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered_with_offset(const q7_t *source, q31_t * out1, q31_t * out2, q31_t offset)
{
q31_t inA = arm_nn_read_q7x4_ia(&source);
#ifndef ARM_MATH_BIG_ENDIAN
*out2 = __SXTB16(__ROR((uint32_t)inA, 8));
*out1 = __SXTB16(inA);
#else
*out1 = __SXTB16(__ROR((uint32_t)inA, 8));
*out2 = __SXTB16(inA);
#endif
*out1 = __QADD16(*out1,offset);
*out2 = __QADD16(*out2,offset);
return source;
}
#endif
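/*
 * Worked example for read_and_pad above, assuming little-endian byte order:
 * input bytes {1, -1, 2, -2} load as inA = 0xFE02FF01; __SXTB16 sign-extends
 * bytes 0 and 2, __ROR(inA, 8) exposes bytes 1 and 3, and the __PKH packing
 * leaves out1 holding the q15 pair {1, -1} and out2 the pair {2, -2}.
 */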
/**
* @defgroup NNBasicMath Basic Math Functions for Neural Network Computation
*
* Basic Math Functions for Neural Network Computation
*
*/
/**
 * @brief q15 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
 * Results outside of the allowable q15 range [0x8000, 0x7FFF] will be saturated.
*/
void arm_nn_mult_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
const uint16_t out_shift,
uint32_t blockSize);
/**
* @brief q7 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
 * Results outside of the allowable q7 range [0x80, 0x7F] will be saturated.
*/
void arm_nn_mult_q7(
q7_t * pSrcA,
q7_t * pSrcB,
q7_t * pDst,
const uint16_t out_shift,
uint32_t blockSize);
/**
* @brief macro for adding rounding offset
*/
#ifndef ARM_NN_TRUNCATE
#define NN_ROUND(out_shift) ( (0x1u << out_shift) >> 1 )
#else
#define NN_ROUND(out_shift) 0
#endif
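/*
 * Worked example: with out_shift = 7 and ARM_NN_TRUNCATE undefined,
 * NN_ROUND(7) = (1 << 7) >> 1 = 64, so (x + 64) >> 7 rounds to nearest
 * instead of truncating: x = 65 gives (65 + 64) >> 7 = 1 (0.507 rounded up),
 * and x = 200 gives (200 + 64) >> 7 = 2 rather than the truncated 1.
 */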
// Macros for shortening quantization functions' names and avoid long lines
#define MUL_SAT(a, b) arm_nn_doubling_high_mult((a), (b))
#define MUL_SAT_MVE(a, b) arm_doubling_high_mult_mve_32x4((a), (b))
#define MUL_POW2(a, b) arm_nn_mult_by_power_of_two((a), (b))
#define DIV_POW2(a, b) arm_nn_divide_by_power_of_two((a), (b))
#define DIV_POW2_MVE(a, b) arm_divide_by_power_of_two_mve((a), (b))
#define EXP_ON_NEG(x) arm_nn_exp_on_negative_values((x))
#define ONE_OVER1(x) arm_nn_one_over_one_plus_x_for_x_in_0_1((x))
/**
* @brief Saturating doubling high multiply. Result matches
* NEON instruction VQRDMULH.
* @param[in] m1 Multiplicand. Range: {Q31_MIN, Q31_MAX}
* @param[in] m2 Multiplier. Range: {Q31_MIN, Q31_MAX}
* @return Result of multiplication.
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t m2)
{
q31_t result = 0;
// Rounding offset to add for a right shift of 31
q63_t mult = 1 << 30;
if ((m1 < 0) ^ (m2 < 0))
{
mult = 1 - mult;
}
// Gets resolved as a SMLAL instruction
mult = mult + (q63_t)m1 * m2;
// Utilize all of the upper 32 bits. This is the doubling step
// as well.
result = (int32_t) (mult / (1ll << 31));
if ((m1 == m2) && (m1 == (int32_t)Q31_MIN))
{
result = Q31_MAX;
}
return result;
}
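/*
 * Worked example: 0.5 in Q31 is 1 << 30. With m1 = m2 = (1 << 30),
 * mult = 2^60 + 2^30 and result = mult / 2^31 = 2^29, i.e. 0.25 in Q31 --
 * exactly 0.5 * 0.5, which is what the doubling step buys over a plain
 * high-half multiply.
 */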
/**
* @brief Doubling high multiply without saturation. This is intended
* for requantization where the scale is a positive integer
*
* @param[in] m1 Multiplicand. Range: {Q31_MIN, Q31_MAX}
* @param[in] m2 Multiplier Range: {Q31_MIN, Q31_MAX}
* @return Result of multiplication.
* @note The result of this matches that of neon instruction
* VQRDMULH for m1 in range {Q31_MIN, Q31_MAX} and m2 in
* range {Q31_MIN + 1, Q31_MAX}. Saturation occurs when
* m1 equals m2 equals Q31_MIN and that is not handled by
* this function.
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, const q31_t m2)
{
q31_t result = 0;
union arm_nn_long_long mult;
// Rounding offset to add for a right shift of 31
mult.word.low = 1 << 30;
mult.word.high = 0;
// Gets resolved as a SMLAL instruction
mult.long_long = mult.long_long + (q63_t)m1 * m2;
// Utilize all of the upper 32 bits. This is the doubling step
// as well.
result = (int32_t)(mult.long_long >> 31);
return result;
}
/**
* @brief Rounding divide by power of two.
* @param[in] dividend - Dividend
* @param[in] exponent - Divisor = power(2, exponent)
* Range: [0, 31]
* @return Rounded result of division. Midpoint is rounded away from zero.
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
{
q31_t result = 0;
const q31_t remainder_mask = (1 << exponent) - 1;
int32_t remainder = remainder_mask & dividend;
// Basic division
result = dividend >> exponent;
// Adjust 'result' for rounding (mid point away from zero)
q31_t threshold = remainder_mask >> 1;
if (result < 0)
{
threshold++;
}
if (remainder > threshold)
{
result++;
}
return result;
}
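/*
 * Worked example with exponent = 2 (divide by 4): dividend = 5 gives
 * remainder = 1 and threshold = 1, so the result stays at 5 >> 2 = 1;
 * dividend = 6 gives remainder = 2 > threshold, so 6 / 4 = 1.5 rounds
 * away from zero to 2.
 */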
/**
* @brief Requantize a given value.
* @param[in] val Value to be requantized
 * @param[in]    multiplier   multiplier. Range {Q31_MIN + 1, Q31_MAX}
* @param[in] shift left or right shift for 'val * multiplier'
*
* @return Returns (val * multiplier)/(2 ^ shift)
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift)
{
return arm_nn_divide_by_power_of_two(
arm_nn_doubling_high_mult_no_sat(val * (1 << LEFT_SHIFT(shift)), multiplier),
RIGHT_SHIFT(shift));
}
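/*
 * A usage sketch; the multiplier/shift pair below is an illustrative
 * assumption, not a value produced by this library.
 *
 * @code
 * q31_t acc   = 1000;          // raw int32 accumulator
 * q31_t mult  = 1431655765;    // ~2/3 in Q31 (assumed)
 * q31_t shift = -2;            // net right shift of 2
 * q31_t out = arm_nn_requantize(acc, mult, shift);
 * // out ~= (1000 * 2 / 3) / 4 ~= 167
 * @endcode
 */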
/**
* @brief memcpy optimized for MVE
* @param[in, out] dst Destination pointer
* @param[in] src Source pointer.
* @param[in] block_size Number of bytes to copy.
*
*/
__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst,
const q7_t *__RESTRICT src,
uint32_t block_size)
{
#if defined(ARM_MATH_MVEI)
__asm volatile (
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
" vldrb.8 q0, [%[in]], 16 \n"
" vstrb.8 q0, [%[out]], 16 \n"
" letp lr, 2b \n"
"1: \n"
:[in] "+r"(src)
,[out] "+r"(dst)
:[cnt] "r"(block_size)
:"q0", "memory", "r14");
#else
memcpy(dst, src, block_size);
#endif
}
#if defined(ARM_MATH_MVEI)
/**
* @brief Vector saturating doubling high multiply returning high half.
* @param[in] m1 Multiplicand
* @param[in] m2 Multiplier
* @return Result of multiplication.
*
*/
__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const q31_t m2)
{
return vqrdmulhq_n_s32(m1, m2);
}
/**
* @brief Vector rounding divide by power of two.
* @param[in] dividend - Dividend vector
* @param[in] exponent - Divisor = power(2, exponent)
* Range: [0, 31]
* @return Rounded result of division. Midpoint is rounded away from zero.
*
*/
__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent)
{
const int32x4_t shift = vdupq_n_s32(-exponent);
const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
return vrshlq_s32(fixed_up_dividend, shift);
}
/**
* @brief Requantize a given vector.
* @param[in] val Vector to be requantized
* @param[in] multiplier multiplier
* @param[in] shift shift
*
* @return Returns (val * multiplier)/(2 ^ shift)
*
*/
__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift)
{
return arm_divide_by_power_of_two_mve(
arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier),
RIGHT_SHIFT(shift));
}
__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2)
{
return vqrdmulhq_s32(m1, m2);
}
__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve_32x4(const int32x4_t dividend, const int32x4_t exponent)
{
const int32x4_t shift = -exponent;
const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
return vrshlq_s32(fixed_up_dividend, shift);
}
__STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, const int32x4_t multiplier, const int32x4_t shift)
{
const int32x4_t zz = vdupq_n_s32(0);
const mve_pred16_t p = vcmpgtq_n_s32(shift, 0);
const int32x4_t left_shift = vpselq_s32(shift, zz, p);
const int32x4_t right_shift = -vpselq_s32(zz, shift, p);
return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier), right_shift);
}
#endif
// @note The following functions are used only for softmax layer, scaled bits = 5 assumed
__STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val)
{
int32_t mask = 0;
int32_t shift = 24;
const int32_t val_mod_minus_quarter = (val & ((1 << shift) - 1)) - (1 << shift);
const int32_t remainder = val_mod_minus_quarter - val;
const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28);
const int32_t x2 = MUL_SAT(x, x);
int32_t result = 1895147668 + MUL_SAT(1895147668, x +
DIV_POW2(MUL_SAT(DIV_POW2(MUL_SAT(x2, x2), 2) + MUL_SAT(x2, x), 715827883) + x2, 1));
#define SELECT_IF_NON_ZERO(x) \
{ \
mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \
result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result); \
}
SELECT_IF_NON_ZERO(1672461947)
SELECT_IF_NON_ZERO(1302514674)
SELECT_IF_NON_ZERO(790015084)
SELECT_IF_NON_ZERO(290630308)
SELECT_IF_NON_ZERO(39332535)
SELECT_IF_NON_ZERO(720401)
SELECT_IF_NON_ZERO(242)
#undef SELECT_IF_NON_ZERO
mask = MASK_IF_ZERO(val);
return SELECT_USING_MASK(mask, Q31_MAX, result);
}
__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp)
{
const int32_t thresh = ((1 << (31 - exp)) - 1);
int32_t result = val << exp;
result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), Q31_MAX, result);
result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), Q31_MIN, result);
return result;
}
__STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val)
{
const int64_t sum = (int64_t)val + (int64_t)Q31_MAX;
const int32_t half_denominator = (int32_t)((sum + (sum >= 0 ? 1 : -1)) / 2L);
int32_t x = 1515870810 + MUL_SAT(half_denominator, -1010580540);
const int32_t shift = (1 << 29);
x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
return MUL_POW2(x, 1);
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,97 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_activations_q15.c
* Description: Q15 neural network activation function using direct table look-up
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_common_tables.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief neural network activation function using direct table look-up
*
* @note Refer header file for details.
*
*/
void arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
{
uint16_t i = size;
q15_t *pIn = data;
q15_t *pOut = data;
uint16_t shift_size = 8 + 3 - int_width;
uint32_t bit_mask = 0x7FF >> int_width;
uint32_t full_frac = bit_mask + 1;
const q15_t *lookup_table;
switch (type)
{
case ARM_SIGMOID:
lookup_table = sigmoidTable_q15;
break;
case ARM_TANH:
default:
lookup_table = tanhTable_q15;
break;
}
while (i)
{
q15_t out;
q15_t in = *pIn++;
q15_t frac = (uint32_t) in & bit_mask;
q15_t value = lookup_table[(uint8_t)(in >> shift_size)];
if ((in >> shift_size) != 0x7f)
{
q15_t value2 = lookup_table[(uint8_t)(1 + ((uint8_t)(in >> shift_size)))];
/* doing the interpolation here for better accuracy */
out = ((q31_t) (full_frac - frac) * value + (q31_t) value2 * frac) >> shift_size;
} else
{
/* the largest positive value does not have a right side for linear interpolation */
out = value;
}
*pOut++ = out;
i--;
}
}
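/*
 * Worked example: with int_width = 0, shift_size = 11 and bit_mask = 0x7FF,
 * an input splits into a table index (in >> 11) and an 11-bit fraction; the
 * output interpolates linearly between the two neighbouring table entries
 * weighted by that fraction, which is why this is more accurate than a
 * plain table look-up.
 */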
/**
* @} end of Acti group
*/

View File

@@ -1,90 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_activations_q7.c
* Description: Q7 neural network activation function using direct table look-up
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_common_tables.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q7 neural network activation function using direct table look-up
* @param[in,out] data pointer to input
* @param[in] size number of elements
 * @param[in]     int_width   bit-width of the integer part, assumed to be smaller than 3
* @param[in] type type of activation functions
*
* @details
*
* This is the direct table look-up approach.
*
 * The integer part of the fixed-point value is assumed to be <= 3;
 * a larger integer width makes little difference, since saturation
 * followed by any of these activation functions gives the same result.
*/
void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
{
uint16_t i = size;
q7_t *pIn = data;
q7_t *pOut = data;
q7_t in;
q7_t out;
uint16_t shift_size = 3 - int_width;
const q7_t *lookup_table;
switch (type)
{
case ARM_SIGMOID:
lookup_table = sigmoidTable_q7;
break;
case ARM_TANH:
default:
lookup_table = tanhTable_q7;
break;
}
while (i)
{
in = *pIn++;
out = lookup_table[(uint8_t) (in >> shift_size)];
*pOut++ = out;
i--;
}
}
/**
* @} end of Acti group
*/

View File

@@ -1,65 +0,0 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu6_s8.c
* Description: Basic s8 version of ReLU6
*
 * $Date:        September 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/*
* Basic ReLU6 function
*
* Refer to header file for details.
*
*/
void arm_relu6_s8(q7_t *data, uint16_t size)
{
int32_t i;
for (i = 0; i < size; i++)
{
int32_t ip = data[i];
ip = MAX(ip, 0);
data[i] = MIN(ip, 6);
}
}
/**
* @} end of Acti group
*/

View File

@ -1,104 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu_q15.c
* Description: Q15 version of ReLU
*
* $Date: February 27, 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q15 RELU function
* @param[in,out] data pointer to input
* @param[in] size number of elements
*
* @details
*
* Optimized relu with QSUB instructions.
*
*/
void arm_relu_q15(q15_t *data, uint16_t size)
{
#if defined(ARM_MATH_DSP)
/* Run the following code for M cores with DSP extension */
uint16_t i = size >> 1;
q15_t *input = data;
q15_t *output = data;
q31_t in;
q31_t buf;
q31_t mask;
while (i)
{
in = read_q15x2_ia(&input);
/* extract the first bit */
buf = __ROR(in & 0x80008000, 15);
 /* if MSB=1, mask will be 0xFFFF, 0x0 otherwise */
mask = __QSUB16(0x00000000, buf);
write_q15x2_ia(&output, in & (~mask));
i--;
}
if (size & 0x1)
{
if (*input < 0)
{
*input = 0;
}
input++;
}
#else
/* Run the following code as reference implementation for M cores without DSP extension */
uint16_t i;
for (i = 0; i < size; i++)
{
if (data[i] < 0)
data[i] = 0;
}
#endif /* ARM_MATH_DSP */
}
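/*
 * Worked example of the mask trick above: for two negative q15 lanes,
 * in & 0x80008000 = 0x80008000 and __ROR(.., 15) = 0x00010001;
 * __QSUB16(0, 0x00010001) saturates each lane to 0xFFFF, so in & (~mask)
 * clears both lanes to zero, while positive lanes get mask 0x0000 and pass
 * through unchanged.
 */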
/**
* @} end of Acti group
*/

View File

@@ -1,109 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu_q7.c
* Description: Q7 version of ReLU
*
* $Date: May 29, 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q7 RELU function
* @param[in,out] data pointer to input
* @param[in] size number of elements
*
* @details
*
* Optimized relu with QSUB instructions.
*
*/
void arm_relu_q7(q7_t *data, uint16_t size)
{
#if defined(ARM_MATH_DSP)
/* Run the following code for M cores with DSP extension */
uint16_t i = size >> 2;
q7_t *input = data;
q7_t *output = data;
q31_t in;
q31_t buf;
q31_t mask;
while (i)
{
in = read_q7x4_ia(&input);
/* extract the first bit */
buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7);
/* if MSB=1, mask will be 0xFF, 0x0 otherwise */
mask = __QSUB8(0x00000000, buf);
write_q7x4_ia(&output, in & (~mask));
i--;
}
i = size & 0x3;
while (i)
{
if (*input < 0)
{
*input = 0;
}
input++;
i--;
}
#else
/* Run the following code as reference implementation for cores without DSP extension */
uint16_t i;
for (i = 0; i < size; i++)
{
if (data[i] < 0)
data[i] = 0;
}
#endif
}
/**
* @} end of Acti group
*/

View File

@@ -1,258 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_add_s8
* Description: Element wise add
*
* $Date: July 31, 2020
* $Revision: V.2.5.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
#endif
#if defined(ARM_MATH_MVEI)
#define SAT_INPUT_VECT(__INPUT_V, __MULT, __SHIFT) \
__INPUT_V = arm_doubling_high_mult_mve(__INPUT_V, __MULT); \
__INPUT_V = arm_divide_by_power_of_two_mve(__INPUT_V, -__SHIFT);
#endif
/**
 * @note The *_no_sat API does not mean that the input is not saturated; since
 *       __MULT is a positive integer, it is saturated. The API definition
 *       has more info about it.
*/
#define SAT_INPUT(__INPUT, __MULT, __SHIFT) \
__INPUT = arm_nn_doubling_high_mult_no_sat(__INPUT, __MULT); \
__INPUT = arm_nn_divide_by_power_of_two(__INPUT, -__SHIFT);
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/*
* s8 element wise add
*
* Refer header file for details.
*
*/
/* Note: __SHIFT is expected to be <=0 */
arm_status
arm_elementwise_add_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_1_mult,
const int32_t input_1_shift,
const int32_t input_2_offset,
const int32_t input_2_mult,
const int32_t input_2_shift,
const int32_t left_shift,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const uint32_t block_size)
{
#if defined(ARM_MATH_MVEI)
int32_t count = (int32_t)block_size;
while (count > 0)
{
int32x4_t vect_1;
int32x4_t vect_2;
mve_pred16_t p = vctp32q((uint32_t)count);
vect_1 = vldrbq_z_s32(input_1_vect, p);
vect_2 = vldrbq_z_s32(input_2_vect, p);
vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset));
vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset));
vect_1 = vshlq_r_s32(vect_1, left_shift);
vect_2 = vshlq_r_s32(vect_2, left_shift);
SAT_INPUT_VECT(vect_1, input_1_mult, input_1_shift);
SAT_INPUT_VECT(vect_2, input_2_mult, input_2_shift);
vect_1 = vaddq_s32(vect_1, vect_2);
SAT_INPUT_VECT(vect_1, out_mult, out_shift);
vect_1 = vaddq_n_s32(vect_1, out_offset);
vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min));
vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max));
input_1_vect += 4;
input_2_vect += 4;
vstrbq_p_s32(output, vect_1, p);
output += 4;
count -= 4;
}
#else
uint32_t loop_count;
int32_t input_1;
int32_t input_2;
int32_t sum;
#if defined(ARM_MATH_DSP)
int32_t a_1, b_1, a_2, b_2;
int32_t offset_1_packed, offset_2_packed;
int8_t r1, r2, r3, r4;
offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
loop_count = block_size >> 2;
while (loop_count > 0U)
{
 /* 4 outputs are calculated in one loop iteration. The order of calculation follows the order of the outputs
 of the sign-extension intrinsic */
input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
a_1 = __SADD16(a_1, offset_1_packed);
b_1 = __SADD16(b_1, offset_1_packed);
a_2 = __SADD16(a_2, offset_2_packed);
b_2 = __SADD16(b_2, offset_2_packed);
/* Sum 1 */
input_1 = (int16_t)(b_1 & 0x0FFFFL) << left_shift;
SAT_INPUT(input_1, input_1_mult, input_1_shift);
input_2 = (int16_t)(b_2 & 0x0FFFFL) << left_shift;
SAT_INPUT(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r1 = (q7_t)sum;
/* Sum 3 */
input_1 = (int16_t)((b_1 >> 16) & 0x0FFFFL) << left_shift;
SAT_INPUT(input_1, input_1_mult, input_1_shift);
input_2 = (int16_t)((b_2 >> 16) & 0x0FFFFL) << left_shift;
SAT_INPUT(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r3 = (q7_t)sum;
/* Sum 2 */
input_1 = (int16_t)(a_1 & 0x0FFFFL) << left_shift;
SAT_INPUT(input_1, input_1_mult, input_1_shift);
input_2 = (int16_t)(a_2 & 0x0FFFFL) << left_shift;
SAT_INPUT(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r2 = (q7_t)sum;
/* Sum 4 */
input_1 = (int16_t)((a_1 >> 16) & 0x0FFFFL) << left_shift;
SAT_INPUT(input_1, input_1_mult, input_1_shift);
input_2 = (int16_t)((a_2 >> 16) & 0x0FFFFL) << left_shift;
SAT_INPUT(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r4 = (q7_t)sum;
write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
loop_count--;
}
loop_count = block_size & 0x3;
#else
loop_count = block_size;
#endif
while (loop_count > 0U)
{
/* C = A + B */
input_1 = (*input_1_vect++ + input_1_offset) << left_shift;
input_2 = (*input_2_vect++ + input_2_offset) << left_shift;
input_1 = arm_nn_doubling_high_mult(input_1, input_1_mult);
input_1 = arm_nn_divide_by_power_of_two(input_1, -input_1_shift);
input_2 = arm_nn_doubling_high_mult(input_2, input_2_mult);
input_2 = arm_nn_divide_by_power_of_two(input_2, -input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*output++ = (q7_t)sum;
/* Decrement loop counter */
loop_count--;
}
#endif /* ARM_MATH_MVEI */
return (ARM_MATH_SUCCESS);
}
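/*
 * A usage sketch; all offsets, multipliers and shifts below are placeholder
 * assumptions rather than values mandated by this API.
 *
 * @code
 * const int8_t a[8] = {0}, b[8] = {0};
 * int8_t out[8];
 * arm_elementwise_add_s8(a, b,
 *                        0, 1073741824, -1,  // input_1 offset/mult/shift (assumed)
 *                        0, 1073741824, -1,  // input_2 offset/mult/shift (assumed)
 *                        1,                  // left_shift
 *                        out,
 *                        0, 1073741824, -1,  // output offset/mult/shift (assumed)
 *                        -128, 127,          // int8 activation clamp
 *                        8);                 // block_size
 * @endcode
 */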
/**
* @} end of BasicMath group
*/

View File

@@ -1,202 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_mul_s8
* Description: Element wise multiplication
*
* $Date: May 29, 2020
* $Revision: V.1.0.3
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/**
* @brief s8 element wise multiplication of two vectors
*
* @note Refer header file for details.
*
*/
arm_status
arm_elementwise_mul_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_2_offset,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const uint32_t block_size)
{
int32_t loop_count;
#if defined(ARM_MATH_MVEI)
loop_count = (block_size + 3) / 4;
uint32_t num_elements = block_size;
for (int i = 0; i < loop_count; i++)
{
mve_pred16_t p = vctp32q(num_elements);
int32x4_t input_1 = vldrbq_z_s32(input_1_vect, p);
input_1 = vaddq_n_s32(input_1, input_1_offset);
int32x4_t input_2 = vldrbq_z_s32(input_2_vect, p);
input_2 = vaddq_n_s32(input_2, input_2_offset);
int32x4_t res_0 = vmulq_s32(input_1, input_2);
res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift));
res_0 += vdupq_n_s32(out_offset);
res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min));
res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max));
vstrbq_p_s32(output, res_0, p);
input_1_vect += 4;
input_2_vect += 4;
output += 4;
num_elements -= 4;
}
#else
int32_t input_1;
int32_t input_2;
int32_t mul_res;
#if defined(ARM_MATH_DSP)
int32_t a_1, b_1, a_2, b_2;
int32_t offset_1_packed, offset_2_packed;
int8_t r1, r2, r3, r4;
offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
loop_count = block_size >> 2;
while (loop_count > 0U)
{
 /* 4 outputs are calculated in one loop iteration. The order of calculation follows the order of the outputs
 of the sign-extension intrinsic */
input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
a_1 = __SADD16(a_1, offset_1_packed);
b_1 = __SADD16(b_1, offset_1_packed);
a_2 = __SADD16(a_2, offset_2_packed);
b_2 = __SADD16(b_2, offset_2_packed);
/* Mul 1 */
input_1 = (int16_t)(b_1 & 0x0FFFFL);
input_2 = (int16_t)(b_2 & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r1 = (q7_t)mul_res;
/* Mul 3 */
input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL);
input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r3 = (q7_t)mul_res;
/* Mul 2 */
input_1 = (int16_t)(a_1 & 0x0FFFFL);
input_2 = (int16_t)(a_2 & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r2 = (q7_t)mul_res;
/* Mul 4 */
input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL);
input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r4 = (q7_t)mul_res;
write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
loop_count--;
}
loop_count = block_size & 0x3;
#else
loop_count = block_size;
#endif
while (loop_count > 0U)
{
/* C = A * B */
input_1 = *input_1_vect++ + input_1_offset;
input_2 = *input_2_vect++ + input_2_offset;
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
*output++ = (q7_t)mul_res;
/* Decrement loop counter */
loop_count--;
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of BasicMath group
*/

View File

@@ -1,65 +0,0 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_w.c
* Description: s8 version of concatenation along the W axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the W axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_w(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint32_t offset_w)
{
const uint32_t input_copy_size = input_x * input_y * input_z * input_w;
output += offset_w * (input_x * input_y * input_z);
memcpy(output, input, input_copy_size);
}
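/*
 * A usage sketch: two 2x2x3 (x*y*z) tensors, each with w = 1, concatenated
 * along W; all sizes are illustrative assumptions.
 *
 * @code
 * int8_t t0[2 * 2 * 3] = {0}, t1[2 * 2 * 3] = {0};
 * int8_t out[2 * 2 * 3 * 2];
 * arm_concatenation_s8_w(t0, 2, 2, 3, 1, out, 0);  // first tensor at w-offset 0
 * arm_concatenation_s8_w(t1, 2, 2, 3, 1, out, 1);  // second tensor at w-offset 1
 * @endcode
 */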
/**
* @} end of Concatenation group
*/

View File

@@ -1,74 +0,0 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_x.c
* Description: s8 version of concatenation along the X axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the X axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_x(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_x,
const uint32_t offset_x)
{
const uint32_t num_iterations = input_y * input_z * input_w;
output += offset_x;
uint32_t i;
// Copy per row
for (i = 0; i < num_iterations; ++i)
{
memcpy(output, input, input_x);
input += input_x;
output += output_x;
}
}
/**
* @} end of Concatenation group
*/

View File

@@ -1,75 +0,0 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_y.c
* Description: s8 version of concatenation along the Y axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the Y axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_y(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_y,
const uint32_t offset_y)
{
const uint32_t num_iterations = input_z * input_w;
const uint32_t input_copy_size = input_x * input_y;
const uint32_t output_stride = input_x * output_y;
output += offset_y * input_x;
uint32_t i;
// Copy per tile
for (i = 0; i < num_iterations; ++i)
{
memcpy(output, input, input_copy_size);
input += input_copy_size;
output += output_stride;
}
}
/**
* @} end of Concatenation group
*/

View File

@@ -1,74 +0,0 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_z.c
* Description: s8 version of concatenation along the Z axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the Z axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_z(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_z,
const uint32_t offset_z)
{
const uint32_t input_copy_size = input_x * input_y * input_z;
const uint32_t output_stride = input_x * input_y * output_z;
output += offset_z * (input_x * input_y);
uint32_t i;
for (i = 0; i < input_w; ++i)
{
memcpy(output, input, input_copy_size);
input += input_copy_size;
output += output_stride;
}
}
/**
* @} end of Concatenation group
*/

View File

@@ -1,201 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1_x_n_s8.c
* Description: s8 version of 1xN convolution using symmetric quantization.
*
* $Date: July 27, 2020
* $Revision: V.2.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nn_types.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* 1xN s8 convolution function.
*
* Refer header file for details.
*
*/
arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context* ctx,
const cmsis_nn_conv_params* conv_params,
const cmsis_nn_per_channel_quant_params* quant_params,
const cmsis_nn_dims* input_dims,
const q7_t *input_data,
const cmsis_nn_dims* filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims* bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims* output_dims,
q7_t *output_data)
{
(void)bias_dims;
arm_status status = ARM_MATH_SUCCESS;
if (output_dims->w % 4 != 0)
{
status = ARM_MATH_SIZE_MISMATCH;
goto out;
}
#if defined(ARM_MATH_MVEI)
q15_t *buffer_a = (q15_t *)ctx->buf;
const uint16_t input_x = input_dims->w;
const uint16_t kernel_x = filter_dims->w;
const uint16_t output_x = output_dims->w;
const uint16_t output_ch = output_dims->c;
const uint16_t input_ch = input_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t stride_x = conv_params->stride.w;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4)
{
int32_t input_begin_idx[4];
int32_t ker_begin_idx[4];
int32_t ker_end_idx[4];
for (int i = 0; i < 4; i++)
{
const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x;
input_begin_idx[i] = MAX(0, est_input_x_idx);
ker_begin_idx[i] = MAX(0, -est_input_x_idx);
ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx);
}
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32x4_t s_offset;
int32_t acc[4];
if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x))
{
int32_t sum_row[4];
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch,
input_data + input_begin_idx[0] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) + (ker_begin_idx[0] * input_ch),
&sum_row[0],
&acc[0]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch,
input_data + input_begin_idx[1] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) + (ker_begin_idx[1] * input_ch),
&sum_row[1],
&acc[1]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch,
input_data + input_begin_idx[2] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) + (ker_begin_idx[2] * input_ch),
&sum_row[2],
&acc[2]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch,
input_data + input_begin_idx[3] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) + (ker_begin_idx[3] * input_ch),
&sum_row[3],
&acc[3]);
s_offset = vldrwq_s32(sum_row);
}
else
{
int32_t sum_row;
(void)arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch,
stride_x * input_ch,
input_data + input_begin_idx[0] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch),
&sum_row,
acc);
s_offset = vdupq_n_s32(sum_row);
}
int32x4_t res = vldrwq_s32(acc);
s_offset = vmulq_n_s32(s_offset, input_offset);
res = vaddq_s32(res, s_offset);
if (bias_data)
{
res = vaddq_n_s32(res, bias_data[i_out_ch]);
}
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
res = vminq_s32(res, vdupq_n_s32(out_activation_max));
const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
vstrbq_scatter_offset_s32(output_data, scatter_offset, res);
output_data++;
}
output_data += (3 * output_ch);
}
#else
status = arm_convolve_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
#endif
out:
/* Return to application */
return status;
}
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims* input_dims,
const cmsis_nn_dims* filter_dims)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
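/*
 * A usage sketch for sizing the scratch context; the dimension values are
 * placeholders and the allocator is whatever the platform provides.
 *
 * @code
 * cmsis_nn_dims input_dims  = {.n = 1, .h = 1, .w = 32, .c = 8};
 * cmsis_nn_dims filter_dims = {.h = 1, .w = 3};
 * cmsis_nn_context ctx;
 * ctx.size = arm_convolve_1_x_n_s8_get_buffer_size(&input_dims, &filter_dims);
 * ctx.buf  = ctx.size > 0 ? malloc(ctx.size) : NULL;  // <stdlib.h> allocator assumed
 * @endcode
 */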
/**
* @} end of NNConv group
*/

View File

@@ -1,236 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_HWC_q7_fast_nonsquare.c
* Description: Fast Q7 version of 1x1 convolution (non-square shape)
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
* @param[in] Im_in pointer to input tensor
 * @param[in]       dim_im_in_x input tensor dimension x
 * @param[in]       dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1
* and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise
* separable convolution.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
* ch_im_in is multiple of 4
* ch_im_out is multiple of 2
*
* [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
* https://arxiv.org/abs/1704.04861
*/
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
(void)dim_im_in_y;
int16_t i_out_y, i_out_x;
int16_t i_ch_out;
/* -----------------------
 * Here we use bufferA as q15_t internally, since computations are done at the q15_t level;
 * im2col produces output in q15_t format from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1
|| padding_x != 0 || padding_y != 0 || stride_x != 1 || stride_y != 1)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_out_y * dim_im_in_x + i_out_x) * ch_im_in, pBuffer,
ch_im_in);
pBuffer += ch_im_in;
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
 /* check if there are left-over columns to compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t) __SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1
|| padding_x != 0 || padding_y != 0 || stride_x != 1 || stride_y != 1)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
// if-for implementation
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
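A minimal usage sketch for the fast 1x1 kernel above. The dimensions, shift values, and buffer names below are illustrative assumptions, not values from this repository; the only hard requirements are the constraints the function itself checks (ch_im_in a multiple of 4, ch_im_out a multiple of 2, 1x1 kernel, zero padding, unit stride).

/* Hedged usage sketch; all sizes below are assumed for illustration. */
#define CONV1X1_CH_IN 8   /* must be a multiple of 4 */
#define CONV1X1_CH_OUT 4  /* must be a multiple of 2 */
#define CONV1X1_DIM_X 16
#define CONV1X1_DIM_Y 16

static q7_t conv1x1_in[CONV1X1_DIM_Y * CONV1X1_DIM_X * CONV1X1_CH_IN];
static q7_t conv1x1_wt[CONV1X1_CH_OUT * CONV1X1_CH_IN];
static q7_t conv1x1_bias[CONV1X1_CH_OUT];
static q7_t conv1x1_out[CONV1X1_DIM_Y * CONV1X1_DIM_X * CONV1X1_CH_OUT];
static q15_t conv1x1_bufA[2 * CONV1X1_CH_IN]; /* 2 * ch_im_in * 1 * 1 */

void conv1x1_example(void)
{
    arm_status status = arm_convolve_1x1_HWC_q7_fast_nonsquare(
        conv1x1_in, CONV1X1_DIM_X, CONV1X1_DIM_Y, CONV1X1_CH_IN,
        conv1x1_wt, CONV1X1_CH_OUT,
        1, 1, /* dim_kernel_x, dim_kernel_y */
        0, 0, /* padding_x, padding_y */
        1, 1, /* stride_x, stride_y */
        conv1x1_bias, 0 /* bias_shift */, 7 /* out_shift */,
        conv1x1_out, CONV1X1_DIM_X, CONV1X1_DIM_Y,
        conv1x1_bufA, NULL /* bufferB is unused */);
    (void)status; /* ARM_MATH_SIZE_MISMATCH if the constraints are violated */
}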

View File

@ -1,186 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_s8_fast.c
* Description: Fast s8 version of 1x1 convolution (non-square shape)
*
* $Date: July 27, 2020
* $Revision: V.2.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nn_types.h"
#define DIM_KER_X (1U)
#define DIM_KER_Y (1U)
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Fast s8 version for 1x1 convolution (non-square shape)
*
* Refer header file for details.
*
*/
arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
if (input_dims->c % 4 != 0 ||
conv_params->padding.w != 0 || conv_params->padding.h != 0 ||
conv_params->stride.w != 1 || conv_params->stride.h != 1)
{
return ARM_MATH_SIZE_MISMATCH;
}
(void)ctx;
(void)filter_dims;
(void)bias_dims;
#if defined(ARM_MATH_MVEI)
const int32_t col_len = input_dims->w * input_dims->h * input_dims->n;
const int32_t output_ch = output_dims->c;
const int32_t input_ch = input_dims->c;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_items = 0; i_items <= (col_len - 4); i_items += 4)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t sum_row = 0;
int32_t temp_out[4];
(void)arm_nn_mat_mul_core_4x_s8(input_ch,
input_ch,
input_data + i_items * input_ch,
filter_data + i_out_ch * input_ch,
&sum_row,
temp_out);
int32x4_t res = vldrwq_s32(temp_out);
if (bias_data)
{
res = vaddq_n_s32(res, bias_data[i_out_ch]);
}
sum_row = sum_row * input_offset;
res = vaddq_n_s32(res, sum_row);
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
res = vminq_s32(res, vdupq_n_s32(out_activation_max));
const uint32x4_t scatter_offset = {0, (uint32_t)output_ch,
(uint32_t)output_ch * 2,
(uint32_t)output_ch * 3};
vstrbq_scatter_offset_s32(output_data, scatter_offset, res);
output_data++;
}
output_data += (3 * output_ch);
}
/* Handle left over elements */
for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t sum_row = 0;
int32_t acc;
(void)arm_nn_mat_mul_core_1x_s8(input_ch,
input_data + i_items * input_ch,
filter_data + i_out_ch * input_ch,
&sum_row,
&acc);
if (bias_data)
{
acc += bias_data[i_out_ch];
}
sum_row = (sum_row * input_offset);
acc += sum_row;
acc = arm_nn_requantize(acc, output_mult[i_out_ch], output_shift[i_out_ch]);
acc += out_offset;
acc = MAX(acc, out_activation_min);
acc = MIN(acc, out_activation_max);
*output_data++ = acc;
}
}
#else
/* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */
const int32_t lhs_rows = input_dims->w * input_dims->h * input_dims->n;
const int32_t rhs_rows = output_dims->c;
const int32_t rhs_cols = input_dims->c;
arm_nn_mat_mult_nt_t_s8(input_data,
filter_data,
bias_data,
output_data,
quant_params->multiplier,
quant_params->shift,
lhs_rows,
rhs_rows,
rhs_cols,
conv_params->input_offset,
conv_params->output_offset,
conv_params->activation.min,
conv_params->activation.max);
#endif
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims)
{
(void)input_dims;
return 0;
}
/**
* @} end of NNConv group
*/
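A hedged setup sketch for the s8 kernel above, wiring up the cmsis_nn_* parameter structs from arm_nn_types.h. The field names are taken from how this file itself dereferences the structs; designated initializers keep the sketch independent of field order, and all concrete numbers are assumptions.

/* Hedged sketch: wiring up the parameter structs (all values assumed). */
void conv1x1_s8_example(const q7_t *input, const q7_t *filter,
                        const int32_t *bias, q7_t *output,
                        int32_t *per_ch_mult, int32_t *per_ch_shift)
{
    cmsis_nn_context ctx = { .buf = NULL, .size = 0 }; /* no scratch buffer needed */
    cmsis_nn_conv_params conv = {
        .input_offset = 128, .output_offset = -128, /* example zero-point offsets */
        .stride = { .w = 1, .h = 1 },
        .padding = { .w = 0, .h = 0 },
        .activation = { .min = -128, .max = 127 },
    };
    cmsis_nn_per_channel_quant_params quant = {
        .multiplier = per_ch_mult, .shift = per_ch_shift,
    };
    cmsis_nn_dims input_dims = { .n = 1, .h = 16, .w = 16, .c = 8 }; /* c % 4 == 0 */
    cmsis_nn_dims filter_dims = { .n = 4, .h = 1, .w = 1, .c = 8 };
    cmsis_nn_dims bias_dims = { .n = 1, .h = 1, .w = 1, .c = 4 };
    cmsis_nn_dims output_dims = { .n = 1, .h = 16, .w = 16, .c = 4 };
    (void)arm_convolve_1x1_s8_fast(&ctx, &conv, &quant,
                                   &input_dims, input, &filter_dims, filter,
                                   &bias_dims, bias, &output_dims, output);
}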

View File

@ -1,207 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_basic.c
* Description: Q15 version of convolution
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Basic Q15 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* This basic version is designed to work for any input tensor and weight
* dimension.
*/
arm_status
arm_convolve_HWC_q15_basic(const q15_t * Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q15_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q15_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
uint16_t im2col_out_pixel_index = 0;
q15_t *pBuffer = bufferA;
q15_t *pOut = Im_out;
q15_t *im_buffer = bufferA;
const q15_t *pA;
int i;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */
memcpy(pBuffer, (q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, sizeof(q15_t)*ch_im_in);
}
pBuffer += ch_im_in;
}
}
pA = wt;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = im_buffer;
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q15_t) __SSAT((sum >> out_shift), 16);
pOut++;
}
/* counter reset */
pBuffer = im_buffer;
im2col_out_pixel_index++;
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out +=
Im_in[(in_row * dim_im_in + in_col) * ch_im_in +
l] * wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel +
n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t) __SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
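A sizing sketch for the basic Q15 kernel above; the shapes are assumptions chosen only to illustrate the documented bufferA requirement (ch_im_in*dim_kernel*dim_kernel) and the usual output-size relation.

/* Hedged sizing sketch (all shapes assumed). */
enum {
    Q15B_CH_IN = 4, Q15B_CH_OUT = 8,
    Q15B_DIM_IN = 28, Q15B_KER = 5, Q15B_PAD = 2, Q15B_STRIDE = 1,
    Q15B_DIM_OUT = (Q15B_DIM_IN + 2 * Q15B_PAD - Q15B_KER) / Q15B_STRIDE + 1 /* 28 */
};
static q15_t q15b_in[Q15B_DIM_IN * Q15B_DIM_IN * Q15B_CH_IN];
static q15_t q15b_wt[Q15B_CH_OUT * Q15B_CH_IN * Q15B_KER * Q15B_KER];
static q15_t q15b_bias[Q15B_CH_OUT];
static q15_t q15b_out[Q15B_DIM_OUT * Q15B_DIM_OUT * Q15B_CH_OUT];
static q15_t q15b_bufA[Q15B_CH_IN * Q15B_KER * Q15B_KER]; /* documented bufferA size */

void q15_basic_example(void)
{
    (void)arm_convolve_HWC_q15_basic(q15b_in, Q15B_DIM_IN, Q15B_CH_IN,
                                     q15b_wt, Q15B_CH_OUT, Q15B_KER,
                                     Q15B_PAD, Q15B_STRIDE,
                                     q15b_bias, 0 /* bias_shift */, 8 /* out_shift */,
                                     q15b_out, Q15B_DIM_OUT, q15b_bufA,
                                     NULL /* bufferB unused */);
}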

View File

@ -1,255 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_fast.c
* Description: Fast Q15 version of convolution
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q15 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 2
*
* ch_im_out is multiple of 2
*
*/
arm_status
arm_convolve_HWC_q15_fast(const q15_t * Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q15_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q15_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
q15_t *pBuffer = bufferA;
q15_t *im_buffer = bufferA;
q15_t *pOut = Im_out;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/* Run the following code for Cortex-M4 and Cortex-M7 */
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */
memcpy(pBuffer, (q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, sizeof(q15_t)*ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (i_out_x & 0x1)
{
int i;
/* initialize the matrix pointers for A */
const q15_t *pA = wt;
/* set up the second output pointers */
q15_t *pOut2 = pOut + ch_im_out;
/* this loop over rows in A */
for (i = 0; i < ch_im_out; i += 2)
{
/* setup pointers for B */
const q15_t *pB = im_buffer;
const q15_t *pB2 = pB + ch_im_in * dim_kernel * dim_kernel;
/* align the second pointer for A */
const q15_t *pA2 = pA + ch_im_in * dim_kernel * dim_kernel;
/* init the sum with bias */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 1;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x1;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q15_t) __SSAT(sum >> out_shift, 16);
*pOut++ = (q15_t) __SSAT(sum3 >> out_shift, 16);
*pOut2++ = (q15_t) __SSAT(sum2 >> out_shift, 16);
*pOut2++ = (q15_t) __SSAT(sum4 >> out_shift, 16);
/* skip the row computed with A2 */
pA += ch_im_in * dim_kernel * dim_kernel;
} /* for over ch_im_out */
pOut += ch_im_out;
/* counter reset */
pBuffer = im_buffer;
}
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out +=
Im_in[(in_row * dim_im_in + in_col) * ch_im_in +
l] * wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel +
n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t) __SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
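The dim_im_out argument passed to all of these HWC kernels is assumed to follow the standard convolution geometry; the kernels do not verify the relation themselves. A small helper makes it explicit.

/* Assumed relation between input and output size for the square kernels:
 * dim_im_out = (dim_im_in + 2*padding - dim_kernel) / stride + 1
 * e.g. dim_im_in = 32, padding = 1, dim_kernel = 3, stride = 2 -> 16. */
static inline uint16_t conv_out_dim(uint16_t dim_im_in, uint16_t padding,
                                    uint16_t dim_kernel, uint16_t stride)
{
    return (uint16_t)((dim_im_in + 2 * padding - dim_kernel) / stride + 1);
}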

View File

@ -1,265 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_fast_nonsquare.c
* Description: Fast Q15 version of convolution (non-square shape)
*
* $Date: 24. May 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q15 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 2
*
* ch_im_out is multiple of 2
*
*/
arm_status
arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q15_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q15_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t * Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
q15_t *pBuffer = bufferA;
q15_t *im_buffer = bufferA;
q15_t *pOut = Im_out;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/* Run the following code for Cortex-M4 and Cortex-M7 */
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */
memcpy(pBuffer, (q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, sizeof(q15_t)*ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (i_out_x & 0x1)
{
int i;
/* initialize the matrix pointers for A */
const q15_t *pA = wt;
/* set up the second output pointers */
q15_t *pOut2 = pOut + ch_im_out;
/* this loop over rows in A */
for (i = 0; i < ch_im_out; i += 2)
{
/* setup pointers for B */
const q15_t *pB = im_buffer;
const q15_t *pB2 = pB + ch_im_in * dim_kernel_y * dim_kernel_x;
/* align the second pointer for A */
const q15_t *pA2 = pA + ch_im_in * dim_kernel_y * dim_kernel_x;
/* init the sum with bias */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 1;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x1;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q15_t) __SSAT(sum >> out_shift, 16);
*pOut++ = (q15_t) __SSAT(sum3 >> out_shift, 16);
*pOut2++ = (q15_t) __SSAT(sum2 >> out_shift, 16);
*pOut2++ = (q15_t) __SSAT(sum4 >> out_shift, 16);
/* skip the row computed with A2 */
pA += ch_im_in * dim_kernel_y * dim_kernel_x;
} /* for over ch_im_out */
pOut += ch_im_out;
/* counter reset */
pBuffer = im_buffer;
}
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out +=
Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in +
l] * wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x +
n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t) __SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,279 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_RGB.c
* Description: Q7 version of convolution for RGB image
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 convolution function for RGB image
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in equals 3
*
* This kernel is written exclusively for convolution with ch_im_in
* equal to 3. This applies to the first layer of CNNs, which takes an
* input image in RGB format.
*/
arm_status
arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out, const uint16_t dim_im_out, q15_t * bufferA, q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
* Here we use bufferA as q15_t internally, as computations are done at the q15_t level;
* im2col is done to produce q15_t-format output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
// check if number of input channels is 3
if (ch_im_in != 3)
{
return ARM_MATH_SIZE_MISMATCH;
}
// This part implements the im2col function
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */
*__SIMD32(pBuffer) = 0x0;
*(pBuffer + 2) = 0;
pBuffer += 3;
} else
{
/*
* Equivalent to:
* arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3);
*/
const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3;
q31_t buf = arm_nn_read_q7x4(pPixel);
union arm_nnword top;
union arm_nnword bottom;
top.word = __SXTB16(buf);
bottom.word = __SXTB16(__ROR(buf, 8));
#ifndef ARM_MATH_BIG_ENDIAN
/*
* little-endian, | omit | 3rd | 2nd | 1st |
* MSB LSB
* top | 3rd | 1st |; bottom | omit | 2nd |
*
* version 1, need to swap 2nd and 3rd weight
* *__SIMD32(pBuffer) = top.word;
* *(pBuffer+2) = bottom.half_words[0];
*
* version 2, no weight shuffling required
*/
*pBuffer++ = top.half_words[0];
*__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0);
#else
/*
* big-endian, | 1st | 2nd | 3rd | omit |
* MSB LSB
* top | 2nd | omit |; bottom | 1st | 3rd |
*
* version 1, need to swap 2nd and 3rd weight
* *__SIMD32(pBuffer) = bottom.word;
* *(pBuffer+2) = top.half_words[1];
*
* version 2, no weight shuffling required
*/
*pBuffer++ = bottom.half_words[0];
*__SIMD32(pBuffer) = __PKHTB(top.word, bottom.word, 0);
#endif
pBuffer += 2;
}
}
}
if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15(wt, bufferA,
ch_im_out,
3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because of an odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q15_t *pB = bufferA;
/* each iteration processes 4 entries */
uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = 3 * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
// check if number of input channels is 3
if (ch_im_in != 3)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
/* if-for implementation */
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out +=
Im_in[(in_row * dim_im_in + in_col) * ch_im_in +
l] * wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel +
n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of NNConv group
*/
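A worked example of the little-endian packing performed in the DSP path above, followed by a plain-C equivalent. The byte labels are illustrative, the helper name is hypothetical, and the SIMD path is, as the source's own comments note, equivalent to arm_q7_to_q15_no_shift on three entries.

/* Worked example (little-endian, assumed pixel bytes R,G,B,x in buf):
 * buf = | x | B | G | R |
 * top = __SXTB16(buf)            -> | B | R | (3rd and 1st bytes, sign-extended)
 * bottom = __SXTB16(__ROR(buf,8))-> | x | G | (2nd byte in the low half)
 * Writing top.half_words[0] (R) and then __PKHBT(bottom, top, 0) (| B | G |)
 * emits the q15 sequence R, G, B without shuffling the weights.
 * Plain-C equivalent of what the SIMD path computes: */
static void rgb_pixel_to_q15(const q7_t *pixel, q15_t out[3])
{
    out[0] = (q15_t)pixel[0]; /* R */
    out[1] = (q15_t)pixel[1]; /* G */
    out[2] = (q15_t)pixel[2]; /* B */
}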

View File

@ -1,231 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_basic.c
* Description: Q7 version of convolution
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Basic Q7 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* This basic version is designed to work for any input tensor and weight
* dimension.
*/
arm_status
arm_convolve_HWC_q7_basic(const q7_t * Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
* Here we use bufferA as q15_t internally, as computations are done at the q15_t level;
* im2col is done to produce q15_t-format output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
/* Copying the pixel data to column */
arm_q7_to_q15_no_shift((q7_t *)
Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* Computation is performed for every 2 columns */
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15(wt, bufferA,
ch_im_out,
ch_im_in *
dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because of an odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
/* Load the accumulator with bias first */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
/* Point to the beginning of the im2col buffer */
const q15_t *pB = bufferA;
/* Each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
// if-for implementation
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out +=
Im_in[(in_row * dim_im_in + in_col) * ch_im_in +
l] * wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel +
n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
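The bias_shift/out_shift pair encodes the caller's fixed-point alignment; a hedged sketch of the usual reasoning (the Q formats are an assumption about the caller, not something the kernel enforces).

/* If inputs and weights are both Q0.7, each product is Q0.14, so:
 * bias_shift = 7 aligns a Q0.7 bias with the Q0.14 accumulator;
 * out_shift = 7 converts the Q0.14 accumulator back to a Q0.7 output.
 * e.g. 0.5 * 0.5: (64 * 64) >> 7 = 32, i.e. 0.25 in Q0.7. */
static inline q7_t accum_q14_to_q7(q31_t acc)
{
    return (q7_t)__SSAT(acc >> 7, 8); /* saturating narrow, as the kernels do */
}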

View File

@ -1,229 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_basic_nonsquare.c
* Description: Q7 version of convolution (non-square shape)
*
* $Date: 13. July 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Basic Q7 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*/
arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
* Here we use bufferA as q15_t internally, as computations are done at the q15_t level;
* im2col is done to produce q15_t-format output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
/* Copying the pixel data to column */
arm_q7_to_q15_no_shift((q7_t *)
Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* Computation is performed for every 2 columns */
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_y * dim_kernel_x)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15(wt, bufferA,
ch_im_out,
ch_im_in *
dim_kernel_y * dim_kernel_x, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because of an odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
/* Load the accumulator with bias first */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
/* Point to the beginning of the im2col buffer */
const q15_t *pB = bufferA;
/* Each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
// if-for implementation
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out +=
Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x +
(m * dim_kernel_x + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
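For the non-square variants, both output dimensions are assumed to follow the same relation, applied per axis; the numbers below are purely illustrative.

/* Hedged non-square geometry sketch (all values assumed):
 * 64x32 input, 3x5 kernel, padding (1,2), stride (1,1). */
enum {
    NSQ_OUT_X = (64 + 2 * 1 - 3) / 1 + 1, /* 64 */
    NSQ_OUT_Y = (32 + 2 * 2 - 5) / 1 + 1  /* 32 */
};
/* bufferA must hold 2 * ch_im_in * dim_kernel_y * dim_kernel_x q15 entries. */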

View File

@ -1,408 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_fast.c
* Description: Fast Q7 version of convolution
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q7 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 4 ( because of the SIMD32 read and swap )
*
* ch_im_out is multiple of 2 ( because of the 2x2 mat_mult kernel )
*
* The im2col converts the Q7 tensor input into Q15 columns, which are stored in
* bufferA. There is reordering happening during this im2col process with
* arm_q7_to_q15_reordered_no_shift. For every four elements, the second and
* third elements are swapped.
*
* The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the
* GEMM computation with the reordered columns.
*
* To speed up the determination of the padding condition, we split the
* computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}.
* This reduces the total number of boundary condition checks and improves
* the data copying performance.
*/
arm_status
arm_convolve_HWC_q7_fast(const q7_t * Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
* Here we use bufferA as q15_t internally, as computations are done at the q15_t level;
* im2col is done to produce q15_t-format output from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/*
* Here we split the entire matrix into three regions depending on the padding situation
* Top: i_out_y from 0 to padding - 1
* Middle: i_out_y from padding to dim_im_out-padding-1
* Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
*/
/* top part */
for (i_out_y = 0; i_out_y < padding; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
arm_q7_to_q15_reordered_no_shift
((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
bufferA,
ch_im_out,
ch_im_in
*
dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* middle part, here we also divide the x into left, mid and right */
for (; i_out_y < dim_im_out - padding; i_out_y++)
{
/* left part */
for (i_out_x = 0; i_out_x < padding; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
arm_q7_to_q15_reordered_no_shift
((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
bufferA,
ch_im_out,
ch_im_in
*
dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* mid part */
for (; i_out_x < dim_im_out - padding; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in
+
(i_ker_y *
dim_im_in +
i_out_x *
stride - padding) * ch_im_in, pBuffer, ch_im_in * dim_kernel);
pBuffer += ch_im_in * dim_kernel;
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
bufferA,
ch_im_out,
ch_im_in
*
dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* right part */
for (; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
arm_q7_to_q15_reordered_no_shift
((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
bufferA,
ch_im_out,
ch_im_in
*
dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
for (; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
arm_q7_to_q15_reordered_no_shift
((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
bufferA,
ch_im_out,
ch_im_in
*
dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* check if there is any left-over to compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t) __SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
// if-for implementation
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out +=
Im_in[(in_row * dim_im_in + in_col) * ch_im_in +
l] * wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel +
n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
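The 3x3 region split described above removes per-element boundary checks from the interior; a small sketch of the row bands it produces (assuming padding < dim_im_out, which the split implicitly relies on).

/* Row bands used by the fast kernel (sketch; assumes padding < dim_im_out):
 * top:    [0, padding)                    - rows above the image may clip
 * middle: [padding, dim_im_out - padding) - no row clipping; only the left
 *         and right column bands still need x bounds checks
 * bottom: [dim_im_out - padding, dim_im_out) - rows below may clip */
static inline int conv_rows_without_checks(int dim_im_out, int padding)
{
    int mid = dim_im_out - 2 * padding; /* height of the middle band */
    return mid > 0 ? mid : 0;
}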

View File

@ -1,379 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_fast_nonsquare.c
* Description: Fast Q7 version of convolution (non-square shape)
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q7 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
* ch_im_in is multiple of 4
* ch_im_out is multiple of 2
*/
arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/* -----------------------
 * Here bufferA is used internally as q15_t, since computations are done at the q15_t level;
 * im2col converts the q7_t input into q15_t output
 */
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/*
* Here we split the entire matrix into three regions depending on the padding situation
* Top: i_out_y from 0 to padding - 1
* Middle: i_out_y from padding to dim_im_out-padding-1
* Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
*/
/* top part */
for (i_out_y = 0; i_out_y < padding_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* middle part, here we also divide the x into left, mid and right */
for (; i_out_y < dim_im_out_y - padding_y; i_out_y++)
{
/* left part */
for (i_out_x = 0; i_out_x < padding_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* mid part */
for (; i_out_x < dim_im_out_x - padding_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in +
(i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in,
pBuffer, ch_im_in * dim_kernel_x);
pBuffer += ch_im_in * dim_kernel_x;
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* right part */
for (; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
for (; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
} else
{
arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut =
arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* check if there is left-over for compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* basically each time it processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t) __SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
/* if-for implementation */
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
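
A usage sketch for the function above; every dimension, shift, and name below is an illustrative assumption. The buffer-reset check in the loops implies bufferA must hold two im2col columns, i.e. 2 * ch_im_in * dim_kernel_x * dim_kernel_y q15_t entries:

#include "arm_nnfunctions.h"

#define CH_IN  4            /* constraint: multiple of 4 */
#define CH_OUT 8            /* constraint: multiple of 2 */
#define KX 3
#define KY 3

static q15_t buffer_a[2 * CH_IN * KX * KY];   /* two im2col columns */

/* Hypothetical 16x16 input, stride 1, pad 1 -> 16x16 output. */
arm_status conv_q7_example(const q7_t *in, const q7_t *wt,
                           const q7_t *bias, q7_t *out)
{
    return arm_convolve_HWC_q7_fast_nonsquare(in, 16, 16, CH_IN,
                                              wt, CH_OUT, KX, KY,
                                              1, 1,        /* padding x, y */
                                              1, 1,        /* stride x, y  */
                                              bias, 0, 7,  /* shifts are placeholders */
                                              out, 16, 16,
                                              buffer_a, NULL /* bufferB unused */);
}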

View File

@ -1,382 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_s8.c
* Description: s8 version of convolution using symmetric quantization.
*
* $Date: July 27, 2020
* $Revision: V.2.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nn_types.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Basic s8 convolution function.
*
 * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels
 * are multiples of 4 or at least greater than 4.
*
*/
arm_status arm_convolve_s8(const cmsis_nn_context* ctx,
const cmsis_nn_conv_params* conv_params,
const cmsis_nn_per_channel_quant_params* quant_params,
const cmsis_nn_dims* input_dims,
const q7_t *input_data,
const cmsis_nn_dims* filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims* bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims* output_dims,
q7_t *output_data)
{
q15_t *buffer_a = (q15_t *)ctx->buf;
const uint16_t input_batches = input_dims->n;
const uint16_t input_x = input_dims->w;
const uint16_t input_y = input_dims->h;
const uint16_t input_ch = input_dims->c;
const uint16_t kernel_x = filter_dims->w;
const uint16_t kernel_y = filter_dims->h;
const uint16_t output_x = output_dims->w;
const uint16_t output_y = output_dims->h;
const uint16_t output_ch = output_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t pad_y = conv_params->padding.h;
const uint16_t stride_x = conv_params->stride.w;
const uint16_t stride_y = conv_params->stride.h;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
int i_batch;
for (i_batch = 0; i_batch < input_batches; i_batch++)
{
#if defined(ARM_MATH_MVEI)
(void)bias_dims;
/* Generate up to four columns from the input tensor for a GEMM computation */
q7_t *im2col_buf = (q7_t *)buffer_a;
q7_t *out = output_data;
int32_t buffer_fill_cnt = 0;
int32_t padded = 0;
const int32_t num_elem = kernel_x * kernel_y * input_ch;
/* This part implements the im2col function */
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
for (int i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y; i_ker_y++)
{
for (int i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
{
memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch);
padded = 1;
}
else
{
arm_memcpy_q7(im2col_buf, input_data + (i_ker_y * input_x + i_ker_x) * input_ch, input_ch);
}
im2col_buf += input_ch;
}
}
buffer_fill_cnt++;
/* Computation is performed once 4 columns are filled */
if (buffer_fill_cnt == 4 && (padded == 0))
{
buffer_fill_cnt = 0;
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t sum_row;
int32_t acc[4];
(void)arm_nn_mat_mul_core_4x_s8(num_elem,
num_elem,
(q7_t *)buffer_a,
filter_data + num_elem * i_out_ch,
&sum_row,
acc);
int32x4_t s_offset = vdupq_n_s32(sum_row);
int32x4_t res = vldrwq_s32(acc);
s_offset = vmulq_n_s32(s_offset, input_offset);
if (bias_data)
{
res = vaddq_n_s32(res, bias_data[i_out_ch]);
}
res = vaddq_s32(res, s_offset);
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
res = vminq_s32(res, vdupq_n_s32(out_activation_max));
const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
vstrbq_scatter_offset_s32(out, scatter_offset, res);
out++;
}
out += (3 * output_ch);
im2col_buf = (q7_t *)buffer_a;
}
else if (buffer_fill_cnt == 4 && (padded != 0))
{
buffer_fill_cnt = 0;
out = arm_nn_mat_mult_s8(filter_data,
(q7_t *)buffer_a,
output_ch,
4,
output_shift,
output_mult,
out_offset,
input_offset,
0,
out_activation_min,
out_activation_max,
num_elem,
bias_data,
out);
im2col_buf = (q7_t *)buffer_a;
padded = 0;
}
}
}
/* Handle left over columns */
if (buffer_fill_cnt != 0)
{
out = arm_nn_mat_mult_s8(filter_data,
(q7_t *)buffer_a,
output_ch,
buffer_fill_cnt,
output_shift,
output_mult,
out_offset,
input_offset,
0,
out_activation_min,
out_activation_max,
num_elem,
bias_data,
out);
}
#elif defined(ARM_MATH_DSP)
(void)bias_dims;
int32_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/* Generate two columns from the input tensor for a GEMM computation */
q15_t *two_column_buf = buffer_a;
q7_t *out = output_data;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < output_x; i_out_x++)
{
for (i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y; i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
{
/* Filling 0 for out-of-bound paddings */
memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
}
else
{
/* Copying the pixel data to column */
arm_q7_to_q15_with_offset(input_data + (i_ker_y * input_x + i_ker_x) * input_ch, two_column_buf, input_ch, input_offset);
}
two_column_buf += input_ch;
}
}
/* Computation is performed once 2 columns are filled */
if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
{
out =
arm_nn_mat_mult_kernel_s8_s16(filter_data,
buffer_a,
output_ch,
output_shift,
output_mult,
out_offset,
out_activation_min,
out_activation_max,
input_ch * kernel_y * kernel_x,
bias_data,
out);
/* counter reset */
two_column_buf = buffer_a;
}
}
}
/* left-over because odd number of output pixels */
if (two_column_buf != buffer_a)
{
const q7_t *ker_a = filter_data;
int i;
for (i = 0; i < output_ch; i++)
{
/* Load the accumulator with bias first */
q31_t sum = 0;
if (bias_data)
{
sum = bias_data[i];
}
/* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
const q15_t *ip_as_col = buffer_a;
/* 4 multiply and accumulates are done in one loop. */
uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
while (col_count)
{
q31_t ker_a1, ker_a2;
q31_t ip_b1, ip_b2;
ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a1, ip_b1, sum);
ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a2, ip_b2, sum);
col_count--;
}
/* Handle left over mac */
col_count = input_ch * kernel_y * kernel_x & 0x3;
while (col_count)
{
q7_t ker_a1 = *ker_a++;
q15_t ip_b1 = *ip_as_col++;
sum += ker_a1 * ip_b1;
col_count--;
}
sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*out++ = (q7_t)sum;
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
(void)buffer_a;
int32_t i_out_ch, i_out_y, i_out_x, i_input_ch, i_ker_y, i_ker_x;
int32_t conv_out;
for (i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
for (i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < output_x; i_out_x++)
{
conv_out = 0;
const int32_t base_idx_y = stride_y * i_out_y - pad_y;
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
const int32_t ker_y_start = MAX(0, -base_idx_y);
const int32_t ker_x_start = MAX(0, -base_idx_x);
const int32_t ker_y_end = MIN(kernel_y, input_y - base_idx_y);
const int32_t ker_x_end = MIN(kernel_x, input_x - base_idx_x);
for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t in_row = base_idx_y + i_ker_y;
const int32_t in_col = base_idx_x + i_ker_x;
for (i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
conv_out +=
(input_data[(in_row * input_x + in_col) * input_ch + i_input_ch] + input_offset) *
filter_data[i_out_ch * input_ch * kernel_y * kernel_x +
(i_ker_y * kernel_x + i_ker_x) * input_ch + i_input_ch];
}
}
}
if (bias_data)
{
conv_out += bias_data[i_out_ch];
}
conv_out = arm_nn_requantize(conv_out, output_mult[i_out_ch], output_shift[i_out_ch]);
conv_out += out_offset;
conv_out = MAX(conv_out, out_activation_min);
conv_out = MIN(conv_out, out_activation_max);
output_data[i_out_ch + (i_out_y * output_x + i_out_x) * output_ch] = (int8_t)conv_out;
}
}
}
#endif
/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims* input_dims,
const cmsis_nn_dims* filter_dims)
{
#if defined(ARM_MATH_DSP)
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
/**
* @} end of NNConv group
*/
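
A hedged end-to-end sketch for the s8 kernel plus its size helper. All dimensions, offsets, and quantization values are placeholders, and cmsis_nn_context is assumed to carry the buf/size members used above (declared in arm_nn_types.h):

#include <stdlib.h>
#include "arm_nnfunctions.h"

arm_status conv_s8_example(const q7_t *input, const q7_t *weights,
                           const int32_t *bias, q7_t *output,
                           int32_t *per_ch_mult, int32_t *per_ch_shift)
{
    cmsis_nn_dims input_dims  = { .n = 1, .h = 28, .w = 28, .c = 8 };
    cmsis_nn_dims filter_dims = { .h = 3, .w = 3 };
    cmsis_nn_dims bias_dims   = { 0 };               /* unused by the kernel */
    cmsis_nn_dims output_dims = { .n = 1, .h = 28, .w = 28, .c = 16 };

    cmsis_nn_conv_params cp = { 0 };
    cp.padding.w = 1;  cp.padding.h = 1;
    cp.stride.w  = 1;  cp.stride.h  = 1;
    cp.input_offset   = 128;                         /* zero points are model-specific */
    cp.output_offset  = -128;
    cp.activation.min = -128;
    cp.activation.max = 127;

    cmsis_nn_per_channel_quant_params qp = { .multiplier = per_ch_mult,
                                             .shift      = per_ch_shift };

    cmsis_nn_context ctx;
    ctx.size = arm_convolve_s8_get_buffer_size(&input_dims, &filter_dims);
    ctx.buf  = ctx.size ? malloc(ctx.size) : NULL;   /* q15 im2col scratch */

    arm_status st = arm_convolve_s8(&ctx, &cp, &qp,
                                    &input_dims, input, &filter_dims, weights,
                                    &bias_dims, bias, &output_dims, output);
    free(ctx.buf);
    return st;
}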

View File

@ -1,148 +0,0 @@
/*
* Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_wrapper_s8.c
 * Description: s8 convolution layer wrapper function whose main purpose is to call the optimal kernel available in cmsis-nn to perform the convolution.
*
* $Date: May 18, 2020
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nn_types.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Convolution layer
*
* Refer header file for details.
*
*/
arm_status arm_convolve_wrapper_s8(const cmsis_nn_context* ctx,
const cmsis_nn_conv_params* conv_params,
const cmsis_nn_per_channel_quant_params* quant_params,
const cmsis_nn_dims* input_dims,
const q7_t *input_data,
const cmsis_nn_dims* filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims* bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims* output_dims,
q7_t *output_data)
{
if ((conv_params->padding.w == 0) &&
(conv_params->padding.h == 0) &&
(input_dims->c % 4 == 0) &&
(conv_params->stride.w == 1) &&
(conv_params->stride.h == 1) &&
(filter_dims->w == 1) &&
(filter_dims->h == 1))
{
return arm_convolve_1x1_s8_fast(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
else if ((output_dims->h == 1) &&
(input_dims->h == 1) &&
(filter_dims->h == 1) &&
(output_dims->w % 4 == 0) &&
(input_dims->n == 1))
{
return arm_convolve_1_x_n_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
else
{
return arm_convolve_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
}
int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params* conv_params,
const cmsis_nn_dims* input_dims,
const cmsis_nn_dims* filter_dims,
const cmsis_nn_dims* output_dims)
{
if ((conv_params->padding.w == 0) &&
(conv_params->padding.h == 0) &&
(input_dims->c % 4 == 0) &&
(conv_params->stride.w == 1) &&
(conv_params->stride.h == 1) &&
(filter_dims->w == 1) &&
(filter_dims->h == 1))
{
return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims);
}
else if ((output_dims->h == 1) &&
(input_dims->h == 1) &&
(filter_dims->h == 1) &&
(output_dims->w % 4 == 0) &&
(input_dims->n == 1))
{
return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims);
}
else
{
return arm_convolve_s8_get_buffer_size(input_dims, filter_dims);
}
}
/**
* @} end of NNConv group
*/
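
Because the wrapper and its size helper apply the same dispatch conditions, the scratch buffer should be sized with the exact parameter structs later passed to the call, so both resolve to the same kernel. A generic sketch (conv_wrapper_example is a name introduced here):

#include <stdlib.h>
#include "arm_nnfunctions.h"

arm_status conv_wrapper_example(const cmsis_nn_conv_params *cp,
                                const cmsis_nn_per_channel_quant_params *qp,
                                const cmsis_nn_dims *in_d, const q7_t *in,
                                const cmsis_nn_dims *flt_d, const q7_t *flt,
                                const cmsis_nn_dims *bias_d, const int32_t *bias,
                                const cmsis_nn_dims *out_d, q7_t *out)
{
    cmsis_nn_context ctx;
    ctx.size = arm_convolve_wrapper_s8_get_buffer_size(cp, in_d, flt_d, out_d);
    ctx.buf  = ctx.size ? malloc(ctx.size) : NULL;
    arm_status st = arm_convolve_wrapper_s8(&ctx, cp, qp, in_d, in, flt_d, flt,
                                            bias_d, bias, out_d, out);
    free(ctx.buf);
    return st;
}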

View File

@ -1,213 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_3x3_s8.c
* Description: Optimized s8 depthwise convolution function for channel
* multiplier of 1 and 3x3 kernel size.
*
* $Date: May 14, 2020
* $Revision: V.2.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnsupportfunctions.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Optimized s8 depthwise convolution function with constraint that
* in_channel == out_channel and kernel_x == kernel_y == 3 with pads at most 1
*
* Refer prototype header file for details.
*
*/
arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
(void)ctx;
(void)bias_dims;
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t input_ch = input_dims->c;
const int32_t output_ch = output_dims->c;
const int32_t pad_x = dw_conv_params->padding.w;
const int32_t pad_y = dw_conv_params->padding.h;
const int32_t stride_x = dw_conv_params->stride.w;
const int32_t stride_y = dw_conv_params->stride.h;
const int32_t *output_shift = quant_params->shift;
const int32_t *output_mult = quant_params->multiplier;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_offset = dw_conv_params->output_offset;
const int32_t input_offset = dw_conv_params->input_offset;
const int32_t output_activation_min = dw_conv_params->activation.min;
const int32_t output_activation_max = dw_conv_params->activation.max;
/* Check input constraint: input_ch == output_ch */
if (input_ch != output_ch)
{
return ARM_MATH_SIZE_MISMATCH;
}
/* Check input constraints: pad_x <= 1 and a 3x3 kernel */
if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3)
{
return ARM_MATH_ARGUMENT_ERROR;
}
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
int32_t in_ch = 0;
int32_t ker_w_start = MAX(0, -in_w);
for (; in_ch <= (input_ch - 4); in_ch += 4)
{
int32_t out_buff0 = bias[in_ch + 0];
int32_t out_buff1 = bias[in_ch + 1];
int32_t out_buff2 = bias[in_ch + 2];
int32_t out_buff3 = bias[in_ch + 3];
const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;
for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
{
int32_t in_val = 0;
int32_t ker_val = 0;
if (ker_w_start == 0)
{
in_val = arm_nn_read_q7x4(input_ptr);
ker_val = arm_nn_read_q7x4(kernel_ptr);
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
}
in_val = arm_nn_read_q7x4(input_ptr + input_ch);
ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch);
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
if ((input_x - in_w) >= 3)
{
in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1));
ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1));
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
}
input_ptr += (input_ch * input_x);
kernel_ptr += (input_ch * 3);
}
out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]);
out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]);
out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]);
out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]);
out_buff0 += output_offset;
out_buff1 += output_offset;
out_buff2 += output_offset;
out_buff3 += output_offset;
out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max);
out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max);
out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max);
out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff0;
output[out_idx++] = (int8_t)out_buff1;
output[out_idx++] = (int8_t)out_buff2;
output[out_idx++] = (int8_t)out_buff3;
}
// Leftover
for (; in_ch < input_ch; ++in_ch)
{
int32_t out_buff = bias[in_ch];
const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;
for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
{
if (ker_w_start == 0)
{
out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr);
}
out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch);
if ((input_x - in_w) >= 3)
{
out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1));
}
input_ptr += (input_ch * input_x);
kernel_ptr += (input_ch * 3);
}
out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]);
out_buff += output_offset;
out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff;
}
}
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
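
The unrolled channel loop above pulls four int8 lanes out of each 32-bit word returned by arm_nn_read_q7x4 using shifts and narrowing casts. A plain-C model of one such four-lane MAC step (mac_q7x4_ref is a sketch name, not a CMSIS API):

#include <stdint.h>

/* Reference model of one four-lane MAC on packed q7x4 words: each int8
 * lane of the input word is offset, then multiplied by the matching
 * kernel lane and accumulated into its own 32-bit sum. */
static inline void mac_q7x4_ref(int32_t in_val, int32_t ker_val,
                                int32_t input_offset, int32_t out_buff[4])
{
    for (int lane = 0; lane < 4; lane++)
    {
        int8_t in_b  = (int8_t)(in_val  >> (8 * lane));
        int8_t ker_b = (int8_t)(ker_val >> (8 * lane));
        out_buff[lane] += ((int32_t)in_b + input_offset) * ker_b;
    }
}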

View File

@ -1,249 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s8.c
* Description: s8 version of depthwise convolution.
*
* $Date: May 14, 2020
* $Revision: V.2.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
static void depthwise_conv_s8_mult_4(const int8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const int8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
int8_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; ++in_ch, out_ch += ch_mult)
{
for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
{
int32_t out_buff[4];
out_buff[0] = bias[out_ch + 0 + mult_tile];
out_buff[1] = bias[out_ch + 1 + mult_tile];
out_buff[2] = bias[out_ch + 2 + mult_tile];
out_buff[3] = bias[out_ch + 3 + mult_tile];
for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
{
int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); ++ker_w, ker_idx += output_ch)
{
int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset;
out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile];
out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile];
out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile];
out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile];
}
}
#if defined(ARM_MATH_MVEI)
(void)out_idx;
int32x4_t res = vldrwq_s32(out_buff);
res = arm_requantize_mve_32x4(res, vldrwq_s32(&output_mult[out_ch + mult_tile]), vldrwq_s32(&output_shift[out_ch + mult_tile]));
res = vaddq_n_s32(res, output_offset);
res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
res = vminq_s32(res, vdupq_n_s32(output_activation_max));
vstrbq_s32(output, res);
output += 4;
#else
out_buff[0] = arm_nn_requantize(out_buff[0], output_mult[out_ch + 0 + mult_tile], output_shift[out_ch + 0 + mult_tile]);
out_buff[1] = arm_nn_requantize(out_buff[1], output_mult[out_ch + 1 + mult_tile], output_shift[out_ch + 1 + mult_tile]);
out_buff[2] = arm_nn_requantize(out_buff[2], output_mult[out_ch + 2 + mult_tile], output_shift[out_ch + 2 + mult_tile]);
out_buff[3] = arm_nn_requantize(out_buff[3], output_mult[out_ch + 3 + mult_tile], output_shift[out_ch + 3 + mult_tile]);
out_buff[0] += output_offset;
out_buff[1] += output_offset;
out_buff[2] += output_offset;
out_buff[3] += output_offset;
out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max);
out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max);
out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max);
out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff[0];
output[out_idx++] = (int8_t)out_buff[1];
output[out_idx++] = (int8_t)out_buff[2];
output[out_idx++] = (int8_t)out_buff[3];
#endif
}
}
}
}
}
static void depthwise_conv_s8_generic(const q7_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const q7_t *kernel,
const uint16_t output_ch,
const uint16_t ch_mult,
const uint16_t kernel_x,
const uint16_t kernel_y,
const uint16_t pad_x,
const uint16_t pad_y,
const uint16_t stride_x,
const uint16_t stride_y,
const int32_t *bias,
q7_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
(void)output_ch;
int i_out = 0;
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
int32_t acc_0;
/* Condition for kernel start dimension: (base_idx_<x,y> + ker_<x,y>_start) >= 0 */
const int ker_y_start = MAX(0, -base_idx_y);
const int ker_x_start = MAX(0, -base_idx_x);
/* Condition for kernel end dimension: (base_idx_<x,y> + ker_<x,y>_end) < input_<x,y> */
const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
const int ker_x_end = MIN(kernel_x, input_x - base_idx_x);
acc_0 = bias[idx_out_ch];
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y;
for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x;
int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
acc_0 += (input[idx_0] + input_offset) * kernel[ker_idx_0];
}
}
/* Requantize and clamp output to provided range */
acc_0 = arm_nn_requantize(acc_0, output_mult[idx_out_ch], output_shift[idx_out_ch]);
acc_0 += output_offset;
acc_0 = MAX(acc_0, output_activation_min);
acc_0 = MIN(acc_0, output_activation_max);
output[i_out++] = acc_0;
}
}
}
}
}
/*
* Basic s8 depthwise convolution function.
*
* Refer header file for details.
* Optimization using DSP extension is not available for the generic case where channel multiplier is > 1.
*
*/
arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
(void)dw_conv_params->dilation;
(void)ctx;
if (dw_conv_params->ch_mult % 4 == 0)
{
depthwise_conv_s8_mult_4(input, input_dims->w, input_dims->h, input_dims->c, kernel, output_dims->c, dw_conv_params->ch_mult, filter_dims->w, filter_dims->h,
dw_conv_params->padding.w, dw_conv_params->padding.h, dw_conv_params->stride.w, dw_conv_params->stride.h, bias, output,
quant_params->shift, quant_params->multiplier, output_dims->w, output_dims->h, dw_conv_params->output_offset,
dw_conv_params->input_offset, dw_conv_params->activation.min, dw_conv_params->activation.max);
}
else
{
depthwise_conv_s8_generic(input, input_dims->w, input_dims->h, input_dims->c, kernel, output_dims->c, dw_conv_params->ch_mult, filter_dims->w, filter_dims->h,
dw_conv_params->padding.w, dw_conv_params->padding.h, dw_conv_params->stride.w, dw_conv_params->stride.h, bias, output,
quant_params->shift, quant_params->multiplier, output_dims->w, output_dims->h, dw_conv_params->output_offset,
dw_conv_params->input_offset, dw_conv_params->activation.min, dw_conv_params->activation.max);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
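
A minimal call sketch for the dispatcher above. With ch_mult = 2 the generic path is taken, since 2 % 4 != 0; output channels equal input channels times the multiplier. All sizes, offsets, and quantization arrays are placeholders:

#include "arm_nnfunctions.h"

arm_status dw_conv_example(const q7_t *input, const q7_t *kernel,
                           const int32_t *bias, q7_t *output,
                           int32_t *per_ch_mult, int32_t *per_ch_shift)
{
    cmsis_nn_dims input_dims  = { .n = 1, .h = 16, .w = 16, .c = 8 };
    cmsis_nn_dims filter_dims = { .h = 3, .w = 3 };
    cmsis_nn_dims bias_dims   = { 0 };
    cmsis_nn_dims output_dims = { .n = 1, .h = 16, .w = 16, .c = 16 };  /* 8 * ch_mult */

    cmsis_nn_dw_conv_params p = { 0 };
    p.ch_mult = 2;
    p.padding.w = 1;  p.padding.h = 1;
    p.stride.w  = 1;  p.stride.h  = 1;
    p.input_offset = 128;  p.output_offset = -128;    /* model-specific */
    p.activation.min = -128;  p.activation.max = 127;

    cmsis_nn_per_channel_quant_params qp = { .multiplier = per_ch_mult,
                                             .shift      = per_ch_shift };
    cmsis_nn_context ctx = { 0 };                     /* ctx is unused by this kernel */

    return arm_depthwise_conv_s8(&ctx, &p, &qp, &input_dims, input,
                                 &filter_dims, kernel, &bias_dims, bias,
                                 &output_dims, output);
}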

View File

@ -1,425 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s8_opt.c
* Description: Optimized s8 depthwise separable convolution function for
* channel multiplier of 1.
*
* $Date: May 29, 2020
* $Revision: V.2.0.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnsupportfunctions.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel
*
* Refer prototype header file for details.
*
*/
arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t input_ch = input_dims->c;
const int32_t output_ch = output_dims->c;
const int32_t kernel_x = filter_dims->w;
const int32_t kernel_y = filter_dims->h;
const int32_t pad_x = dw_conv_params->padding.w;
const int32_t pad_y = dw_conv_params->padding.h;
const int32_t stride_x = dw_conv_params->stride.w;
const int32_t stride_y = dw_conv_params->stride.h;
const int32_t *output_shift = quant_params->shift;
const int32_t *output_mult = quant_params->multiplier;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_offset = dw_conv_params->output_offset;
const int32_t input_offset = dw_conv_params->input_offset;
const int32_t output_activation_min = dw_conv_params->activation.min;
const int32_t output_activation_max = dw_conv_params->activation.max;
q15_t *buffer_a = (q15_t *)ctx->buf;
/* Check input constraint: input_ch == output_ch */
if (input_ch != output_ch)
{
return ARM_MATH_SIZE_MISMATCH;
}
#ifdef ARM_MATH_MVEI
(void)bias_dims;
/* Generate two columns from the input tensor */
q7_t *lhs_buffer = (q7_t *)buffer_a;
q7_t *out = output;
int padded = 0;
int buffer_count = 0;
const int32_t kernel_size = kernel_x * kernel_y;
/* This part implements the im2col function */
for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
{
for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
{
for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
{
for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
{
arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)input_ch);
padded = 1;
}
else
{
arm_memcpy_q7(lhs_buffer, input + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)input_ch);
}
lhs_buffer += input_ch;
}
}
buffer_count++;
if (buffer_count == 4)
{
lhs_buffer = (q7_t *)buffer_a;
if (padded == 0)
{
out = arm_nn_depthwise_conv_nt_t_s8(lhs_buffer,
kernel,
input_offset,
input_ch,
output_shift,
output_mult,
output_offset,
output_activation_min,
output_activation_max,
kernel_size,
bias,
out);
}
else
{
out = arm_nn_depthwise_conv_nt_t_padded_s8(lhs_buffer,
kernel,
input_offset,
input_ch,
output_shift,
output_mult,
output_offset,
output_activation_min,
output_activation_max,
kernel_size,
bias,
out);
padded = 0;
}
buffer_count = 0;
}
}
}
/* Handle left over buffers */
lhs_buffer = (q7_t *)buffer_a;
for (int i_buf = 0; i_buf < buffer_count; i_buf++)
{
int32_t loop_count = (input_ch + 3) / 4;
int32_t num_ch_to_process = input_ch;
for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
num_ch_to_process -= 4, offset += 4, i_loop_cnt++)
{
const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset;
const int8_t *row_0 = kernel + offset;
int32x4_t out_0 = vldrwq_s32(&bias[offset]);
for (int i_ker = 0; i_ker < kernel_size; i_ker++)
{
const int32x4_t ker_0 = vldrbq_s32(row_0);
int32x4_t ip_0 = vldrbq_s32(col_0);
ip_0 = vaddq_n_s32(ip_0, input_offset);
out_0 += vmulq_s32(ip_0, ker_0);
col_0 += input_ch;
row_0 += input_ch;
}
const int32x4_t mult = vldrwq_s32(&output_mult[offset]);
const int32x4_t shift = vldrwq_s32(&output_shift[offset]);
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_0 = vaddq_n_s32(out_0, output_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max));
mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process);
vstrbq_p_s32(out, out_0, p);
out += 4;
}
const int tail_ch = input_ch & 0x3;
if (tail_ch != 0)
{
out -= (4 - tail_ch);
}
}
#elif defined(ARM_MATH_DSP)
(void)bias_dims;
/* Run the following code in cores using DSP extension */
q15_t *const col_buffer_start = buffer_a;
q15_t *col_buffer = col_buffer_start;
const int32_t *const bias_start_pos = bias;
const q31_t *const out_mult_start_pos = output_mult;
const q31_t *const out_shift_start_pos = output_shift;
uint16_t row_count;
uint16_t row_shift;
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
/* Out of bounds is only considered for the y axis, as it provides a contiguous zeroing
   opportunity, unlike the x axis */
const int ker_y_start = MAX(0, -base_idx_y);
/* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */
const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
int32_t index = 0;
if (ker_y_start != 0)
{
memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t));
index += (kernel_x * input_ch) * ker_y_start;
}
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y;
for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x;
if (idx_x < 0 || idx_x >= input_x)
{
memset(&col_buffer[index], 0, input_ch * sizeof(q15_t));
}
else
{
arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch, &col_buffer[index], input_ch, input_offset);
}
index += input_ch;
}
}
const int diff = kernel_y - ker_y_end;
if (diff != 0)
{
memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t));
}
row_count = output_ch / 4;
row_shift = 0;
bias = bias_start_pos;
output_mult = out_mult_start_pos;
output_shift = out_shift_start_pos;
while (row_count)
{
q31_t sum = *bias++;
q31_t sum_2 = *bias++;
q31_t sum_3 = *bias++;
q31_t sum_4 = *bias++;
uint16_t col_count = (kernel_x * kernel_y) / 2;
q15_t *col_pos = col_buffer_start + row_shift;
const q7_t *row_pos = kernel + row_shift;
row_shift += 4;
while (col_count)
{
/* General idea is to read 4 + 4 (input, kernel) pairs and re-arrange them in the right order to
   use in a SMLAD instruction. One run of this loop produces 4 partial outputs with 8 MACs. */
/* Note: variable names can be improved here to align with rows and columns. */
q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c;
/* Read 4 weights */
ip_b1 = arm_nn_read_q7x4(row_pos);
ip_a1 = arm_nn_read_q7x4(row_pos + input_ch);
op_a = arm_nn_read_q15x2(col_pos);
op_b = arm_nn_read_q15x2(col_pos + input_ch);
ip_a2 = __SXTB16(ip_b1);
ip_b1 = __SXTB16(__ROR(ip_b1, 8));
ip_b2 = __SXTB16(ip_a1);
ip_a1 = __SXTB16(__ROR(ip_a1, 8));
op_c = __PKHBT(op_b, op_a, 16);
op_a = __PKHTB(op_b, op_a, 16);
op_b = __PKHBT(ip_b2, ip_a2, 16);
sum = __SMLAD(op_c, op_b, sum);
op_b = __PKHBT(ip_b1, ip_a1, 16);
sum_2 = __SMLAD(op_a, op_b, sum_2);
op_a = arm_nn_read_q15x2(col_pos + 2);
op_b = arm_nn_read_q15x2(col_pos + input_ch + 2);
op_c = __PKHBT(op_b, op_a, 16);
op_a = __PKHTB(op_b, op_a, 16);
op_b = __PKHTB(ip_a2, ip_b2, 16);
sum_3 = __SMLAD(op_c, op_b, sum_3);
op_b = __PKHTB(ip_a1, ip_b1, 16);
sum_4 = __SMLAD(op_a, op_b, sum_4);
row_pos += input_ch << 1;
col_pos += input_ch << 1;
col_count--;
}
col_count = (kernel_x * kernel_y) & 0x1;
while (col_count)
{
sum += row_pos[0] * col_pos[0];
sum_2 += row_pos[1] * col_pos[1];
sum_3 += row_pos[2] * col_pos[2];
sum_4 += row_pos[3] * col_pos[3];
row_pos += input_ch;
col_pos += input_ch;
col_count--;
}
sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
sum += output_offset;
sum = MAX(sum, output_activation_min);
sum = MIN(sum, output_activation_max);
*output++ = (q7_t)sum;
sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++);
sum_2 += output_offset;
sum_2 = MAX(sum_2, output_activation_min);
sum_2 = MIN(sum_2, output_activation_max);
*output++ = (q7_t)sum_2;
sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++);
sum_3 += output_offset;
sum_3 = MAX(sum_3, output_activation_min);
sum_3 = MIN(sum_3, output_activation_max);
*output++ = (q7_t)sum_3;
sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++);
sum_4 += output_offset;
sum_4 = MAX(sum_4, output_activation_min);
sum_4 = MIN(sum_4, output_activation_max);
*output++ = (q7_t)sum_4;
row_count--;
}
row_count = output_ch & 0x3;
while (row_count)
{
q15_t *col_pos = col_buffer_start + row_shift;
const q7_t *row_pos = kernel + row_shift;
q31_t sum = *bias++;
const uint16_t col_count = (kernel_x * kernel_y);
row_shift += 1;
for (int i = 0; i < col_count; i++)
{
sum += row_pos[i * input_ch] * col_pos[i * input_ch];
}
sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
sum += output_offset;
sum = MAX(sum, output_activation_min);
sum = MIN(sum, output_activation_max);
*output++ = (q7_t)sum;
row_count--;
}
// clear counter and pointers
col_buffer = col_buffer_start;
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
return arm_depthwise_conv_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
kernel,
bias_dims,
bias,
output_dims,
output);
#endif /* ARM_MATH_MVEI | ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_MVEI)
/* The + 4 accounts for the out-of-bounds read of the lhs buffers in the *_nt_t_* functions. */
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t) + 4;
#elif defined(ARM_MATH_DSP)
return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
/**
* @} end of NNConv group
*/
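
One way to provision the scratch buffer statically rather than querying at run time: size it for the largest of the three builds, which per the helper above is the MVE formula. A sketch under the assumption of fixed upper bounds (MAX_CH and MAX_KER_AREA are placeholders):

#include "arm_nnfunctions.h"

#define MAX_CH       16                 /* placeholder channel bound */
#define MAX_KER_AREA (3 * 3)            /* placeholder kernel bound  */

/* Worst case across builds: 2 * c * w * h int16 entries + 4 bytes (MVE). */
static int16_t dw_scratch[2 * MAX_CH * MAX_KER_AREA + 2];   /* +2 int16 = +4 bytes */

void dw_ctx_init(cmsis_nn_context *ctx)
{
    ctx->buf  = dw_scratch;
    ctx->size = sizeof(dw_scratch);
}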

View File

@ -1,294 +0,0 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_u8_basic_ver1.c
* Description: u8 depthwise convolution function
*
* $Date: May 29, 2020
* $Revision: V.1.1.0
*
* Target : Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
static void depthwise_conv_u8_mult_4(const uint8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const uint8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
uint8_t *output,
const int32_t output_shift,
const int32_t output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; ++in_ch, out_ch += ch_mult)
{
for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
{
int32_t out_buff[4];
out_buff[0] = 0;
out_buff[1] = 0;
out_buff[2] = 0;
out_buff[3] = 0;
for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
{
int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); ++ker_w, ker_idx += output_ch)
{
int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset;
out_buff[0] += in_val * (kernel[ker_idx + 0 + mult_tile] + filter_offset);
out_buff[1] += in_val * (kernel[ker_idx + 1 + mult_tile] + filter_offset);
out_buff[2] += in_val * (kernel[ker_idx + 2 + mult_tile] + filter_offset);
out_buff[3] += in_val * (kernel[ker_idx + 3 + mult_tile] + filter_offset);
}
}
if (bias != NULL)
{
out_buff[0] += bias[out_ch + 0 + mult_tile];
out_buff[1] += bias[out_ch + 1 + mult_tile];
out_buff[2] += bias[out_ch + 2 + mult_tile];
out_buff[3] += bias[out_ch + 3 + mult_tile];
}
out_buff[0] = arm_nn_requantize(out_buff[0], output_mult, output_shift);
out_buff[1] = arm_nn_requantize(out_buff[1], output_mult, output_shift);
out_buff[2] = arm_nn_requantize(out_buff[2], output_mult, output_shift);
out_buff[3] = arm_nn_requantize(out_buff[3], output_mult, output_shift);
out_buff[0] += output_offset;
out_buff[1] += output_offset;
out_buff[2] += output_offset;
out_buff[3] += output_offset;
out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max);
out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max);
out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max);
out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max);
output[out_idx++] = (uint8_t)out_buff[0];
output[out_idx++] = (uint8_t)out_buff[1];
output[out_idx++] = (uint8_t)out_buff[2];
output[out_idx++] = (uint8_t)out_buff[3];
}
}
}
}
}
static void depthwise_conv_u8_generic(const uint8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const uint8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
uint8_t *output,
const int32_t output_shift,
const int32_t output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
(void)output_ch;
int i_out = 0;
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
int32_t acc_0;
/* Condition for kernel start dimension: (base_idx_<x,y> + ker_<x,y>_start) >= 0 */
const int ker_y_start = MAX(0, -base_idx_y);
const int ker_x_start = MAX(0, -base_idx_x);
/* Condition for kernel end dimension: (base_idx_<x,y> + ker_<x,y>_end) < input_<x,y> */
const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
const int ker_x_end = MIN(kernel_x, input_x - base_idx_x);
acc_0 = 0;
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y;
for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x;
int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
acc_0 += (input[idx_0] + input_offset) * (kernel[ker_idx_0] + filter_offset);
}
}
if (bias != NULL)
{
acc_0 += bias[idx_out_ch];
}
/* Requantize and clamp output to provided range */
acc_0 = arm_nn_requantize(acc_0, output_mult, output_shift);
acc_0 += output_offset;
acc_0 = MAX(acc_0, output_activation_min);
acc_0 = MIN(acc_0, output_activation_max);
output[i_out++] = acc_0;
}
}
}
}
}
/**
* @brief uint8 depthwise convolution function with asymmetric quantization
*
* @param[in] input Pointer to input tensor
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_ch Channels in input tensor
* @param[in] kernel Pointer to kernel weights
* @param[in] kernel_x Width of kernel
* @param[in] kernel_y Height of kernel
 * @param[in] ch_mult Channel multiplier, i.e., output channels per input channel
* @param[in] pad_x Padding sizes x
* @param[in] pad_y Padding sizes y
* @param[in] stride_x Convolution stride along the width
* @param[in] stride_y Convolution stride along the height
* @param[in] dilation_x Dilation along width. Not used and intended for future enhancement.
* @param[in] dilation_y Dilation along height. Not used and intended for future enhancement.
* @param[in] bias Pointer to optional bias values. If no bias is
 * available, NULL is expected
* @param[in] input_offset Input tensor zero offset
* @param[in] filter_offset Kernel tensor zero offset
* @param[in] output_offset Output tensor zero offset
* @param[in,out] output Pointer to output tensor
* @param[in] output_x Width of output tensor
* @param[in] output_y Height of output tensor
* @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255}
 * @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255}
* @param[in] output_shift Amount of right-shift for output
* @param[in] output_mult Output multiplier for requantization
* @return The function returns one of the following
 * <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported dimensions of tensors
* <code>ARM_MATH_SUCCESS</code> - Successful operation
* <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available
*
*
*/
arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const uint8_t *kernel,
const uint16_t kernel_x,
const uint16_t kernel_y,
const int16_t ch_mult,
const int16_t pad_x,
const int16_t pad_y,
const int16_t stride_x,
const int16_t stride_y,
const int16_t dilation_x,
const int16_t dilation_y,
const int32_t *bias,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_offset,
uint8_t *output,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max,
const int32_t output_shift,
const int32_t output_mult)
{
(void)dilation_x;
(void)dilation_y;
if (ch_mult % 4 == 0)
{
depthwise_conv_u8_mult_4(input, input_x, input_y, input_ch, kernel, ch_mult * input_ch, ch_mult,
kernel_x, kernel_y, pad_x, pad_y, stride_x, stride_y, bias, output,
output_shift, output_mult, output_x, output_y, output_offset, input_offset,
filter_offset, output_activation_min, output_activation_max);
}
else
{
depthwise_conv_u8_generic(input, input_x, input_y, input_ch, kernel, ch_mult * input_ch, ch_mult,
kernel_x, kernel_y, pad_x, pad_y, stride_x, stride_y, bias,
output, output_shift, output_mult, output_x, output_y, output_offset,
input_offset, filter_offset, output_activation_min, output_activation_max);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
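/*
 * Illustrative usage sketch (not from the original source): a single-channel
 * 8x8 uint8 input convolved depthwise with a 3x3 kernel, ch_mult = 1 and
 * "same" padding. All dimensions, offsets and requantization parameters
 * below are hypothetical placeholders.
 */
static void depthwise_u8_example(const uint8_t *in_8x8,     /* 8 * 8 * 1 values */
                                 const uint8_t *kernel_3x3, /* 3 * 3 * 1 values */
                                 const int32_t *bias,       /* 1 entry, or NULL */
                                 uint8_t *out_8x8)          /* 8 * 8 * 1 values */
{
    (void)arm_depthwise_conv_u8_basic_ver1(in_8x8, 8, 8, 1,
                                           kernel_3x3, 3, 3,
                                           1,           /* ch_mult */
                                           1, 1,        /* pad_x, pad_y */
                                           1, 1,        /* stride_x, stride_y */
                                           1, 1,        /* dilation (unused) */
                                           bias,
                                           -128,        /* input_offset (hypothetical) */
                                           0,           /* filter_offset (hypothetical) */
                                           128,         /* output_offset (hypothetical) */
                                           out_8x8, 8, 8,
                                           0, 255,      /* activation range */
                                           -7,          /* output_shift (hypothetical) */
                                           0x40000000); /* output_mult (hypothetical) */
}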


@ -1,133 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_wrapper_s8.c
* Description: Wrapper API to select appropriate depthwise conv API based
* on dimensions.
*
* $Date: May 29, 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* s8 Depthwise conv wrapper function
*
* Refer to the header file for details.
*
*/
arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *filter,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
arm_status status = ARM_MATH_SUCCESS;
if (1 == dw_conv_params->ch_mult)
{
#if !defined(ARM_MATH_MVEI)
if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1))
{
status = arm_depthwise_conv_3x3_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
else
#endif
{
status = arm_depthwise_conv_s8_opt(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
}
else
{
status = arm_depthwise_conv_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
/* Return to application */
return status;
}
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims)
{
(void)dw_conv_params;
int32_t size = 0;
if (input_dims->c == output_dims->c)
{
size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims);
}
return size;
}
/**
* @} end of NNConv group
*/
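/*
 * Illustrative call sequence (a sketch, not from the original source): query
 * the scratch buffer requirement first, then invoke the wrapper. The
 * cmsis_nn_context fields (buf, size) follow the types in arm_nnfunctions.h;
 * the malloc()-based allocation (requires <stdlib.h>) and the contents of the
 * parameter structs are assumed to be set up by the caller.
 */
static arm_status dw_conv_wrapper_example(const cmsis_nn_dw_conv_params *dw_conv_params,
                                          const cmsis_nn_per_channel_quant_params *quant_params,
                                          const cmsis_nn_dims *input_dims, const q7_t *input,
                                          const cmsis_nn_dims *filter_dims, const q7_t *filter,
                                          const cmsis_nn_dims *bias_dims, const int32_t *bias,
                                          const cmsis_nn_dims *output_dims, q7_t *output)
{
    cmsis_nn_context ctx;
    ctx.size = arm_depthwise_conv_wrapper_s8_get_buffer_size(dw_conv_params, input_dims,
                                                             filter_dims, output_dims);
    ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
    arm_status status = arm_depthwise_conv_wrapper_s8(&ctx, dw_conv_params, quant_params,
                                                      input_dims, input, filter_dims, filter,
                                                      bias_dims, bias, output_dims, output);
    free(ctx.buf); /* free(NULL) is a no-op */
    return status;
}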


@ -1,418 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_separable_conv_HWC_q7.c
* Description: Q7 depthwise separable convolution function
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 depthwise separable convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in equals ch_im_out
*
* Implementation:
* There are 3 nested loops here:
* Inner loop: calculate each output value with MAC instruction over an accumulator
* Mid loop: loop over different output channel
* Outer loop: loop over different output (x, y)
*/
arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x;
int16_t i_ker_y, i_ker_x;
q7_t *colBuffer = (q7_t *) bufferA;
q7_t *pBuffer = colBuffer;
const q7_t *pBias = bias;
q7_t *pOut = Im_out;
uint16_t rowCnt;
uint16_t row_shift;
/* do some checking here, basically ch_im_in == ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* we first do im2col here */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q7(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, ch_im_in);
} else
{
/* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */
memcpy(pBuffer, (q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* we will do the computation here for each channel */
rowCnt = ch_im_out >> 2;
row_shift = 0;
pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel * dim_kernel) >> 1;
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
row_shift += 4;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHTB(opB, inB1, 16);
inB1 = __PKHBT(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHTB(opB, inA1, 16);
inA1 = __PKHBT(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum3 = __SMLAD(opA, opB, sum3);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum4 = __SMLAD(opA, opB, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHBT(opB, inB1, 16);
inB1 = __PKHTB(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHBT(opB, inA1, 16);
inA1 = __PKHTB(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum4 = __SMLAD(opA, opB, sum4);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum3 = __SMLAD(opA, opB, sum3);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
#ifndef ARM_MATH_BIG_ENDIAN
/*
* r0 r1 r2 r3 r4 r5
* inA1, inA2, inB1, inB2, opA, opB
*/
asm volatile ("COL_LOOP_%=:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhtb r3, r5, r2, ASR #16\n"
"pkhbt r2, r2, r5, LSL #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhtb r1, r5, r0, ASR #16\n"
"pkhbt r0, r0, r5, LSL #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum], r4, r5, %[sum]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n":[sum]
"+r"(sum),[sum2] "+r"(sum2),
[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),
[pA] "+r"(pA):[colCnt]
"r"(colCnt),[ch_im_in] "r"(ch_im_in):"r0", "r1", "r2", "r3", "r4", "r5");
#else
/*
* r0 r1 r2 r3 r4 r5
* inA1, inA2, inB1, inB2, opA, opB
*/
asm volatile ("COL_LOOP_%=:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhbt r3, r5, r2, LSL #16\n"
"pkhtb r2, r2, r5, ASR #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhbt r1, r5, r0, LSL #16\n"
"pkhtb r0, r0, r5, ASR #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum], r4, r5, %[sum]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n":[sum]
"+r"(sum),[sum2] "+r"(sum2),
[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),
[pA] "+r"(pA):[colCnt]
"r"(colCnt),[ch_im_in] "r"(ch_im_in):"r0", "r1", "r2", "r3", "r4", "r5");
#endif /* ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = (dim_kernel * dim_kernel) & 0x1;
while (colCnt)
{
union arm_nnword inA, inB;
inA.word = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inB.word = arm_nn_read_q7x4(pB);
pB += ch_im_in;
sum += inA.bytes[0] * inB.bytes[0];
sum2 += inA.bytes[1] * inB.bytes[1];
sum3 += inA.bytes[2] * inB.bytes[2];
sum4 += inA.bytes[3] * inB.bytes[3];
colCnt--;
}
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t) __SSAT((sum2 >> out_shift), 8);
*pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8);
*pOut++ = (q7_t) __SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = ch_im_out & 0x3;
while (rowCnt)
{
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel * dim_kernel);
row_shift += 1;
while (colCnt)
{
q7_t A1 = *pA;
q7_t B1 = *pB;
pA += ch_im_in;
pB += ch_im_in;
sum += A1 * B1;
colCnt--;
}
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
rowCnt--;
}
/* clear counter and pointers */
pBuffer = colBuffer;
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y;
int conv_out;
/* do some checking here, basically ch_im_in == ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
// for each output
conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++)
{
int in_row = stride * i_out_y + i_ker_y - padding;
int in_col = stride * i_out_x + i_ker_x - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
conv_out +=
Im_in[(in_row *
dim_im_in +
in_col) *
ch_im_in +
i_ch_out] * wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out];
}
}
}
Im_out[(i_out_y * dim_im_out +
i_out_x) * ch_im_out + i_ch_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
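/*
 * Illustrative usage sketch with hypothetical dimensions: a 16-channel 32x32
 * input, 3x3 depthwise-separable kernel, stride 1, padding 1 (so the output
 * is also 32x32). bufferA provides the documented
 * 2*ch_im_in*dim_kernel*dim_kernel bytes of im2col scratch (CH*K*K q15_t
 * elements); bufferB is unused by this function. The shift values are
 * placeholders.
 */
#define DWS_CH  16
#define DWS_DIM 32
#define DWS_K   3
static q15_t dws_bufferA[DWS_CH * DWS_K * DWS_K];
static arm_status dws_q7_example(const q7_t *im_in, const q7_t *wt,
                                 const q7_t *bias, q7_t *im_out)
{
    return arm_depthwise_separable_conv_HWC_q7(im_in, DWS_DIM, DWS_CH, wt, DWS_CH,
                                               DWS_K, /* padding */ 1, /* stride */ 1,
                                               bias, /* bias_shift */ 0, /* out_shift */ 7,
                                               im_out, DWS_DIM, dws_bufferA, NULL);
}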


@ -1,413 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_separable_conv_HWC_q7_nonsquare.c
* Description: Q7 depthwise separable convolution function (non-square shape)
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 depthwise separable convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding sizes x
* @param[in] padding_y padding sizes y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* This function is the version with the full list of optimization tricks, but with
* some constraints:
* ch_im_in is equal to ch_im_out
*
*/
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t * wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t * bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t * Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t * bufferA,
q7_t * bufferB)
{
(void)bufferB;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
/*
* Implementation:
* There are 3 nested loops here:
* Inner loop: calculate each output value with MAC instruction over an accumulator
* Mid loop: loop over different output channel
* Outer loop: loop over different output (x, y)
*
*/
int16_t i_out_y, i_out_x;
int16_t i_ker_y, i_ker_x;
q7_t *colBuffer = (q7_t *) bufferA;
q7_t *pBuffer = colBuffer;
const q7_t *pBias = bias;
q7_t *pOut = Im_out;
uint16_t rowCnt;
uint16_t row_shift;
/* do some checking here, basically ch_im_in == ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* we first do im2col here */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q7(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, ch_im_in);
} else
{
/* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */
memcpy(pBuffer, (q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* we will do the computation here for each channel */
rowCnt = ch_im_out >> 2;
row_shift = 0;
pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1;
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
row_shift += 4;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHTB(opB, inB1, 16);
inB1 = __PKHBT(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHTB(opB, inA1, 16);
inA1 = __PKHBT(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum3 = __SMLAD(opA, opB, sum3);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum4 = __SMLAD(opA, opB, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHBT(opB, inB1, 16);
inB1 = __PKHTB(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHBT(opB, inA1, 16);
inA1 = __PKHTB(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum4 = __SMLAD(opA, opB, sum4);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum3 = __SMLAD(opA, opB, sum3);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
#ifndef ARM_MATH_BIG_ENDIAN
// r0 r1 r2 r3 r4 r5
// inA1, inA2, inB1, inB2, opA, opB
asm volatile ("COL_LOOP:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhtb r3, r5, r2, ASR #16\n"
"pkhbt r2, r2, r5, LSL #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhtb r1, r5, r0, ASR #16\n"
"pkhbt r0, r0, r5, LSL #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum], r4, r5, %[sum]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP\n":[sum] "+r"(sum),[sum2] "+r"(sum2),[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),[pA] "+r"(pA):[colCnt] "r"(colCnt),
[ch_im_in] "r"(ch_im_in):"r0", "r1", "r2", "r3", "r4", "r5");
#else
// r0 r1 r2 r3 r4 r5
// inA1, inA2, inB1, inB2, opA, opB
asm volatile ("COL_LOOP:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhbt r3, r5, r2, LSL #16\n"
"pkhtb r2, r2, r5, ASR #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhbt r1, r5, r0, LSL #16\n"
"pkhtb r0, r0, r5, ASR #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum], r4, r5, %[sum]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP\n":[sum] "+r"(sum),[sum2] "+r"(sum2),[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),[pA] "+r"(pA):[colCnt] "r"(colCnt),
[ch_im_in] "r"(ch_im_in):"r0", "r1", "r2", "r3", "r4", "r5");
#endif /*ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = (dim_kernel_x * dim_kernel_y) & 0x1;
while (colCnt)
{
union arm_nnword inA, inB;
inA.word = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inB.word = arm_nn_read_q7x4(pB);
pB += ch_im_in;
sum += inA.bytes[0] * inB.bytes[0];
sum2 += inA.bytes[1] * inB.bytes[1];
sum3 += inA.bytes[2] * inB.bytes[2];
sum4 += inA.bytes[3] * inB.bytes[3];
colCnt--;
}
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t) __SSAT((sum2 >> out_shift), 8);
*pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8);
*pOut++ = (q7_t) __SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = ch_im_out & 0x3;
while (rowCnt)
{
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel_x * dim_kernel_y);
row_shift += 1;
while (colCnt)
{
q7_t A1 = *pA;
q7_t B1 = *pB;
pA += ch_im_in;
pB += ch_im_in;
sum += A1 * B1;
colCnt--;
}
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
rowCnt--;
}
// clear counter and pointers
pBuffer = colBuffer;
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i_out_y, i_out_x, i_ch_out;
int i_ker_y, i_ker_x;
/* do some checking here, basically ch_im_in == ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
// for each output
int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++)
{
int in_row = stride_y * i_out_y + i_ker_y - padding_y;
int in_col = stride_x * i_out_x + i_ker_x - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] *
wt[(i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out];
}
}
}
Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] =
(q7_t) __SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
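/*
 * Illustrative sketch for the non-square variant (hypothetical dimensions):
 * an 8-channel 40x20 input, 3x3 kernel, stride 2 along x and 1 along y.
 * Each output dimension follows (in + 2*pad - k)/stride + 1, giving 20x20
 * here. Shift values are placeholders.
 */
#define DWSN_CH 8
static q15_t dwsn_bufferA[DWSN_CH * 3 * 3];
static arm_status dws_q7_nonsquare_example(const q7_t *im_in, const q7_t *wt,
                                           const q7_t *bias, q7_t *im_out)
{
    return arm_depthwise_separable_conv_HWC_q7_nonsquare(
        im_in, /* dim_im_in_x, dim_im_in_y */ 40, 20, DWSN_CH,
        wt, DWSN_CH, /* dim_kernel_x, dim_kernel_y */ 3, 3,
        /* padding_x, padding_y */ 1, 1, /* stride_x, stride_y */ 2, 1,
        bias, /* bias_shift */ 0, /* out_shift */ 7,
        im_out, /* dim_im_out_x, dim_im_out_y */ 20, 20,
        dwsn_bufferA, NULL);
}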


@ -1,219 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_depthwise_conv_s8_core.c
* Description: Depthwise convolution on im2col buffers.
*
* $Date: May 29, 2020
* $Revision: V.1.0.3
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/*
* Depthwise conv on an im2col buffer where the number of input channels
* equals the number of output channels.
*
* Refer to the header file for details.
*
*/
q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
const q15_t *col,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t kernel_size,
const int32_t *const output_bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
int32_t ch_per_loop = num_ch / 4;
const int32_t *bias = output_bias;
int8_t *out_tmp = out;
int32_t idx = 0;
while (ch_per_loop > 0)
{
int32x4_t ip_0;
int32x4_t ip_1;
int32_t ker_loop = kernel_size / 3;
int32x4_t out_0 = vldrwq_s32(bias);
int32x4_t out_1 = out_0;
bias += 4;
const int32_t offset = idx * 4;
const int8_t *row_0 = row + offset;
const int16_t *col_0 = col + offset;
const int16_t *col_1 = col + kernel_size * num_ch + offset;
int32x4_t ker_0 = vldrbq_s32(row_0);
while (ker_loop > 0)
{
const int8_t *row_1 = row_0 + num_ch;
const int8_t *row_2 = row_0 + 2 * num_ch;
const int32x4_t ker_1 = vldrbq_s32(row_1);
const int32x4_t ker_2 = vldrbq_s32(row_2);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_0);
out_1 += vmulq_s32(ip_1, ker_0);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_1);
out_1 += vmulq_s32(ip_1, ker_1);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_2);
out_1 += vmulq_s32(ip_1, ker_2);
row_0 += 3 * num_ch;
ker_0 = vldrbq_s32(row_0);
ker_loop--;
}
idx++;
/* Handle tail kernel elements */
ker_loop = kernel_size - ((kernel_size / 3) * 3);
while (ker_loop > 0)
{
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
out_0 += vmulq_s32(ip_0, ker_0);
out_1 += vmulq_s32(ip_1, ker_0);
col_0 += num_ch;
col_1 += num_ch;
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
row_0 += num_ch;
ker_0 = vldrbq_s32(row_0);
ker_loop--;
}
const int32x4_t mult = vldrwq_s32(out_mult);
const int32x4_t shift = vldrwq_s32(out_shift);
out_mult += 4;
out_shift += 4;
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
out_0 = vaddq_n_s32(out_0, out_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
vstrbq_s32(out_tmp, out_0);
out_1 = vaddq_n_s32(out_1, out_offset);
out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
vstrbq_s32(out_tmp + num_ch, out_1);
out_tmp += 4;
ch_per_loop--;
}
int32_t tail_ch = num_ch & 3;
if (tail_ch != 0)
{
int32_t ch_idx = (num_ch & ~3);
int32x4_t col_0_sum;
int32x4_t col_1_sum;
const int32_t single_buffer_size = kernel_size * num_ch;
for (int i = 0; i < tail_ch; i++)
{
const int16_t *col_pos_0 = col + ch_idx;
const int16_t *col_pos_1 = col_pos_0 + single_buffer_size;
const int8_t *row_pos = row + ch_idx;
int32_t sum_0 = bias[i];
int32_t sum_1 = bias[i];
for (int j = 0; j < kernel_size; j++)
{
const int8_t row_val = row_pos[j * num_ch];
sum_0 += row_val * col_pos_0[j * num_ch];
sum_1 += row_val * col_pos_1[j * num_ch];
}
col_0_sum[i] = sum_0;
col_1_sum[i] = sum_1;
ch_idx++;
}
const mve_pred16_t p = vctp32q((uint32_t)tail_ch);
const int32x4_t mult = vldrwq_z_s32(out_mult, p);
const int32x4_t shift = vldrwq_z_s32(out_shift, p);
col_0_sum = arm_requantize_mve_32x4(col_0_sum, mult, shift);
col_1_sum = arm_requantize_mve_32x4(col_1_sum, mult, shift);
col_0_sum = vaddq_n_s32(col_0_sum, out_offset);
col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min));
col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max));
vstrbq_p_s32(out_tmp, col_0_sum, p);
col_1_sum = vaddq_n_s32(col_1_sum, out_offset);
col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min));
col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max));
vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p);
out_tmp += tail_ch;
}
return out_tmp + num_ch;
#else
(void)row;
(void)col;
(void)num_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)kernel_size;
(void)output_bias;
(void)out;
return NULL;
#endif
}
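/*
 * Scalar sketch (for illustration only) of the computation above: each call
 * produces two output pixels from an im2col buffer holding two columns of
 * kernel_size * num_ch q15 samples back to back, with weights laid out as
 * [kernel_size][num_ch] and per-channel requantization. MIN/MAX and
 * arm_nn_requantize come from the library headers.
 */
static q7_t *dw_s8_core_reference(const q7_t *row, const q15_t *col, uint16_t num_ch,
                                  const int32_t *out_shift, const int32_t *out_mult,
                                  int32_t out_offset, int32_t act_min, int32_t act_max,
                                  uint16_t kernel_size, const int32_t *bias, q7_t *out)
{
    for (int ch = 0; ch < num_ch; ch++)
    {
        int32_t sum_0 = bias[ch];
        int32_t sum_1 = bias[ch];
        for (int k = 0; k < kernel_size; k++)
        {
            sum_0 += row[k * num_ch + ch] * col[k * num_ch + ch];
            sum_1 += row[k * num_ch + ch] * col[(kernel_size + k) * num_ch + ch];
        }
        sum_0 = arm_nn_requantize(sum_0, out_mult[ch], out_shift[ch]) + out_offset;
        sum_1 = arm_nn_requantize(sum_1, out_mult[ch], out_shift[ch]) + out_offset;
        out[ch]          = (q7_t)MIN(MAX(sum_0, act_min), act_max);
        out[num_ch + ch] = (q7_t)MIN(MAX(sum_1, act_min), act_max);
    }
    return out + 2 * num_ch; /* past both written output pixels */
}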


@ -1,179 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_q7_q15.c
* Description: Matrix-multiplication function for convolution
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @brief Matrix-multiplication function for convolution.
*
* @details Refer to the header file for details.
*
*/
q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
const q15_t * pInBuffer,
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut)
{
#if defined (ARM_MATH_DSP)
/* set up the second output pointers */
q7_t *pOut2 = pOut + ch_im_out;
const q7_t *pBias = bias;
uint16_t rowCnt = ch_im_out >> 1;
/* this loop over rows in A */
while (rowCnt)
{
/* setup pointers for B */
const q15_t *pB = pInBuffer;
const q15_t *pB2 = pB + numCol_A;
/* align the second pointer for A */
const q7_t *pA2 = pA + numCol_A;
/* init the sum with bias */
q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = numCol_A >> 2;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA11, inA12, inA21, inA22;
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
pA = read_and_pad(pA, &inA11, &inA12);
pA2 = read_and_pad(pA2, &inA21, &inA22);
sum = __SMLAD(inA11, inB1, sum);
sum2 = __SMLAD(inA11, inB2, sum2);
sum3 = __SMLAD(inA21, inB1, sum3);
sum4 = __SMLAD(inA21, inB2, sum4);
inB1 = arm_nn_read_q15x2_ia(&pB);
inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA12, inB1, sum);
sum2 = __SMLAD(inA12, inB2, sum2);
sum3 = __SMLAD(inA22, inB1, sum3);
sum4 = __SMLAD(inA22, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = numCol_A & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
q7_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8);
*pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8);
*pOut2++ = (q7_t) __SSAT((sum4 >> out_shift), 8);
/* skip the row computed with A2 */
pA += numCol_A;
rowCnt--;
} /* for over ch_im_out */
/* compute left-over row if any */
if (ch_im_out & 0x1)
{
/* setup pointers for B */
const q15_t *pB = pInBuffer;
const q15_t *pB2 = pB + numCol_A;
/* load the bias */
q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = numCol_A >> 2;
while (colCnt)
{
q31_t inA11, inA12;
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
pA = read_and_pad(pA, &inA11, &inA12);
sum = __SMLAD(inA11, inB1, sum);
sum2 = __SMLAD(inA11, inB2, sum2);
inB1 = arm_nn_read_q15x2_ia(&pB);
inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA12, inB1, sum);
sum2 = __SMLAD(inA12, inB2, sum2);
colCnt--;
}
colCnt = numCol_A & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
colCnt--;
}
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
*pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8);
}
pOut += ch_im_out;
/* return the new output pointer with offset */
return pOut;
#else
/* To be completed */
return NULL;
#endif /* ARM_MATH_DSP */
}
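/*
 * Scalar sketch (for illustration only) of what the kernel above computes:
 * each row of A (one output channel, numCol_A weights) is dotted with two
 * im2col columns held back to back in pInBuffer, yielding two adjacent
 * output pixels per channel. NN_ROUND and __SSAT come from the library
 * headers.
 */
static q7_t *mat_mult_q7_q15_reference(const q7_t *pA, const q15_t *pInBuffer,
                                       uint16_t ch_im_out, uint16_t numCol_A,
                                       uint16_t bias_shift, uint16_t out_shift,
                                       const q7_t *bias, q7_t *pOut)
{
    q7_t *pOut2 = pOut + ch_im_out;
    for (int r = 0; r < ch_im_out; r++)
    {
        q31_t sum  = ((q31_t)bias[r] << bias_shift) + NN_ROUND(out_shift);
        q31_t sum2 = sum;
        for (int c = 0; c < numCol_A; c++)
        {
            sum  += pA[r * numCol_A + c] * pInBuffer[c];
            sum2 += pA[r * numCol_A + c] * pInBuffer[numCol_A + c];
        }
        pOut[r]  = (q7_t)__SSAT(sum >> out_shift, 8);
        pOut2[r] = (q7_t)__SSAT(sum2 >> out_shift, 8);
    }
    return pOut2 + ch_im_out; /* advance past both columns of output */
}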


@ -1,129 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_q7_q15_reordered.c
* Description: Matrix-multiplication function for convolution with reordered columns
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_math.h"
/**
* @brief Matrix-multiplication function for convolution with re-ordered input.
*
* @details Refer to the header file for details.
*
*/
q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
const q15_t * pInBuffer,
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut)
{
#if defined (ARM_MATH_DSP)
/* set up the second output pointers */
q7_t *pOut2 = pOut + ch_im_out;
int i;
/* this loop over rows in A */
for (i = 0; i < ch_im_out; i += 2)
{
/* setup pointers for B */
const q15_t *pB = pInBuffer;
const q15_t *pB2 = pB + numCol_A;
/* align the second pointer for A */
const q7_t *pA2 = pA + numCol_A;
/* init the sum with bias */
q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = numCol_A >> 2;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA11, inA12, inA21, inA22;
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
pA = read_and_pad_reordered(pA, &inA11, &inA12);
pA2 = read_and_pad_reordered(pA2, &inA21, &inA22);
sum = __SMLAD(inA11, inB1, sum);
sum2 = __SMLAD(inA11, inB2, sum2);
sum3 = __SMLAD(inA21, inB1, sum3);
sum4 = __SMLAD(inA21, inB2, sum4);
inB1 = arm_nn_read_q15x2_ia(&pB);
inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA12, inB1, sum);
sum2 = __SMLAD(inA12, inB2, sum2);
sum3 = __SMLAD(inA22, inB1, sum3);
sum4 = __SMLAD(inA22, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = numCol_A & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
q7_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8);
*pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8);
*pOut2++ = (q7_t) __SSAT((sum4 >> out_shift), 8);
/* skip the row computed with A2 */
pA += numCol_A;
} /* for over ch_im_out */
pOut += ch_im_out;
/* return the new output pointer with offset */
return pOut;
#else
/* To be completed */
return NULL;
#endif /* ARM_MATH_DSP */
}
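/*
 * Why "reordered": a plain-C model (illustration only, little-endian) of the
 * lane order the DSP path consumes. __SXTB16 sign-extends bytes 0 and 2 of a
 * 32-bit word into two halfwords, and __SXTB16(__ROR(w, 8)) extracts bytes 1
 * and 3, so read_and_pad_reordered() hands each group of four q7 weights to
 * the MAC loop as the pairs (w0, w2) and (w1, w3). The kernel therefore
 * expects weights pre-shuffled offline to match the q15 input stream.
 */
static void sxtb16_lane_model(int32_t w, int16_t pair_a[2], int16_t pair_b[2])
{
    pair_a[0] = (int16_t)(int8_t)(w & 0xFF);         /* byte 0 */
    pair_a[1] = (int16_t)(int8_t)((w >> 16) & 0xFF); /* byte 2 */
    pair_b[0] = (int16_t)(int8_t)((w >> 8) & 0xFF);  /* byte 1 */
    pair_b[1] = (int16_t)(int8_t)((w >> 24) & 0xFF); /* byte 3 */
}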


@ -1,391 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_s8_s16.c
* Description: Matrix-multiplication function for convolution
*
* $Date: May 29, 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/*
* Matrix-multiplication function for convolution with per-channel requantization.
*
* Refer to the header file for details.
*
*/
q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
const q15_t *input_b,
const uint16_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int16_t activation_min,
const int16_t activation_max,
const uint16_t num_col_a,
const int32_t *const output_bias,
q7_t *out_0)
{
#if defined(ARM_MATH_MVEI)
#define ROW_PER_LOOP (4)
#define COL_PER_LOOP (8)
const q7_t *ip_a0_s8 = input_a;
q7_t *out_1 = out_0 + output_ch;
const int32_t *bias = output_bias;
int32_t row_count = output_ch / ROW_PER_LOOP;
while (row_count)
{
const q15_t *ip_b0_s16 = input_b;
const q15_t *ip_b1_s16 = input_b + num_col_a;
const q7_t *ip_a1_s8 = ip_a0_s8 + num_col_a;
const q7_t *ip_a2_s8 = ip_a0_s8 + num_col_a * 2;
const q7_t *ip_a3_s8 = ip_a0_s8 + num_col_a * 3;
q31_t ch_0_out_n = bias[0];
q31_t ch_1_out_n = bias[1];
q31_t ch_2_out_n = bias[2];
q31_t ch_3_out_n = bias[3];
q31_t ch_0_out_n1 = ch_0_out_n;
q31_t ch_1_out_n1 = ch_1_out_n;
q31_t ch_2_out_n1 = ch_2_out_n;
q31_t ch_3_out_n1 = ch_3_out_n;
bias += 4;
int32_t col_count = num_col_a / COL_PER_LOOP;
while (col_count)
{
// Load inputs
const int16x8_t ip_b0 = vld1q_s16(ip_b0_s16);
ip_b0_s16 += COL_PER_LOOP;
const int16x8_t ip_b1 = vld1q_s16(ip_b1_s16);
ip_b1_s16 += COL_PER_LOOP;
// Load filters
const int16x8_t ip_a0 = vldrbq_s16(ip_a0_s8);
ip_a0_s8 += COL_PER_LOOP;
const int16x8_t ip_a1 = vldrbq_s16(ip_a1_s8);
ip_a1_s8 += COL_PER_LOOP;
const int16x8_t ip_a2 = vldrbq_s16(ip_a2_s8);
ip_a2_s8 += COL_PER_LOOP;
const int16x8_t ip_a3 = vldrbq_s16(ip_a3_s8);
ip_a3_s8 += COL_PER_LOOP;
// MAC
ch_0_out_n += vmladavq_s16(ip_b0, ip_a0);
ch_1_out_n += vmladavq_s16(ip_b0, ip_a1);
ch_2_out_n += vmladavq_s16(ip_b0, ip_a2);
ch_3_out_n += vmladavq_s16(ip_b0, ip_a3);
ch_0_out_n1 += vmladavq_s16(ip_b1, ip_a0);
ch_1_out_n1 += vmladavq_s16(ip_b1, ip_a1);
ch_2_out_n1 += vmladavq_s16(ip_b1, ip_a2);
ch_3_out_n1 += vmladavq_s16(ip_b1, ip_a3);
col_count--;
}
/* Handle tail */
col_count = (num_col_a & (COL_PER_LOOP - 1)) - 1;
while (col_count >= 0)
{
const int32_t b0 = ip_b0_s16[col_count];
const int32_t b1 = ip_b1_s16[col_count];
ch_0_out_n += b0 * ip_a0_s8[col_count];
ch_1_out_n += b0 * ip_a1_s8[col_count];
ch_2_out_n += b0 * ip_a2_s8[col_count];
ch_3_out_n += b0 * ip_a3_s8[col_count];
ch_0_out_n1 += b1 * ip_a0_s8[col_count];
ch_1_out_n1 += b1 * ip_a1_s8[col_count];
ch_2_out_n1 += b1 * ip_a2_s8[col_count];
ch_3_out_n1 += b1 * ip_a3_s8[col_count];
col_count--;
}
ip_a0_s8 += (num_col_a & (COL_PER_LOOP - 1));
int32x4_t out_vec_0;
int32x4_t out_vec_1;
out_vec_0[0] = ch_0_out_n;
out_vec_0[1] = ch_1_out_n;
out_vec_0[2] = ch_2_out_n;
out_vec_0[3] = ch_3_out_n;
out_vec_1[0] = ch_0_out_n1;
out_vec_1[1] = ch_1_out_n1;
out_vec_1[2] = ch_2_out_n1;
out_vec_1[3] = ch_3_out_n1;
int32x4_t mult = vldrwq_s32(out_mult);
int32x4_t shift = vldrwq_s32(out_shift);
out_mult += ROW_PER_LOOP;
out_shift += ROW_PER_LOOP;
out_vec_0 = arm_requantize_mve_32x4(out_vec_0, mult, shift);
out_vec_1 = arm_requantize_mve_32x4(out_vec_1, mult, shift);
out_vec_0 = vaddq_n_s32(out_vec_0, out_offset);
out_vec_0 = vmaxq_s32(out_vec_0, vdupq_n_s32(activation_min));
out_vec_0 = vminq_s32(out_vec_0, vdupq_n_s32(activation_max));
vstrbq_s32(out_0, out_vec_0);
out_0 += ROW_PER_LOOP;
out_vec_1 = vaddq_n_s32(out_vec_1, out_offset);
out_vec_1 = vmaxq_s32(out_vec_1, vdupq_n_s32(activation_min));
out_vec_1 = vminq_s32(out_vec_1, vdupq_n_s32(activation_max));
vstrbq_s32(out_1, out_vec_1);
out_1 += ROW_PER_LOOP;
row_count--;
ip_a0_s8 += (num_col_a * 3);
}
row_count = output_ch & (ROW_PER_LOOP - 1);
if (row_count)
{
ip_a0_s8 = input_a + num_col_a * (output_ch & ~3);
const mve_pred16_t p = vctp32q((uint32_t)row_count);
int32x4_t out_vec_0 = vdupq_n_s32(0);
int32x4_t out_vec_1 = vdupq_n_s32(0);
int32x4_t mult_tail;
int32x4_t shift_tail;
for (int i_ch = 0; i_ch < row_count; i_ch++)
{
int32_t output_0 = bias[i_ch];
int32_t output_1 = bias[i_ch];
const q15_t *ip_b0_s16 = input_b;
const q15_t *ip_b1_s16 = input_b + num_col_a;
for (int i_idx = 0; i_idx < num_col_a; i_idx++)
{
output_0 += ip_b0_s16[i_idx] * ip_a0_s8[i_idx];
output_1 += ip_b1_s16[i_idx] * ip_a0_s8[i_idx];
}
ip_a0_s8 += num_col_a;
out_vec_0[i_ch] = output_0;
out_vec_1[i_ch] = output_1;
mult_tail[i_ch] = out_mult[i_ch];
shift_tail[i_ch] = out_shift[i_ch];
}
out_vec_0 = arm_requantize_mve_32x4(out_vec_0, mult_tail, shift_tail);
out_vec_1 = arm_requantize_mve_32x4(out_vec_1, mult_tail, shift_tail);
out_vec_0 = vaddq_n_s32(out_vec_0, out_offset);
out_vec_0 = vmaxq_s32(out_vec_0, vdupq_n_s32(activation_min));
out_vec_0 = vminq_s32(out_vec_0, vdupq_n_s32(activation_max));
vstrbq_p_s32(out_0, out_vec_0, p);
out_vec_1 = vaddq_n_s32(out_vec_1, out_offset);
out_vec_1 = vmaxq_s32(out_vec_1, vdupq_n_s32(activation_min));
out_vec_1 = vminq_s32(out_vec_1, vdupq_n_s32(activation_max));
vstrbq_p_s32(out_1, out_vec_1, p);
out_1 += row_count;
}
return out_1;
#elif defined(ARM_MATH_DSP)
/* set up the second output pointers */
q7_t *out_1 = out_0 + output_ch;
const int32_t *bias = output_bias;
uint16_t row_count = output_ch / 2;
const q7_t *ip_a0 = input_a;
/* this loop over rows in A */
while (row_count)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
/* align the second pointer for A */
const q7_t *ip_a1 = ip_a0 + num_col_a;
/* Init accumulator with bias for channel N and N + 1 */
q31_t ch_0_out_0 = *bias;
q31_t ch_0_out_1 = *bias++;
q31_t ch_1_out_0 = *bias;
q31_t ch_1_out_1 = *bias++;
uint16_t col_count = num_col_a / 4;
/* accumulate over the vector */
while (col_count)
{
q31_t a01, a02, a11, a12;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad(ip_a0, &a01, &a02);
ip_a1 = read_and_pad(ip_a1, &a11, &a12);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
col_count--;
} /* while over col_count */
col_count = num_col_a & 0x3;
while (col_count)
{
q7_t a0 = *ip_a0++;
q15_t b0 = *ip_b0++;
q7_t a1 = *ip_a1++;
q15_t b1 = *ip_b1++;
ch_0_out_0 += a0 * b0;
ch_0_out_1 += a0 * b1;
ch_1_out_0 += a1 * b0;
ch_1_out_1 += a1 * b1;
col_count--;
} /* while over col_count */
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_0 += out_offset;
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q7_t)ch_0_out_0;
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
ch_0_out_1 += out_offset;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q7_t)ch_0_out_1;
out_mult++;
out_shift++;
ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
ch_1_out_0 += out_offset;
ch_1_out_0 = MAX(ch_1_out_0, activation_min);
ch_1_out_0 = MIN(ch_1_out_0, activation_max);
*out_0++ = (q7_t)ch_1_out_0;
ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
ch_1_out_1 += out_offset;
ch_1_out_1 = MAX(ch_1_out_1, activation_min);
ch_1_out_1 = MIN(ch_1_out_1, activation_max);
*out_1++ = (q7_t)ch_1_out_1;
out_mult++;
out_shift++;
/* skip row */
ip_a0 += num_col_a;
row_count--;
}
/* compute the last odd numbered row if any */
if (output_ch & 0x1)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
/* load the bias */
q31_t ch_0_out_0 = *bias;
q31_t ch_0_out_1 = *bias++;
uint16_t col_count = num_col_a >> 2;
while (col_count)
{
q31_t a01, a02;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad(ip_a0, &a01, &a02);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
col_count--;
}
col_count = num_col_a & 0x3;
while (col_count)
{
q7_t a0 = *ip_a0++;
q15_t b0 = *ip_b0++;
q15_t b1 = *ip_b1++;
ch_0_out_0 += a0 * b0;
ch_0_out_1 += a0 * b1;
col_count--;
}
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_0 += out_offset;
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q7_t)ch_0_out_0;
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
ch_0_out_1 += out_offset;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q7_t)ch_0_out_1;
out_mult++;
out_shift++;
}
out_0 += output_ch;
/* return the new output pointer with offset */
return out_0;
#else
(void)input_a;
(void)input_b;
(void)output_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)num_col_a;
(void)output_bias;
(void)out_0;
/* To be completed */
return NULL;
#endif
}
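/*
 * Scalar sketch (for illustration only) of the per-channel-requantized
 * matrix multiplication above: one s8 weight row per output channel against
 * two q15 im2col columns stored back to back in input_b, with outputs for
 * the two pixels written output_ch apart.
 */
static q7_t *mat_mult_s8_s16_reference(const q7_t *input_a, const q15_t *input_b,
                                       uint16_t output_ch, const int32_t *out_shift,
                                       const int32_t *out_mult, int32_t out_offset,
                                       int16_t act_min, int16_t act_max,
                                       uint16_t num_col_a, const int32_t *bias, q7_t *out_0)
{
    q7_t *out_1 = out_0 + output_ch;
    for (int ch = 0; ch < output_ch; ch++)
    {
        q31_t sum_0 = bias[ch];
        q31_t sum_1 = bias[ch];
        for (int c = 0; c < num_col_a; c++)
        {
            sum_0 += input_a[ch * num_col_a + c] * input_b[c];
            sum_1 += input_a[ch * num_col_a + c] * input_b[num_col_a + c];
        }
        sum_0 = arm_nn_requantize(sum_0, out_mult[ch], out_shift[ch]) + out_offset;
        sum_1 = arm_nn_requantize(sum_1, out_mult[ch], out_shift[ch]) + out_offset;
        out_0[ch] = (q7_t)MIN(MAX(sum_0, act_min), act_max);
        out_1[ch] = (q7_t)MIN(MAX(sum_1, act_min), act_max);
    }
    return out_1 + output_ch; /* past both written columns */
}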


@ -1,201 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_s8_s16_reordered.c
* Description: Matrix-multiplication function for convolution with reordered columns
*
* $Date: February 27, 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_math.h"
/*
* Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel
* requantization. The re-ordering is a consequence of the sign extension performed by the SXTB16 instruction.
*
* Refer to the header file for details. This function differs from arm_nn_mat_mult_kernel_s8_s16() in that it uses
* read_and_pad_reordered() instead of read_and_pad(). Investigating the cycles impact and
* unifying these two functions is a potential future improvement.
*
*/
q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
const q15_t *input_b,
const uint16_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int16_t activation_min,
const int16_t activation_max,
const uint16_t num_col_a,
const int32_t *const output_bias,
q7_t *out_0)
{
#if defined(ARM_MATH_DSP)
/* set up the second output pointers */
q7_t *out_1 = out_0 + output_ch;
const int32_t *bias = output_bias;
uint16_t row_count = output_ch / 2;
const q7_t *ip_a0 = input_a;
/* this loop over rows in A */
while (row_count)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
/* align the second pointer for A */
const q7_t *ip_a1 = ip_a0 + num_col_a;
/* Init accumulator with bias for channel N and N + 1 */
q31_t ch_0_out_0 = *bias;
q31_t ch_0_out_1 = *bias++;
q31_t ch_1_out_0 = *bias;
q31_t ch_1_out_1 = *bias++;
uint16_t col_count = num_col_a / 4;
/* accumulate over the vector */
while (col_count)
{
q31_t a01, a02, a11, a12;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
col_count--;
} /* while over col_count */
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_0 += out_offset;
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q7_t)ch_0_out_0;
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
ch_0_out_1 += out_offset;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q7_t)ch_0_out_1;
out_mult++;
out_shift++;
ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
ch_1_out_0 += out_offset;
ch_1_out_0 = MAX(ch_1_out_0, activation_min);
ch_1_out_0 = MIN(ch_1_out_0, activation_max);
*out_0++ = (q7_t)ch_1_out_0;
ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
ch_1_out_1 += out_offset;
ch_1_out_1 = MAX(ch_1_out_1, activation_min);
ch_1_out_1 = MIN(ch_1_out_1, activation_max);
*out_1++ = (q7_t)ch_1_out_1;
out_mult++;
out_shift++;
/* skip row */
ip_a0 += num_col_a;
row_count--;
}
if (output_ch & 1)
{
/* setup pointers for B */
const q15_t *ip_b0 = input_b;
const q15_t *ip_b1 = ip_b0 + num_col_a;
/* Init accumulator with bias for channel N + 1 */
q31_t ch_0_out_0 = *bias;
q31_t ch_0_out_1 = ch_0_out_0;
int32_t col_count = num_col_a / 4;
while (col_count)
{
q31_t a01, a02;
q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
b0 = arm_nn_read_q15x2_ia(&ip_b0);
b1 = arm_nn_read_q15x2_ia(&ip_b1);
ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
col_count--;
} /* while over col_count */
ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
ch_0_out_0 += out_offset;
ch_0_out_0 = MAX(ch_0_out_0, activation_min);
ch_0_out_0 = MIN(ch_0_out_0, activation_max);
*out_0++ = (q7_t)ch_0_out_0;
ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
ch_0_out_1 += out_offset;
ch_0_out_1 = MAX(ch_0_out_1, activation_min);
ch_0_out_1 = MIN(ch_0_out_1, activation_max);
*out_1++ = (q7_t)ch_0_out_1;
}
out_0 += output_ch;
/* return the new output pointer with offset */
return out_0;
#else
(void)input_a;
(void)input_b;
(void)output_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)num_col_a;
(void)output_bias;
(void)out_0;
/* To be completed */
return NULL;
#endif
}
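/*
 * Hypothetical offline weight shuffle (illustration only, not from the
 * original source) consistent with the SXTB16 lane model sketched earlier:
 * within every aligned group of four weights {w0, w1, w2, w3}, store
 * {w0, w2, w1, w3} so that read_and_pad_reordered() yields the same operand
 * order that read_and_pad() would produce from the original layout. Any
 * leftover tail elements are copied unchanged, since the kernels process
 * leftover columns byte-wise.
 */
static void reorder_weights_for_sxtb16(const q7_t *src, q7_t *dst, int len)
{
    int i = 0;
    for (; i + 4 <= len; i += 4)
    {
        dst[i + 0] = src[i + 0];
        dst[i + 1] = src[i + 2];
        dst[i + 2] = src[i + 1];
        dst[i + 3] = src[i + 3];
    }
    for (; i < len; i++)
    {
        dst[i] = src[i];
    }
}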


@ -1,181 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_s8.c
* Description: General Matrix-multiplication function
*
* $Date: July 27, 2020
* $Revision: V.2.0.4
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/*
* s8 general matrix multiplication function with per-channel requantization for up to 4 column batches.
*
* Refer to the header file for details.
*
*/
q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
const q7_t *input_col,
const uint16_t output_ch,
const uint16_t col_batches,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t out_offset,
const int32_t col_offset,
const int32_t row_offset,
const int16_t activation_min,
const int16_t activation_max,
const uint16_t row_len,
const int32_t *const bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
(void)row_offset;
if (col_batches == 4)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t row_len_tmp = row_len;
const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
const int8_t *ip_c0 = input_col;
const int8_t *ip_c1 = input_col + row_len;
const int8_t *ip_c2 = input_col + (2 * row_len);
const int8_t *ip_c3 = input_col + (3 * row_len);
int32_t acc_0 = 0;
int32_t acc_1 = 0;
int32_t acc_2 = 0;
int32_t acc_3 = 0;
const int32_t row_loop_cnt = (row_len + 7) / 8;
for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
{
mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
row_len_tmp -= 8;
int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
ip_r0 += 8;
int16x8_t c0 = vldrbq_z_s16(ip_c0, p);
ip_c0 += 8;
c0 = vaddq_m_s16(vuninitializedq_s16(), c0, offset, p);
int16x8_t c1 = vldrbq_z_s16(ip_c1, p);
ip_c1 += 8;
c1 = vaddq_m_s16(vuninitializedq_s16(), c1, offset, p);
int16x8_t c2 = vldrbq_z_s16(ip_c2, p);
ip_c2 += 8;
c2 = vaddq_m_s16(vuninitializedq_s16(), c2, offset, p);
int16x8_t c3 = vldrbq_z_s16(ip_c3, p);
ip_c3 += 8;
c3 = vaddq_m_s16(vuninitializedq_s16(), c3, offset, p);
acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p);
acc_2 = vmladavaq_p_s16(acc_2, r0, c2, p);
acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p);
}
int32x4_t res = {acc_0, acc_1, acc_2, acc_3};
if (bias)
{
res = vaddq_n_s32(res, bias[i_out_ch]);
}
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(activation_min));
res = vminq_s32(res, vdupq_n_s32(activation_max));
const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res);
}
out += 4 * output_ch;
}
else
{
for (int i_col_batch = (col_batches & ~0x3); i_col_batch < (col_batches & 0x3); i_col_batch++)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t row_len_tmp = row_len;
const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
const int8_t *ip_c0 = input_col + (i_col_batch * row_len);
int32_t acc_0 = 0;
const int32_t row_loop_cnt = (row_len + 7) / 8;
for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
{
const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
row_len_tmp -= 8;
int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
ip_r0 += 8;
int16x8_t c0 = vldrbq_z_s16(ip_c0, p);
ip_c0 += 8;
c0 = vaddq_m_s16(vuninitializedq_s16(), c0, offset, p);
acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
}
if (bias)
{
acc_0 += bias[i_out_ch];
}
acc_0 = arm_nn_requantize(acc_0, output_mult[i_out_ch], output_shift[i_out_ch]);
acc_0 += out_offset;
acc_0 = MAX(acc_0, activation_min);
acc_0 = MIN(acc_0, activation_max);
out[i_out_ch] = (q7_t)acc_0;
}
out += output_ch;
}
}
return out;
#else
(void)input_row;
(void)input_col;
(void)output_ch;
(void)col_batches;
(void)output_shift;
(void)output_mult;
(void)out_offset;
(void)col_offset;
(void)row_offset;
(void)activation_min;
(void)activation_max;
(void)row_len;
(void)bias;
(void)out;
return NULL;
#endif
}

@@ -1,199 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_mat_q7_vec_q15.c
* Description: Mixed Q15-Q7 fully-connected layer function
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Mixed Q15-Q7 fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: 0
*
* Q7_Q15 version of the fully connected layer
*
* Weights are in q7_t and Activations are in q15_t
*
*/
arm_status
arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
const q7_t * pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
q15_t * pOut,
q15_t * vec_buffer)
{
(void)vec_buffer;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q7_t *pB = pM;
const q7_t *pB2;
q15_t *pO = pOut;
const q7_t *pBias = bias;
const q15_t *pA = pV;
uint16_t rowCnt = num_of_rows >> 1;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
pB2 = pB + dim_vec;
while (colCnt)
{
q31_t inV, inM11, inM12, inM21, inM22;
pB = read_and_pad(pB, &inM11, &inM12);
pB2 = read_and_pad(pB2, &inM21, &inM22);
inV = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV, inM11, sum);
sum2 = __SMLAD(inV, inM21, sum2);
inV = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV, inM12, sum);
sum2 = __SMLAD(inV, inM22, sum2);
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
q7_t inM2 = *pB2++;
sum += inV * inM;
sum2 += inV * inM2;
colCnt--;
} /* while over colCnt */
*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
*pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16));
/*adjust the pointers and counters */
pB += dim_vec;
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x1;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q31_t inV1, inV2, inM11, inM12;
pB = read_and_pad(pB, &inM11, &inM12);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM11, sum);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM12, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
rowCnt--;
}
#else
int i, j;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
for (i = 0; i < num_of_rows; i++)
{
int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q15_t) __SSAT((ip_out >> out_shift), 16);
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/

@@ -1,404 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_mat_q7_vec_q15_opt.c
* Description: Mixed Q15-Q7 opt fully-connected layer function
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Mixed Q15-Q7 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: 0
*
* Q7_Q15 version of the fully connected layer
*
* Weights are in q7_t and Activations are in q15_t
*
* Limitation: x4 version requires weight reordering to work
*
* Here we use only one pointer to read 4 rows in the weight
* matrix. So if the original q7_t matrix looks like this:
*
* | a11 | a12 | a13 | a14 | a15 | a16 | a17 |
*
* | a21 | a22 | a23 | a24 | a25 | a26 | a27 |
*
* | a31 | a32 | a33 | a34 | a35 | a36 | a37 |
*
* | a41 | a42 | a43 | a44 | a45 | a46 | a47 |
*
* | a51 | a52 | a53 | a54 | a55 | a56 | a57 |
*
* | a61 | a62 | a63 | a64 | a65 | a66 | a67 |
*
 * We operate on multiple-of-4 rows, so the first four rows become
*
* | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 |
*
* | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 |
*
* | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 |
*
 * The column left over will be in order, which is:
* | a17 | a27 | a37 | a47 |
*
* For the left-over rows, we do 1x1 computation, so the data remains
 * in its original order.
*
* So the stored weight matrix looks like this:
*
* | a11 | a21 | a12 | a22 | a31 | a41 |
*
* | a32 | a42 | a13 | a23 | a14 | a24 |
*
* | a33 | a43 | a34 | a44 | a15 | a25 |
*
* | a16 | a26 | a35 | a45 | a36 | a46 |
*
* | a17 | a27 | a37 | a47 | a51 | a52 |
*
* | a53 | a54 | a55 | a56 | a57 | a61 |
*
* | a62 | a63 | a64 | a65 | a66 | a67 |
*
*/
arm_status
arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
const q7_t * pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift, const q7_t * bias, q15_t * pOut, q15_t * vec_buffer)
{
(void)vec_buffer;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q7_t *pB = pM;
q15_t *pO = pOut;
const q7_t *pBias = bias;
const q15_t *pA = pV;
uint16_t rowCnt = num_of_rows >> 2;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 1;
pA = pV;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM11, inV, sum);
sum2 = __SMLAD(inM12, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM13, inV, sum3);
sum4 = __SMLAD(inM14, inV, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = *__SIMD32(pA)++;
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM12, inV, sum);
sum2 = __SMLAD(inM11, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM14, inV, sum3);
sum4 = __SMLAD(inM13, inV, sum4);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
/*
* register needed:
* loop counter: colCnt
* accumulators: sum, sum2, sum3, sum4
* pointers: pB, pA
* weight data: inM11, inM12, inM13, inM14
* activation data: inV
*/
#ifndef ARM_MATH_BIG_ENDIAN
asm volatile ("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #4\n"
"ldr.w r1, [%[pB]], #8\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r1, %[sum]\n"
"smlad %[sum2], r4, r0, %[sum2]\n"
"ldr.w r3, [%[pB], #-4]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r3, %[sum3]\n"
"smlad %[sum4], r4, r2, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n":[sum] "+r"(sum),
[sum2] "+r"(sum2),[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),[pA] "+r"(pA):[colCnt] "r"(colCnt):"r0", "r1", "r2", "r3", "r4");
#else
asm volatile ("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #4\n"
"ldr.w r1, [%[pB]], #8\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r0, %[sum]\n"
"smlad %[sum2], r4, r1, %[sum2]\n"
"ldr.w r3, [%[pB], #-4]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r2, %[sum3]\n"
"smlad %[sum4], r4, r3, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n":[sum] "+r"(sum),
[sum2] "+r"(sum2),[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),[pA] "+r"(pA):[colCnt] "r"(colCnt):"r0", "r1", "r2", "r3", "r4");
#endif /* ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = dim_vec & 0x1;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
q7_t inM2 = *pB++;
q7_t inM3 = *pB++;
q7_t inM4 = *pB++;
sum += inV * inM;
sum2 += inV * inM2;
sum3 += inV * inM3;
sum4 += inV * inM4;
colCnt--;
} /* while over colCnt */
*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
*pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16));
*pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16));
*pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16));
/* adjust the pointers and counters */
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q31_t inV1, inV2, inM11, inM12;
pB = read_and_pad(pB, &inM11, &inM12);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM11, sum);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM12, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
rowCnt--;
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
uint16_t rowCnt = num_of_rows >> 2;
const q7_t *pB = pM;
const q15_t *pA;
q15_t *pO = pOut;
const q7_t *pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 1;
pA = pV;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inA2 = *pA++;
q7_t inB1 = *pB++;
q7_t inB3 = *pB++;
q7_t inB2 = *pB++;
q7_t inB4 = *pB++;
sum += inA1 * inB1 + inA2 * inB2;
sum2 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA1 * inB1 + inA2 * inB2;
sum4 += inA1 * inB3 + inA2 * inB4;
colCnt--;
}
colCnt = dim_vec & 0x1;
while (colCnt)
{
q15_t inA = *pA++;
q7_t inB = *pB++;
sum += inA * inB;
inB = *pB++;
sum2 += inA * inB;
inB = *pB++;
sum3 += inA * inB;
inB = *pB++;
sum4 += inA * inB;
colCnt--;
}
*pO++ = (q15_t) __SSAT((sum >> out_shift), 16);
*pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16);
*pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16);
*pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16);
rowCnt--;
}
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
int j;
pA = pV;
for (j = 0; j < dim_vec; j++)
{
q15_t inA = *pA++;
q7_t inB = *pB++;
ip_out += inA * inB;
}
*pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16);
rowCnt--;
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/
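/*
 * Hypothetical host-side helper (not part of CMSIS-NN) that produces the
 * interleaved q7 weight layout described in the comment block above, so the
 * kernel can stream four rows through a single pointer.  The name and
 * signature are illustrative only.
 */
void reorder_weights_mat_q7_vec_q15_opt(const q7_t *src, q7_t *dst,
                                        uint16_t dim_vec, uint16_t num_of_rows)
{
    uint16_t r = 0;
    for (; r + 4 <= num_of_rows; r += 4)
    {
        uint16_t c = 0;
        /* Column pairs: rows (0,1) interleaved first, then rows (2,3). */
        for (; c + 2 <= dim_vec; c += 2)
        {
            *dst++ = src[(r + 0) * dim_vec + c];
            *dst++ = src[(r + 1) * dim_vec + c];
            *dst++ = src[(r + 0) * dim_vec + c + 1];
            *dst++ = src[(r + 1) * dim_vec + c + 1];
            *dst++ = src[(r + 2) * dim_vec + c];
            *dst++ = src[(r + 3) * dim_vec + c];
            *dst++ = src[(r + 2) * dim_vec + c + 1];
            *dst++ = src[(r + 3) * dim_vec + c + 1];
        }
        /* Left-over column: the four rows stay in order. */
        for (; c < dim_vec; c++)
        {
            for (uint16_t k = 0; k < 4; k++)
            {
                *dst++ = src[(r + k) * dim_vec + c];
            }
        }
    }
    /* Left-over rows keep the original row-major order. */
    for (; r < num_of_rows; r++)
    {
        for (uint16_t c = 0; c < dim_vec; c++)
        {
            *dst++ = src[r * dim_vec + c];
        }
    }
}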

@@ -1,193 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_q15.c
* Description: Q15 basic fully-connected layer function
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
 * @brief Q15 basic fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: 0
*
*/
arm_status
arm_fully_connected_q15(const q15_t * pV,
const q15_t * pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q15_t * bias,
q15_t * pOut,
q15_t * vec_buffer)
{
(void)vec_buffer;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q15_t *pB = pM;
const q15_t *pB2 = pB + dim_vec;
q15_t *pO = pOut;
const q15_t *pA;
const q15_t *pBias = bias;
uint16_t rowCnt = num_of_rows >> 1;
/* this loop loops over different output */
while (rowCnt) {
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
pB2 = pB + dim_vec;
while (colCnt)
{
q31_t inV1, inM1, inM2;
inV1 = arm_nn_read_q15x2_ia(&pA);
inM1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV1, inM1, sum);
inM2 = arm_nn_read_q15x2_ia(&pB2);
sum2 = __SMLAD(inV1, inM2, sum2);
inV1 = arm_nn_read_q15x2_ia(&pA);
inM1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV1, inM1, sum);
inM2 = arm_nn_read_q15x2_ia(&pB2);
sum2 = __SMLAD(inV1, inM2, sum2);
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q15_t inM = *pB++;
q15_t inM2 = *pB2++;
sum += inV * inM;
sum2 += inV * inM2;
colCnt--;
} /* while over colCnt */
*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
*pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16));
/* adjust the pointers and counters */
pB = pB + dim_vec;
rowCnt--;
}
rowCnt = num_of_rows & 0x1;
while (rowCnt) {
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt) {
q31_t inV1, inM1;
inV1 = arm_nn_read_q15x2_ia(&pA);
inM1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV1, inM1, sum);
inV1 = arm_nn_read_q15x2_ia(&pA);
inM1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV1, inM1, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt) {
q15_t inV = *pA++;
q15_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
rowCnt--;
}
#else
int i, j;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
for (i = 0; i < num_of_rows; i++)
{
int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q15_t) __SSAT((ip_out >> out_shift), 16);
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/

@@ -1,332 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_q15_opt.c
* Description: Q15 opt fully-connected layer function
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Q15 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: 0
*
* Here we use only one pointer to read 4 rows in the weight
* matrix. So if the original matrix looks like this:
*
* | a11 | a12 | a13 |
*
* | a21 | a22 | a23 |
*
* | a31 | a32 | a33 |
*
* | a41 | a42 | a43 |
*
* | a51 | a52 | a53 |
*
* | a61 | a62 | a63 |
*
 * We operate on multiple-of-4 rows, so the first four rows become
*
* | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 |
*
* | a13 | a23 | a33 | a43 |
*
 * Remaining rows are kept in their original order.
*
* So the stored weight matrix looks like this:
*
*
* | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 |
*
* | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 |
*
* | a62 | a63 |
*/
arm_status
arm_fully_connected_q15_opt(const q15_t * pV,
const q15_t * pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q15_t * bias,
q15_t * pOut,
q15_t * vec_buffer)
{
(void)vec_buffer;
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q15_t *pB = pM;
q15_t *pO = pOut;
const q15_t *pBias = bias;
const q15_t *pA = pV;
uint16_t rowCnt = num_of_rows >> 2;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 1;
pA = pV;
#ifdef USE_INTRINSIC
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inV, inM11, sum);
inM12 = arm_nn_read_q15x2_ia(&pB);
sum2 = __SMLAD(inV, inM12, sum2);
inM13 = arm_nn_read_q15x2_ia(&pB);
sum3 = __SMLAD(inV, inM13, sum3);
inM14 = arm_nn_read_q15x2_ia(&pB);
sum4 = __SMLAD(inV, inM14, sum4);
colCnt--;
}
#else
/*
* register needed:
* loop counter: colCnt
* accumulators: sum, sum2, sum3, sum4
* pointers: pB, pA
* weight data: inM11, inM12, inM13, inM14
* activation data: inV
*/
asm volatile ("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #4\n"
"ldr.w r0, [%[pB]], #16\n"
"smlad %[sum], r4, r0, %[sum]\n"
"ldr.w r1, [%[pB] , #-12]\n"
"smlad %[sum2], r4, r1, %[sum2]\n"
"ldr.w r2, [%[pB] , #-8]\n"
"smlad %[sum3], r4, r2, %[sum3]\n"
"ldr.w r3, [%[pB] , #-4]\n"
"smlad %[sum4], r4, r3, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n":[sum] "+r"(sum),
[sum2] "+r"(sum2),[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),[pA] "+r"(pA):[colCnt] "r"(colCnt):"r0", "r1", "r2", "r3", "r4");
#endif /* USE_INTRINSIC */
colCnt = dim_vec & 0x1;
while (colCnt)
{
q15_t inV = *pA++;
q15_t inM = *pB++;
q15_t inM2 = *pB++;
q15_t inM3 = *pB++;
q15_t inM4 = *pB++;
sum += inV * inM;
sum2 += inV * inM2;
sum3 += inV * inM3;
sum4 += inV * inM4;
colCnt--;
} /* while over colCnt */
*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
*pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16));
*pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16));
*pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16));
/* adjust the pointers and counters */
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q31_t inV1, inV2, inM1, inM2;
inM1 = arm_nn_read_q15x2_ia(&pB);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM1, sum);
inM2 = arm_nn_read_q15x2_ia(&pB);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM2, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q15_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
rowCnt--;
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
uint16_t rowCnt = num_of_rows >> 2;
const q15_t *pB = pM;
const q15_t *pA;
q15_t *pO = pOut;
const q15_t *pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 1;
pA = pV;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inA2 = *pA++;
q15_t inB1 = *pB++;
q15_t inB2 = *pB++;
sum += inA1 * inB1 + inA2 * inB2;
inB1 = *pB++;
inB2 = *pB++;
sum2 += inA1 * inB1 + inA2 * inB2;
inB1 = *pB++;
inB2 = *pB++;
sum3 += inA1 * inB1 + inA2 * inB2;
inB1 = *pB++;
inB2 = *pB++;
sum4 += inA1 * inB1 + inA2 * inB2;
colCnt--;
}
colCnt = dim_vec & 0x1;
while (colCnt)
{
q15_t inA = *pA++;
q15_t inB = *pB++;
sum += inA * inB;
inB = *pB++;
sum2 += inA * inB;
inB = *pB++;
sum3 += inA * inB;
inB = *pB++;
sum4 += inA * inB;
colCnt--;
}
*pO++ = (q15_t) __SSAT((sum >> out_shift), 16);
*pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16);
*pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16);
*pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16);
rowCnt--;
}
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
int j;
pA = pV;
for (j = 0; j < dim_vec; j++)
{
q15_t inA = *pA++;
q15_t inB = *pB++;
ip_out += inA * inB;
}
*pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16);
rowCnt--;
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/
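/*
 * A hypothetical sketch (not part of CMSIS-NN) of the q15 interleave the
 * kernel above expects: each group of four rows is stored as per-row column
 * pairs, with the left-over column and rows kept in their original order.
 * Name and signature are illustrative only.
 */
void reorder_weights_q15_opt(const q15_t *src, q15_t *dst,
                             uint16_t dim_vec, uint16_t num_of_rows)
{
    uint16_t r = 0;
    for (; r + 4 <= num_of_rows; r += 4)
    {
        uint16_t c = 0;
        for (; c + 2 <= dim_vec; c += 2)
        {
            /* Rows 0..3 of the group, two adjacent columns per row. */
            for (uint16_t k = 0; k < 4; k++)
            {
                *dst++ = src[(r + k) * dim_vec + c];
                *dst++ = src[(r + k) * dim_vec + c + 1];
            }
        }
        for (; c < dim_vec; c++)        /* left-over column stays in row order */
        {
            for (uint16_t k = 0; k < 4; k++)
            {
                *dst++ = src[(r + k) * dim_vec + c];
            }
        }
    }
    for (; r < num_of_rows; r++)        /* left-over rows keep original order */
    {
        for (uint16_t c = 0; c < dim_vec; c++)
        {
            *dst++ = src[r * dim_vec + c];
        }
    }
}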

@@ -1,198 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_q7.c
* Description: Q7 basic fully-connected layer function
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Q7 basic fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: dim_vec
*
* This basic function is designed to work with regular weight
* matrix without interleaving.
*
*/
arm_status
arm_fully_connected_q7(const q7_t * pV,
const q7_t * pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift, const q7_t * bias, q7_t * pOut, q15_t * vec_buffer)
{
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q7_t *pB = pM;
const q7_t *pB2;
q7_t *pO = pOut;
const q7_t *pBias = bias;
const q15_t *pA;
uint16_t rowCnt = num_of_rows >> 1;
/* expand the vector into the buffer */
arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec);
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = vec_buffer;
pB2 = pB + dim_vec;
while (colCnt)
{
q31_t inV, inM11, inM12, inM21, inM22;
pB = read_and_pad_reordered(pB, &inM11, &inM12);
pB2 = read_and_pad_reordered(pB2, &inM21, &inM22);
inV = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV, inM11, sum);
sum2 = __SMLAD(inV, inM21, sum2);
inV = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV, inM12, sum);
sum2 = __SMLAD(inV, inM22, sum2);
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q7_t inV = *pA++;
q15_t inM = *pB++;
q15_t inM2 = *pB2++;
sum += inV * inM;
sum2 += inV * inM2;
colCnt--;
} /* while over colCnt */
*pO++ = (q7_t) (__SSAT((sum >> out_shift), 8));
*pO++ = (q7_t) (__SSAT((sum2 >> out_shift), 8));
/* adjust the pointers and counters */
pB += dim_vec;
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x1;
while (rowCnt)
{
uint16_t colCnt = dim_vec >> 2;
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
pA = vec_buffer;
while (colCnt)
{
q31_t inV1, inV2, inM11, inM12;
pB = read_and_pad_reordered(pB, &inM11, &inM12);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM11, sum);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM12, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q7_t inV = *pA++;
q15_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q7_t) (__SSAT((sum >> out_shift), 8));
rowCnt--;
}
#else
int i, j;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
for (i = 0; i < num_of_rows; i++)
{
int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q7_t) __SSAT((ip_out >> out_shift), 8);
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/

@@ -1,484 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_q7_opt.c
 * Description: Q7 opt fully-connected layer function
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/**
* @brief Q7 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* vec_buffer size: dim_vec
*
* This opt function is designed to work with interleaved weight
* matrix. The vector input is assumed in q7_t format, we call
* arm_q7_to_q15_no_shift_shuffle function to expand into
* q15_t format with certain weight re-ordering, refer to the function
* comments for more details.
* Here we use only one pointer to read 4 rows in the weight
* matrix. So if the original q7_t matrix looks like this:
*
* | a11 | a12 | a13 | a14 | a15 | a16 | a17 |
*
* | a21 | a22 | a23 | a24 | a25 | a26 | a27 |
*
* | a31 | a32 | a33 | a34 | a35 | a36 | a37 |
*
* | a41 | a42 | a43 | a44 | a45 | a46 | a47 |
*
* | a51 | a52 | a53 | a54 | a55 | a56 | a57 |
*
* | a61 | a62 | a63 | a64 | a65 | a66 | a67 |
*
*
 * We operate on multiple-of-4 rows, so the first four rows become
*
* | a11 | a21 | a13 | a23 | a31 | a41 | a33 | a43 |
*
* | a12 | a22 | a14 | a24 | a32 | a42 | a34 | a44 |
*
* | a15 | a25 | a35 | a45 | a16 | a26 | a36 | a46 |
*
* So within the kernel, we first read the re-ordered vector in as:
*
* | b1 | b3 | and | b2 | b4 |
*
* the four q31_t weights will look like
*
* | a11 | a13 |, | a21 | a23 |, | a31 | a33 |, | a41 | a43 |
*
* | a12 | a14 |, | a22 | a24 |, | a32 | a34 |, | a42 | a44 |
*
 * The column left over will be in order, which is:
*
* | a17 | a27 | a37 | a47 |
*
* For the left-over rows, we do 1x1 computation, so the data remains
 * in its original order.
*
* So the stored weight matrix looks like this:
*
* | a11 | a21 | a13 | a23 | a31 | a41 |
*
* | a33 | a43 | a12 | a22 | a14 | a24 |
*
* | a32 | a42 | a34 | a44 | a15 | a25 |
*
* | a35 | a45 | a16 | a26 | a36 | a46 |
*
* | a17 | a27 | a37 | a47 | a51 | a52 |
*
* | a53 | a54 | a55 | a56 | a57 | a61 |
*
* | a62 | a63 | a64 | a65 | a66 | a67 |
*
*
*/
arm_status
arm_fully_connected_q7_opt(const q7_t * pV,
const q7_t * pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t * bias,
q7_t * pOut,
q15_t * vec_buffer)
{
#if defined (ARM_MATH_DSP)
/* Run the following code for Cortex-M4 and Cortex-M7 */
const q7_t *pB = pM;
q7_t *pO = pOut;
const q7_t *pBias = bias;
const q15_t *pA;
uint16_t rowCnt = num_of_rows >> 2;
arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec);
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = vec_buffer;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM11, inV, sum);
sum2 = __SMLAD(inM12, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM13, inV, sum3);
sum4 = __SMLAD(inM14, inV, sum4);
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM11, inV, sum);
sum2 = __SMLAD(inM12, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM13, inV, sum3);
sum4 = __SMLAD(inM14, inV, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inM11, inM12, inM13, inM14;
q31_t inV;
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM12, inV, sum);
sum2 = __SMLAD(inM11, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM14, inV, sum3);
sum4 = __SMLAD(inM13, inV, sum4);
inV = arm_nn_read_q15x2_ia(&pA);
inM11 = arm_nn_read_q7x4_ia(&pB);
inM12 = __SXTB16(__ROR(inM11, 8));
inM11 = __SXTB16(inM11);
sum = __SMLAD(inM12, inV, sum);
sum2 = __SMLAD(inM11, inV, sum2);
inM13 = arm_nn_read_q7x4_ia(&pB);
inM14 = __SXTB16(__ROR(inM13, 8));
inM13 = __SXTB16(inM13);
sum3 = __SMLAD(inM14, inV, sum3);
sum4 = __SMLAD(inM13, inV, sum4);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
/*
* register needed:
* loop counter: colCnt
* accumulators: sum, sum2, sum3, sum4
* pointers: pB, pA
* weight data: inM11, inM12, inM13, inM14
* activation data: inV
*/
#ifndef ARM_MATH_BIG_ENDIAN
asm volatile ("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #8\n"
"ldr.w r1, [%[pB]], #16\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r1, %[sum]\n"
"smlad %[sum2], r4, r0, %[sum2]\n"
"ldr.w r3, [%[pB], #-12]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r3, %[sum3]\n"
"smlad %[sum4], r4, r2, %[sum4]\n"
"ldr.w r4, [%[pA], #-4]\n"
"ldr.w r1, [%[pB], #-8]\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r1, %[sum]\n"
"smlad %[sum2], r4, r0, %[sum2]\n"
"ldr.w r3, [%[pB], #-4]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r3, %[sum3]\n"
"smlad %[sum4], r4, r2, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n":[sum] "+r"(sum),
[sum2] "+r"(sum2),[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),[pA] "+r"(pA):[colCnt] "r"(colCnt):"r0", "r1", "r2", "r3", "r4");
#else
asm volatile ("COL_LOOP_%=:\n"
"ldr.w r4, [%[pA]], #8\n"
"ldr.w r1, [%[pB]], #16\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r0, %[sum]\n"
"smlad %[sum2], r4, r1, %[sum2]\n"
"ldr.w r3, [%[pB], #-12]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r2, %[sum3]\n"
"smlad %[sum4], r4, r3, %[sum4]\n"
"ldr.w r4, [%[pA], #-4]\n"
"ldr.w r1, [%[pB], #-8]\n"
"mov.w r0, r1, ror #8\n"
"sxtb16 r0, r0\n"
"sxtb16 r1, r1\n"
"smlad %[sum], r4, r0, %[sum]\n"
"smlad %[sum2], r4, r1, %[sum2]\n"
"ldr.w r3, [%[pB], #-4]\n"
"mov.w r2, r3, ror #8\n"
"sxtb16 r2, r2\n"
"sxtb16 r3, r3\n"
"smlad %[sum3], r4, r2, %[sum3]\n"
"smlad %[sum4], r4, r3, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n":[sum] "+r"(sum),
[sum2] "+r"(sum2),[sum3] "+r"(sum3),
[sum4] "+r"(sum4),[pB] "+r"(pB),[pA] "+r"(pA):[colCnt] "r"(colCnt):"r0", "r1", "r2", "r3", "r4");
#endif /* ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
q7_t inM2 = *pB++;
q7_t inM3 = *pB++;
q7_t inM4 = *pB++;
sum += inV * inM;
sum2 += inV * inM2;
sum3 += inV * inM3;
sum4 += inV * inM4;
colCnt--;
} /* while over colCnt */
*pO++ = (q7_t) (__SSAT((sum >> out_shift), 8));
*pO++ = (q7_t) (__SSAT((sum2 >> out_shift), 8));
*pO++ = (q7_t) (__SSAT((sum3 >> out_shift), 8));
*pO++ = (q7_t) (__SSAT((sum4 >> out_shift), 8));
/* adjust the pointers and counters */
rowCnt--;
}
/* left-over part of the rows */
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = vec_buffer;
while (colCnt)
{
q31_t inV1, inV2, inM11, inM12;
pB = read_and_pad_reordered(pB, &inM11, &inM12);
inV1 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV1, inM11, sum);
inV2 = arm_nn_read_q15x2_ia(&pA);
sum = __SMLAD(inV2, inM12, sum);
colCnt--;
}
/* left-over of the vector */
colCnt = dim_vec & 0x3;
while (colCnt)
{
q15_t inV = *pA++;
q7_t inM = *pB++;
sum += inV * inM;
colCnt--;
}
*pO++ = (q7_t) (__SSAT((sum >> out_shift), 8));
rowCnt--;
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
uint16_t rowCnt = num_of_rows >> 2;
const q7_t *pB = pM;
const q7_t *pA;
q7_t *pO = pOut;
const q7_t *pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
pA = pV;
while (colCnt)
{
q7_t inA1 = *pA++;
q7_t inA3 = *pA++;
q7_t inA2 = *pA++;
q7_t inA4 = *pA++;
q7_t inB1 = *pB++;
q7_t inB3 = *pB++;
q7_t inB2 = *pB++;
q7_t inB4 = *pB++;
sum += inA1 * inB1 + inA2 * inB2;
sum2 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA1 * inB1 + inA2 * inB2;
sum4 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum += inA3 * inB1 + inA4 * inB2;
sum2 += inA3 * inB3 + inA4 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA3 * inB1 + inA4 * inB2;
sum4 += inA3 * inB3 + inA4 * inB4;
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q7_t inA = *pA++;
q7_t inB = *pB++;
sum += inA * inB;
inB = *pB++;
sum2 += inA * inB;
inB = *pB++;
sum3 += inA * inB;
inB = *pB++;
sum4 += inA * inB;
colCnt--;
}
*pO++ = (q7_t) __SSAT((sum >> out_shift), 8);
*pO++ = (q7_t) __SSAT((sum2 >> out_shift), 8);
*pO++ = (q7_t) __SSAT((sum3 >> out_shift), 8);
*pO++ = (q7_t) __SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
int j;
pA = pV;
for (j = 0; j < dim_vec; j++)
{
q7_t inA = *pA++;
q7_t inB = *pB++;
ip_out += inA * inB;
}
*pO++ = (q7_t) __SSAT((ip_out >> out_shift), 8);
rowCnt--;
}
#endif /* ARM_MATH_DSP */
/* Return ARM_MATH_SUCCESS */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of FC group
*/
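/*
 * A hypothetical sketch (not part of CMSIS-NN) of the q7 interleave
 * documented above.  Columns inside each group of four are visited in the
 * 0,2,1,3 order produced on the activation side by
 * arm_q7_to_q15_reordered_no_shift; the helper name is illustrative only.
 */
void reorder_weights_q7_opt(const q7_t *src, q7_t *dst,
                            uint16_t dim_vec, uint16_t num_of_rows)
{
    static const uint16_t col_order[2][2] = { { 0, 2 }, { 1, 3 } };
    uint16_t r = 0;
    for (; r + 4 <= num_of_rows; r += 4)
    {
        uint16_t c = 0;
        for (; c + 4 <= dim_vec; c += 4)
        {
            for (int s = 0; s < 2; s++)         /* column pairs (0,2), then (1,3) */
            {
                for (int p = 0; p < 4; p += 2)  /* row pairs (0,1), then (2,3) */
                {
                    for (int cc = 0; cc < 2; cc++)
                    {
                        for (int rr = 0; rr < 2; rr++)
                        {
                            *dst++ = src[(r + p + rr) * dim_vec + c + col_order[s][cc]];
                        }
                    }
                }
            }
        }
        for (; c < dim_vec; c++)                /* left-over columns stay in order */
        {
            for (uint16_t k = 0; k < 4; k++)
            {
                *dst++ = src[(r + k) * dim_vec + c];
            }
        }
    }
    for (; r < num_of_rows; r++)                /* left-over rows keep original order */
    {
        for (uint16_t c = 0; c < dim_vec; c++)
        {
            *dst++ = src[r * dim_vec + c];
        }
    }
}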

@@ -1,97 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_s8
* Description: Fully connected function compatible with TF Lite.
*
* $Date: May 2, 2020
* $Revision: V.2.0.0
*
* Target Processor: Cortex-M and Cortex-A cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup FC
* @{
*/
/*
* S8 basic fully-connected and matrix multiplication layer function for TensorFlow Lite
*
* Refer header file for details.
*
*/
arm_status
arm_fully_connected_s8(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
(void)bias_dims;
(void)ctx;
int32_t batch_cnt = input_dims->n;
while (batch_cnt)
{
arm_nn_vec_mat_mult_t_s8(input,
kernel,
bias,
output,
fc_params->input_offset,
fc_params->filter_offset,
fc_params->output_offset,
quant_params->multiplier,
quant_params->shift,
filter_dims->n, /* col_dim or accum_depth */
output_dims->c, /* row_dim or output_depth */
fc_params->activation.min,
fc_params->activation.max);
input += filter_dims->n;
output += output_dims->c;
batch_cnt--;
}
return (ARM_MATH_SUCCESS);
}
int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims)
{
(void)filter_dims;
return 0;
}
/**
* @} end of FC group
*/
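/*
 * A minimal, hypothetical call sequence for the function above.  All sizes,
 * offsets and quantization parameters below are placeholder values, not
 * taken from a real model; input/kernel/bias/output are assumed to point at
 * buffers of matching size.
 */
void fully_connected_s8_example(const q7_t *input, const q7_t *kernel,
                                const int32_t *bias, q7_t *output)
{
    /* No scratch buffer needed; see arm_fully_connected_s8_get_buffer_size(). */
    const cmsis_nn_context ctx = { NULL, 0 };
    cmsis_nn_fc_params fc_params;
    fc_params.input_offset = 128;               /* -zero_point of the s8 input */
    fc_params.filter_offset = 0;
    fc_params.output_offset = -17;
    fc_params.activation.min = -128;
    fc_params.activation.max = 127;
    const cmsis_nn_per_tensor_quant_params quant_params = { 1073741824, -3 };
    const cmsis_nn_dims input_dims = { 1, 1, 1, 64 };   /* one batch, 64 inputs */
    const cmsis_nn_dims filter_dims = { 64, 1, 1, 10 }; /* accum depth 64 */
    const cmsis_nn_dims output_dims = { 1, 1, 1, 10 };  /* 10 outputs */
    arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input,
                           &filter_dims, kernel, NULL, bias, &output_dims, output);
}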

@@ -1,81 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_accumulate_q7_to_q15.c
* Description: Accumulate q7 vector into q15 one.
*
* $Date: May 29, 2020
* $Revision: V.1.0.1
*
 * Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
{
q15_t *pCnt = pDst;
const q7_t *pV = pSrc;
q31_t v1, v2, vo1, vo2;
int32_t cnt = length >> 2;
q31_t in;
while (cnt > 0L)
{
q31_t value = arm_nn_read_q7x4_ia(&pV);
v1 = __SXTB16(__ROR((uint32_t)value, 8));
v2 = __SXTB16(value);
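        /* v1 now holds bytes 1 and 3, v2 bytes 0 and 2; the PKH ops below
           recombine them into two q15x2 words in the original byte order. */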
#ifndef ARM_MATH_BIG_ENDIAN
vo2 = (q31_t)__PKHTB(v1, v2, 16);
vo1 = (q31_t)__PKHBT(v2, v1, 16);
#else
vo1 = (q31_t)__PKHTB(v1, v2, 16);
vo2 = (q31_t)__PKHBT(v2, v1, 16);
#endif
in = arm_nn_read_q15x2(pCnt);
write_q15x2_ia(&pCnt, __QADD16(vo1, in));
in = arm_nn_read_q15x2(pCnt);
write_q15x2_ia(&pCnt, __QADD16(vo2, in));
cnt--;
}
cnt = length & 0x3;
while (cnt > 0L)
{
*pCnt++ += *pV++;
cnt--;
}
}
/**
* @} end of NNBasicMath group
*/

@@ -1,82 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_add_q7.c
* Description: Non saturating addition of elements of a q7 vector.
*
* $Date: July 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size)
{
uint32_t block_count;
q31_t result = 0;
#if defined(ARM_MATH_DSP)
/* Loop unrolling: Compute 4 outputs at a time */
block_count = block_size >> 2U;
while (block_count > 0U)
{
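        /* Two q15 ones packed into one word: the SMLAD below adds both
           halfwords of temp_q15x2 to the running sum in a single MAC. */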
const int32_t mult_q15x2 = (1UL << 16) | 1UL;
q31_t in_q7x4 = arm_nn_read_q7x4_ia(&input);
q31_t temp_q15x2 = __SXTAB16(__SXTB16(in_q7x4),
__ROR((uint32_t)in_q7x4, 8));
result = __SMLAD(temp_q15x2, mult_q15x2, result);
/* Decrement loop counter */
block_count--;
}
/* Loop unrolling: Compute remaining outputs */
block_count = block_size & 0x3;
#else
block_count = block_size;
#endif
while (block_count > 0U)
{
/* Add and store result in destination buffer. */
result += *input++;
/* Decrement loop counter */
block_count--;
}
*output = result;
}
/**
* @} end of NNBasicMath group
*/

@@ -1,169 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_depthwise_conv_nt_t_padded_s8.c
* Description: Depthwise convolution with padded matrices.
*
* $Date: March 17, 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M processors with MVE extension
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* Depthwise convolution of transposed rhs matrix with 4 lhs matrices. One or more of the rhs matrices are padded.
* Dimensions are the same for lhs and rhs.
*
* Refer header file for details.
*
*/
q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs,
const q7_t *rhs,
const int32_t input_offset,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
int32_t loop_count = (num_ch + 3) / 4;
const int32_t *bias = output_bias;
uint32_t num_ch_to_process = num_ch;
for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
num_ch_to_process -= 4, out += 4, offset += 4, i_loop_cnt++)
{
int32x4_t out_0 = vldrwq_s32(bias);
int32x4_t out_1 = out_0;
int32x4_t out_2 = out_0;
int32x4_t out_3 = out_0;
bias += 4;
const int8_t *rhs_0 = rhs + offset;
const int8_t *lhs_0 = lhs + offset;
const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset;
const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset;
const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset;
for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
{
const int32x4_t ker_0 = vldrbq_s32(rhs_0);
int32x4_t ip_0 = vldrbq_s32(lhs_0);
ip_0 = vaddq_n_s32(ip_0, input_offset);
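            /* The offset is applied per element here: padded positions are
               assumed to hold -input_offset, so they contribute zero. */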
out_0 += vmulq_s32(ip_0, ker_0);
int32x4_t ip_1 = vldrbq_s32(lhs_1);
ip_1 = vaddq_n_s32(ip_1, input_offset);
out_1 += vmulq_s32(ip_1, ker_0);
int32x4_t ip_2 = vldrbq_s32(lhs_2);
ip_2 = vaddq_n_s32(ip_2, input_offset);
out_2 += vmulq_s32(ip_2, ker_0);
int32x4_t ip_3 = vldrbq_s32(lhs_3);
ip_3 = vaddq_n_s32(ip_3, input_offset);
out_3 += vmulq_s32(ip_3, ker_0);
lhs_0 += num_ch;
lhs_1 += num_ch;
lhs_2 += num_ch;
lhs_3 += num_ch;
rhs_0 += num_ch;
}
const int32x4_t mult = vldrwq_s32(out_mult);
const int32x4_t shift = vldrwq_s32(out_shift);
out_mult += 4;
out_shift += 4;
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_0 = vaddq_n_s32(out_0, out_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
mve_pred16_t p = vctp32q(num_ch_to_process);
vstrbq_p_s32(out, out_0, p);
out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
out_1 = vaddq_n_s32(out_1, out_offset);
out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + num_ch, out_1, p);
out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
out_2 = vaddq_n_s32(out_2, out_offset);
out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + 2 * num_ch, out_2, p);
out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
out_3 = vaddq_n_s32(out_3, out_offset);
out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + 3 * num_ch, out_3, p);
}
const int tail_ch = num_ch & 0x3;
if (tail_ch != 0)
{
out -= (4 - tail_ch);
}
return out + (3 * num_ch);
#else
(void)lhs;
(void)rhs;
(void)input_offset;
(void)num_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)row_x_col;
(void)output_bias;
(void)out;
return NULL;
#endif
}
/**
* @} end of NNBasicMath group
*/

@@ -1,171 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_depthwise_conv_nt_t_s8.c
* Description: Depthwise convolution on matrices with no padding.
*
* $Date: March 17, 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M processors with MVE extension.
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* Depthwise convolution of rhs matrix with 4 lhs matrices with no padding. Dimensions are the same for lhs and rhs.
*
* Refer header file for details.
*
*/
q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const int32_t input_offset,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
const int32_t *bias = output_bias;
int32_t loop_count = (num_ch + 3) / 4;
uint32_t num_ch_to_process = num_ch;
for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
{
int32x4_t out_0 = vldrwq_s32(bias);
int32x4_t out_1 = out_0;
int32x4_t out_2 = out_0;
int32x4_t out_3 = out_0;
bias += 4;
const int8_t *rhs_0 = rhs + offset;
const int8_t *lhs_0 = lhs + offset;
const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset;
const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset;
const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset;
int32x4_t ker_sum = vdupq_n_s32(0);
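        /* With no padding the input offset can be factored out:
           sum((x + offset) * k) == sum(x * k) + offset * sum(k), so the
           kernel sum is accumulated once instead of offsetting every load. */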
for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
{
const int32x4_t ker_0 = vldrbq_s32(rhs_0);
ker_sum = vaddq_s32(ker_sum, ker_0);
int32x4_t ip_0 = vldrbq_s32(lhs_0);
out_0 += vmulq_s32(ip_0, ker_0);
int32x4_t ip_1 = vldrbq_s32(lhs_1);
out_1 += vmulq_s32(ip_1, ker_0);
int32x4_t ip_2 = vldrbq_s32(lhs_2);
out_2 += vmulq_s32(ip_2, ker_0);
int32x4_t ip_3 = vldrbq_s32(lhs_3);
out_3 += vmulq_s32(ip_3, ker_0);
lhs_0 += num_ch;
lhs_1 += num_ch;
lhs_2 += num_ch;
lhs_3 += num_ch;
rhs_0 += num_ch;
}
ker_sum = vmulq_n_s32(ker_sum, input_offset);
out_0 = ker_sum + out_0;
out_1 = ker_sum + out_1;
out_2 = ker_sum + out_2;
out_3 = ker_sum + out_3;
const int32x4_t mult = vldrwq_s32(out_mult);
const int32x4_t shift = vldrwq_s32(out_shift);
out_mult += 4;
out_shift += 4;
mve_pred16_t p = vctp32q(num_ch_to_process);
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_0 = vaddq_n_s32(out_0, out_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
vstrbq_p_s32(out, out_0, p);
out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
out_1 = vaddq_n_s32(out_1, out_offset);
out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + num_ch, out_1, p);
out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
out_2 = vaddq_n_s32(out_2, out_offset);
out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + 2 * num_ch, out_2, p);
out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
out_3 = vaddq_n_s32(out_3, out_offset);
out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
vstrbq_p_s32(out + 3 * num_ch, out_3, p);
}
const int tail_ch = num_ch & 0x3;
if (tail_ch != 0)
{
out -= (4 - tail_ch);
}
return out + (3 * num_ch);
#else
(void)lhs;
(void)rhs;
(void)input_offset;
(void)num_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)row_x_col;
(void)output_bias;
(void)out;
return NULL;
#endif
}
/**
* @} end of NNBasicMath group
*/
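
For readers without an MVE target, the micro-kernel above is numerically a per-channel multiply-accumulate followed by requantization. A scalar reference sketch (an illustrative model, not part of the library; the exact double-rounding of the library's requantization step is simplified here):

#include <stdint.h>

/* Scalar model of one channel of one output pixel:
 * acc = bias + sum_k (lhs[k] + input_offset) * rhs[k], then requantize,
 * add the output offset and clamp, the same steps the MVE loop fuses. */
static int8_t dw_conv_ref(const int8_t *lhs, const int8_t *rhs,
                          int32_t input_offset, uint16_t num_ch,
                          int32_t out_mult, int32_t out_shift,
                          int32_t out_offset, int32_t act_min, int32_t act_max,
                          uint16_t row_x_col, int32_t bias, uint16_t ch)
{
    int64_t acc = bias;
    for (uint16_t k = 0; k < row_x_col; k++)
    {
        acc += (int64_t)(lhs[k * num_ch + ch] + input_offset) * rhs[k * num_ch + ch];
    }
    acc = (acc * out_mult + (1LL << 30)) >> 31;                      /* x mult in Q31 */
    acc = (out_shift >= 0) ? (acc << out_shift) : (acc >> -out_shift);
    acc += out_offset;
    if (acc < act_min) acc = act_min;
    if (acc > act_max) acc = act_max;
    return (int8_t)acc;
}

Note how the vector code hoists the input_offset term: it accumulates ker_sum = sum_k rhs[k] once and adds ker_sum * input_offset afterwards, which is algebraically the same as offsetting every lhs sample.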


@ -1,91 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mul_core_1x_s8.c
* Description: General Matrix-multiplication function
*
* $Date: January 20, 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 matrix multiplication to process 1 row
*
* Refer header file for details.
*
*/
arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
const int8_t *row_base,
const int8_t *col_base,
int32_t *const sum_col,
int32_t *const output)
{
int32_t acc_n0 = 0;
int32_t sum_tmp = 0;
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
__asm volatile (
" vldrb.8 q0, [%[col]], 16 \n"
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
" vaddva.s8 %[sum], q0 \n"
" vldrb.8 q1, [%[row0]], 16 \n"
" vmladava.s8 %[out0], q0, q1 \n"
" vldrb.8 q0, [%[col]], 16 \n"
" letp lr, 2b \n"
"1: \n"
:[col] "+r"(col_base)
,[sum] "+Te"(sum_tmp)
,[row0] "+r"(row_base)
,[out0] "+Te"(acc_n0)
:[cnt] "r"(row_elements)
:"q0","q1", "memory", "r14");
#else
for (int i = 0; i < row_elements; i++)
{
sum_tmp += col_base[i];
acc_n0 += row_base[i] * col_base[i];
}
#endif
*sum_col = sum_tmp;
*output = acc_n0;
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/
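
A minimal call sketch for the 1-row core (hypothetical buffers; assumes the CMSIS-NN headers and library are available). The extra column sum is returned so callers can fold a constant input offset into the accumulator afterwards:

#include <stdio.h>
#include "arm_nnfunctions.h"

void demo_mat_mul_core_1x(void)
{
    const int8_t row[4] = {1, 2, 3, 4};
    const int8_t col[4] = {5, 6, 7, 8};
    int32_t col_sum, dot;

    arm_nn_mat_mul_core_1x_s8(4, row, col, &col_sum, &dot);
    printf("dot=%ld sum=%ld\n", (long)dot, (long)col_sum);   /* dot=70 sum=26 */
}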


@ -1,118 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mul_core_4x_s8.c
* Description: General matrix multiplication function for MVE extension
*
* $Date: January 20, 2020
* $Revision: V.2.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 matrix multiplication to process 4 rows and one column
*
* Refer header file for details.
*
*/
arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
const int32_t offset,
const int8_t *row_base,
const int8_t *col_base,
int32_t *const sum_col,
int32_t *const output)
{
int32_t acc_n0 = 0;
int32_t acc_n1 = 0;
int32_t acc_n2 = 0;
int32_t acc_n3 = 0;
const int8_t *ip_row_0 = row_base;
const int8_t *ip_row_1 = row_base + offset;
const int8_t *ip_row_2 = row_base + (2 * offset);
const int8_t *ip_row_3 = row_base + (3 * offset);
int32_t sum_tmp = 0;
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
__asm volatile(
" vldrb.8 q0, [%[col]], 16 \n"
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
" vaddva.s8 %[sum], q0 \n"
" vldrb.8 q1, [%[row0]], 16 \n"
" vmladava.s8 %[out0], q0, q1 \n"
" vldrb.8 q2, [%[row1]], 16 \n"
" vmladava.s8 %[out1], q0, q2 \n"
" vldrb.8 q3, [%[row2]], 16 \n"
" vmladava.s8 %[out2], q0, q3 \n"
" vldrb.8 q4, [%[row3]], 16 \n"
" vmladava.s8 %[out3], q0, q4 \n"
" vldrb.8 q0, [%[col]], 16 \n"
" letp lr, 2b \n"
"1: \n"
:[col] "+r"(col_base)
,[sum] "+Te"(sum_tmp)
,[row0] "+r"(ip_row_0)
,[row1] "+r"(ip_row_1)
,[row2] "+r"(ip_row_2)
,[row3] "+r"(ip_row_3)
,[out0] "+Te"(acc_n0)
,[out1] "+Te"(acc_n1)
,[out2] "+Te"(acc_n2)
,[out3] "+Te"(acc_n3)
: [cnt] "r"(row_elements)
: "q0", "q1", "q2", "q3", "q4", "memory", "r14");
#else
for (int i = 0; i < row_elements; i++)
{
int32_t col = col_base[i];
sum_tmp += col;
acc_n0 += ip_row_0[i] * col;
acc_n1 += ip_row_1[i] * col;
acc_n2 += ip_row_2[i] * col;
acc_n3 += ip_row_3[i] * col;
}
#endif
output[0] = acc_n0;
output[1] = acc_n1;
output[2] = acc_n2;
output[3] = acc_n3;
*sum_col = sum_tmp;
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/
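
The 4-row variant processes four rows spaced offset elements apart against one column. A small sketch (hypothetical data, same assumptions as above):

#include <stdio.h>
#include "arm_nnfunctions.h"

void demo_mat_mul_core_4x(void)
{
    /* four rows of 3 elements stored back to back, so offset = 3 */
    const int8_t rows[12] = {1, 0, 0,  0, 1, 0,  0, 0, 1,  1, 1, 1};
    const int8_t col[3]   = {10, 20, 30};
    int32_t col_sum;
    int32_t out[4];

    arm_nn_mat_mul_core_4x_s8(3, 3, rows, col, &col_sum, out);
    /* out = {10, 20, 30, 60}, col_sum = 60 */
    printf("%ld %ld %ld %ld\n", (long)out[0], (long)out[1], (long)out[2], (long)out[3]);
}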


@ -1,584 +0,0 @@
/*
* Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_s8_nt_t_s8
* Description: Matrix multiplication support function with the right-hand-side (rhs) matrix transposed
*
* $Date: July 27 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 matrix multiplication with the right-hand-side matrix transposed
*
* Refer header file for details.
*
*/
arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
q7_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t activation_min,
const int32_t activation_max)
{
#if defined(ARM_MATH_DSP)
const int32_t off0 = rhs_cols - 4;
for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2)
{
const q7_t *lhs_ptr = &lhs[0];
q7_t *dst_ptr = &dst[0];
q31_t lhs_offset_contribution0 = 0;
q31_t lhs_offset_contribution1 = 0;
for (int32_t x = 0; x < rhs_cols; ++x)
{
lhs_offset_contribution0 += rhs[x];
lhs_offset_contribution1 += rhs[x + rhs_cols];
}
lhs_offset_contribution0 *= lhs_offset;
lhs_offset_contribution1 *= lhs_offset;
if (bias)
{
lhs_offset_contribution0 += bias[rhs_rows_idx];
lhs_offset_contribution1 += bias[rhs_rows_idx + 1];
}
int32_t lhs_rows_idx = lhs_rows >> 1;
while (lhs_rows_idx)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = lhs_offset_contribution0;
q31_t res01 = lhs_offset_contribution1;
q31_t res10 = lhs_offset_contribution0;
q31_t res11 = lhs_offset_contribution1;
int32_t rhs_cols_idx = 0;
q31_t val0, val1, val2, val3, val4, val5;
for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16)
{
val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val2 = __SXTB16(val1);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val1 = __SXTB16_RORn(val1, 8);
val0 = __SXTB16_RORn(val0, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTB16(val4);
res00 = __SMLAD(val0, val1, res00);
val4 = __SXTB16_RORn(val4, 8);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val0, val4, res01);
// 4 x MAC res10, res11
val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]);
val3 = __SXTB16(val0);
val0 = __SXTB16_RORn(val0, 8);
res10 = __SMLAD(val3, val2, res10);
res11 = __SMLAD(val3, val5, res11);
res10 = __SMLAD(val0, val1, res10);
val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res11 = __SMLAD(val0, val4, res11);
val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = __SXTB16(val1);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val1 = __SXTB16_RORn(val1, 8);
val0 = __SXTB16_RORn(val0, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTB16(val4);
res00 = __SMLAD(val0, val1, res00);
val4 = __SXTB16_RORn(val4, 8);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val0, val4, res01);
// 4 x MAC res10, res11
val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]);
val3 = __SXTB16(val0);
val0 = __SXTB16_RORn(val0, 8);
res10 = __SMLAD(val3, val2, res10);
res11 = __SMLAD(val3, val5, res11);
res10 = __SMLAD(val0, val1, res10);
val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res11 = __SMLAD(val0, val4, res11);
val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = __SXTB16(val1);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val1 = __SXTB16_RORn(val1, 8);
val0 = __SXTB16_RORn(val0, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTB16(val4);
res00 = __SMLAD(val0, val1, res00);
val4 = __SXTB16_RORn(val4, 8);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val0, val4, res01);
// 4 x MAC res10, res11
val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]);
val3 = __SXTB16(val0);
val0 = __SXTB16_RORn(val0, 8);
res10 = __SMLAD(val3, val2, res10);
res11 = __SMLAD(val3, val5, res11);
res10 = __SMLAD(val0, val1, res10);
val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res11 = __SMLAD(val0, val4, res11);
val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = __SXTB16(val1);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val1 = __SXTB16_RORn(val1, 8);
val0 = __SXTB16_RORn(val0, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTB16(val4);
res00 = __SMLAD(val0, val1, res00);
val4 = __SXTB16_RORn(val4, 8);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val0, val4, res01);
// 4 x MAC res10, res11
val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]);
val3 = __SXTB16(val0);
val0 = __SXTB16_RORn(val0, 8);
res10 = __SMLAD(val3, val2, res10);
res11 = __SMLAD(val3, val5, res11);
res10 = __SMLAD(val0, val1, res10);
res11 = __SMLAD(val0, val4, res11);
}
for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q7_t rhs_value0 = rhs_ptr[0];
q7_t rhs_value1 = rhs_ptr[rhs_cols];
q7_t lhs_value = lhs_ptr[0];
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
lhs_value = lhs_ptr[rhs_cols];
res10 += lhs_value * rhs_value0;
res11 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
res10 = arm_nn_requantize(res10, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res11 = arm_nn_requantize(res11, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
res10 += dst_offset;
res11 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
res10 = MAX(res10, activation_min);
res10 = MIN(res10, activation_max);
res11 = MAX(res11, activation_min);
res11 = MIN(res11, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr[1] = (q7_t)res01;
dst_ptr += rhs_rows;
dst_ptr[0] = (q7_t)res10;
dst_ptr[1] = (q7_t)res11;
dst_ptr += rhs_rows;
lhs_ptr += rhs_cols;
lhs_rows_idx--;
}
// Left-over rows
if (lhs_rows % 2)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = lhs_offset_contribution0;
q31_t res01 = lhs_offset_contribution1;
int32_t rhs_cols_idx = 0;
q31_t val0, val1, val2, val3, val4, val5;
for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16)
{
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val5 = __SXTB16(val2);
val4 = __SXTB16(val1);
val0 = __SXTB16_RORn(val0, 8);
val2 = __SXTB16_RORn(val2, 8);
val1 = __SXTB16_RORn(val1, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val5, val3, res00);
res00 = __SMLAD(val2, val0, res00);
res01 = __SMLAD(val5, val4, res01);
res01 = __SMLAD(val2, val1, res01);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val5 = __SXTB16(val2);
val4 = __SXTB16(val1);
val0 = __SXTB16_RORn(val0, 8);
val2 = __SXTB16_RORn(val2, 8);
val1 = __SXTB16_RORn(val1, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val5, val3, res00);
res00 = __SMLAD(val2, val0, res00);
res01 = __SMLAD(val5, val4, res01);
res01 = __SMLAD(val2, val1, res01);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val5 = __SXTB16(val2);
val4 = __SXTB16(val1);
val0 = __SXTB16_RORn(val0, 8);
val2 = __SXTB16_RORn(val2, 8);
val1 = __SXTB16_RORn(val1, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val5, val3, res00);
res00 = __SMLAD(val2, val0, res00);
res01 = __SMLAD(val5, val4, res01);
res01 = __SMLAD(val2, val1, res01);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val3 = __SXTB16(val0);
val5 = __SXTB16(val2);
val4 = __SXTB16(val1);
val0 = __SXTB16_RORn(val0, 8);
val2 = __SXTB16_RORn(val2, 8);
val1 = __SXTB16_RORn(val1, 8);
// 4 x MAC res00, res01
res00 = __SMLAD(val5, val3, res00);
res00 = __SMLAD(val2, val0, res00);
res01 = __SMLAD(val5, val4, res01);
res01 = __SMLAD(val2, val1, res01);
}
// Left-over accumulations
for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q7_t rhs_value0 = rhs_ptr[0];
q7_t rhs_value1 = rhs_ptr[rhs_cols];
q7_t lhs_value = lhs_ptr[0];
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr[1] = (q7_t)res01;
}
rhs += 2 * rhs_cols;
dst += 2;
}
if (rhs_rows % 2)
{
const q7_t *lhs_ptr = &lhs[0];
q7_t *dst_ptr = &dst[0];
for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = 0;
if (bias)
{
res00 = bias[rhs_rows - 1];
}
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value = rhs_ptr[0];
q31_t lhs_value = lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]);
// Add offset
res00 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr += rhs_rows;
}
}
#else
for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2)
{
const q7_t *lhs_ptr = &lhs[0];
q7_t *dst_ptr = &dst[0];
q31_t lhs_offset_contribution0 = 0;
q31_t lhs_offset_contribution1 = 0;
for (int32_t x = 0; x < rhs_cols; ++x)
{
lhs_offset_contribution0 += rhs[x];
lhs_offset_contribution1 += rhs[x + rhs_cols];
}
lhs_offset_contribution0 *= lhs_offset;
lhs_offset_contribution1 *= lhs_offset;
if (bias)
{
lhs_offset_contribution0 += bias[rhs_rows_idx];
lhs_offset_contribution1 += bias[rhs_rows_idx + 1];
}
int32_t lhs_rows_idx = lhs_rows >> 1;
while (lhs_rows_idx)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = lhs_offset_contribution0;
q31_t res01 = lhs_offset_contribution1;
q31_t res10 = lhs_offset_contribution0;
q31_t res11 = lhs_offset_contribution1;
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q7_t rhs_value0 = rhs_ptr[0];
q7_t rhs_value1 = rhs_ptr[rhs_cols];
q7_t lhs_value = lhs_ptr[0];
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
lhs_value = lhs_ptr[rhs_cols];
res10 += lhs_value * rhs_value0;
res11 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
res10 = arm_nn_requantize(res10, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res11 = arm_nn_requantize(res11, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
res10 += dst_offset;
res11 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
res10 = MAX(res10, activation_min);
res10 = MIN(res10, activation_max);
res11 = MAX(res11, activation_min);
res11 = MIN(res11, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr[1] = (q7_t)res01;
dst_ptr += rhs_rows;
dst_ptr[0] = (q7_t)res10;
dst_ptr[1] = (q7_t)res11;
dst_ptr += rhs_rows;
lhs_ptr += rhs_cols;
lhs_rows_idx--;
}
// Left-over rows
if (lhs_rows % 2)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = lhs_offset_contribution0;
q31_t res01 = lhs_offset_contribution1;
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q7_t rhs_value0 = rhs_ptr[0];
q7_t rhs_value1 = rhs_ptr[rhs_cols];
q7_t lhs_value = lhs_ptr[0];
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]);
res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr[1] = (q7_t)res01;
}
rhs += 2 * rhs_cols;
dst += 2;
}
if (rhs_rows % 2)
{
const q7_t *lhs_ptr = &lhs[0];
q7_t *dst_ptr = &dst[0];
for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx)
{
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = 0;
if (bias)
{
res00 = bias[rhs_rows - 1];
}
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value = rhs_ptr[0];
q31_t lhs_value = lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]);
// Add offset
res00 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
dst_ptr[0] = (q7_t)res00;
dst_ptr += rhs_rows;
}
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/
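
Every output of the kernel above goes through the same quantize-down tail: multiply by a Q31 multiplier, shift, add the destination offset, clamp. A standalone reference of that arithmetic (a simplified model; arm_nn_requantize's exact rounding differs in corner cases):

#include <stdint.h>
#include <stdio.h>

static int8_t requant_ref(int32_t acc, int32_t mult, int32_t shift,
                          int32_t dst_offset, int32_t act_min, int32_t act_max)
{
    int64_t v = ((int64_t)acc * mult + (1LL << 30)) >> 31;   /* x mult in Q31 */
    if (shift < 0)
        v = (v + (1LL << (-shift - 1))) >> -shift;           /* rounding right shift */
    else
        v <<= shift;
    v += dst_offset;
    if (v < act_min) v = act_min;
    if (v > act_max) v = act_max;
    return (int8_t)v;
}

int main(void)
{
    /* acc = 1000, multiplier = 0.5 in Q31, shift = -2: 1000 * 0.5 / 4 = 125 */
    printf("%d\n", requant_ref(1000, 1 << 30, -2, 0, -128, 127));
    return 0;
}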


@ -1,145 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mult_q15.c
* Description: Q15 vector multiplication with variable output shifts
*
* $Date: 29. April 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/**
* @brief Q15 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
*/
void arm_nn_mult_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
const uint16_t out_shift,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M cores with the DSP extension */
q31_t inA1, inA2, inB1, inB2; /* temporary input variables */
q15_t out1, out2, out3, out4; /* temporary output variables */
q31_t mul1, mul2, mul3, mul4; /* temporary variables */
/* loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read two samples at a time from sourceA */
inA1 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcA);
/* read two samples at a time from sourceB */
inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcB);
/* read two samples at a time from sourceA */
inA2 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcA);
/* read two samples at a time from sourceB */
inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcB);
/* multiply mul = sourceA * sourceB */
mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
/* saturate result to 16 bit */
out1 = (q15_t) __SSAT((q31_t) (mul1 + NN_ROUND(out_shift)) >> out_shift, 16);
out2 = (q15_t) __SSAT((q31_t) (mul2 + NN_ROUND(out_shift)) >> out_shift, 16);
out3 = (q15_t) __SSAT((q31_t) (mul3 + NN_ROUND(out_shift)) >> out_shift, 16);
out4 = (q15_t) __SSAT((q31_t) (mul4 + NN_ROUND(out_shift)) >> out_shift, 16);
/* store the result */
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
*__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
#else
*__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
*__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement the blockSize loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the result in the destination buffer */
*pDst++ = (q15_t) __SSAT(((q31_t) ((q31_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 16);
/* Decrement the blockSize loop counter */
blkCnt--;
}
}
/**
* @} end of NNBasicMath group
*/
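
One lane of the Q15 multiply reduces to a rounded, saturated fixed-point product. A scalar model (assuming the library's NN_ROUND(out_shift) is (1u << out_shift) >> 1):

#include <stdint.h>
#include <stdio.h>

typedef int16_t q15_t;
typedef int32_t q31_t;

static q15_t mult_q15_ref(q15_t a, q15_t b, uint16_t out_shift)
{
    q31_t round_bias = (q31_t)((1u << out_shift) >> 1);      /* NN_ROUND */
    q31_t p = ((q31_t)a * b + round_bias) >> out_shift;
    if (p >  32767) p =  32767;                              /* __SSAT(.., 16) */
    if (p < -32768) p = -32768;
    return (q15_t)p;
}

int main(void)
{
    /* 0.5 * 0.25 in Q15 with out_shift = 15 gives 0.125, i.e. 0x1000 */
    printf("0x%04x\n", (uint16_t)mult_q15_ref(0x4000, 0x2000, 15));
    return 0;
}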


@ -1,118 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mult_q7.c
* Description: Q7 vector multiplication with variable output shifts
*
* $Date: 29. April 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/**
* @brief Q7 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
*/
void arm_nn_mult_q7(
q7_t * pSrcA,
q7_t * pSrcB,
q7_t * pDst,
const uint16_t out_shift,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M cores with the DSP extension */
q7_t out1, out2, out3, out4; /* Temporary variables to store the product */
/* loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the results in temporary variables */
out1 = (q7_t) __SSAT(((q15_t) ((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
out2 = (q7_t) __SSAT(((q15_t) ((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
out3 = (q7_t) __SSAT(((q15_t) ((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
out4 = (q7_t) __SSAT(((q15_t) ((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
/* Store the results of 4 inputs in the destination buffer in single cycle by packing */
*__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
/* Decrement the blockSize loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the result in the destination buffer */
*pDst++ = (q7_t) __SSAT(((q15_t) ((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
/* Decrement the blockSize loop counter */
blkCnt--;
}
}
/**
* @} end of NNBasicMath group
*/
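
The Q7 path is identical apart from the 8-bit saturation bound. A worked case where saturation actually triggers (same scalar-model caveats as above):

#include <stdint.h>
#include <stdio.h>

typedef int8_t q7_t;

static q7_t mult_q7_ref(q7_t a, q7_t b, uint16_t out_shift)
{
    int32_t round_bias = (int32_t)((1u << out_shift) >> 1);  /* NN_ROUND */
    int32_t p = ((int32_t)a * b + round_bias) >> out_shift;
    if (p >  127) p =  127;                                  /* __SSAT(.., 8) */
    if (p < -128) p = -128;
    return (q7_t)p;
}

int main(void)
{
    /* 127 * 127 >> 6 = 252 before saturation, so the result clamps to 127 */
    printf("%d\n", mult_q7_ref(127, 127, 6));
    return 0;
}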


@ -1,462 +0,0 @@
/*
* Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_vec_mat_mult_t_s8
* Description: s8 vector by matrix (transposed) multiplication
*
* $Date: April 2, 2020
* $Revision: V.1.5.0
*
* Target Processor: Cortex-M
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup NNBasicMath
* @{
*/
/*
* s8 vector (lhs) by matrix (transposed) multiplication
*
* Refer header file for details.
*
*/
arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
q7_t *dst,
const int32_t lhs_offset,
const int32_t rhs_offset,
const int32_t dst_offset,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max)
{
#if defined(ARM_MATH_MVEI)
const int16x8_t rhs_offset_vec = vdupq_n_s16((int16_t)rhs_offset);
const int16x8_t lhs_offset_vec = vdupq_n_s16((int16_t)lhs_offset);
int32_t row_loop_cnt = rhs_rows / 4;
for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
{
int32_t acc1 = bias[0];
int32_t acc2 = bias[1];
int32_t acc3 = bias[2];
int32_t acc4 = bias[3];
bias += 4;
int32x4_t acc;
const int32_t col_loop_cnt = (rhs_cols + 7) / 8;
const int8_t *vec = lhs;
const int8_t *rhs_0 = rhs;
const int8_t *rhs_1 = rhs + rhs_cols;
const int8_t *rhs_2 = rhs + 2 * rhs_cols;
const int8_t *rhs_3 = rhs + 3 * rhs_cols;
uint32_t col_cnt = (uint32_t)rhs_cols;
for (int i = 0; i < col_loop_cnt; i++)
{
mve_pred16_t p = vctp16q(col_cnt);
col_cnt -= 8;
const int16x8_t tmp_b = vaddq_m_s16(vuninitializedq_s16(),
vldrbq_z_s16(vec, p), lhs_offset_vec, p);
const int16x8_t tmp_a0 = vaddq_m_s16(vuninitializedq_s16(),
vldrbq_z_s16(rhs_0, p), rhs_offset_vec, p);
acc1 = vmladavaq_p_s16(acc1, tmp_a0, tmp_b, p);
const int16x8_t tmp_a1 = vaddq_m_s16(vuninitializedq_s16(),
vldrbq_z_s16(rhs_1, p), rhs_offset_vec, p);
acc2 = vmladavaq_p_s16(acc2, tmp_a1, tmp_b, p);
const int16x8_t tmp_a2 = vaddq_m_s16(vuninitializedq_s16(),
vldrbq_z_s16(rhs_2, p), rhs_offset_vec, p);
acc3 = vmladavaq_p_s16(acc3, tmp_a2, tmp_b, p);
const int16x8_t tmp_a3 = vaddq_m_s16(vuninitializedq_s16(),
vldrbq_z_s16(rhs_3, p), rhs_offset_vec, p);
acc4 = vmladavaq_p_s16(acc4, tmp_a3, tmp_b, p);
vec += 8;
rhs_0 += 8;
rhs_1 += 8;
rhs_2 += 8;
rhs_3 += 8;
}
rhs += 4 * rhs_cols;
acc[0] = acc1;
acc[1] = acc2;
acc[2] = acc3;
acc[3] = acc4;
acc = arm_requantize_mve(acc, dst_multiplier, dst_shift);
acc = vaddq_s32(acc, vdupq_n_s32(dst_offset));
acc = vmaxq_s32(acc, vdupq_n_s32(activation_min));
acc = vminq_s32(acc, vdupq_n_s32(activation_max));
vstrbq_s32(dst, acc);
dst += 4;
}
row_loop_cnt = rhs_rows & 3;
for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt;
i_row_loop_cnt++)
{
int32_t acc = *bias++;
const int32_t col_loop_cnt = (rhs_cols + 7) / 8;
const int8_t *vec = lhs;
const int8_t *kernel_cur = rhs;
uint32_t col_cnt = (uint32_t)rhs_cols;
for (int i = 0; i < col_loop_cnt; i++)
{
mve_pred16_t p = vctp16q(col_cnt);
col_cnt -= 8;
const int16x8_t tmp_b = vaddq_m_s16(vuninitializedq_s16(),
vldrbq_z_s16(vec, p), lhs_offset_vec, p);
const int16x8_t tmp_a = vaddq_m_s16(vuninitializedq_s16(),
vldrbq_z_s16(kernel_cur, p), rhs_offset_vec, p);
acc = vmladavaq_p_s16(acc, tmp_a, tmp_b, p);
vec += 8;
kernel_cur += 8;
}
rhs += rhs_cols;
acc = arm_nn_requantize(acc, dst_multiplier, dst_shift);
acc += dst_offset;
acc = MAX(acc, activation_min);
acc = MIN(acc, activation_max);
*dst++ = (int8_t)(acc);
}
#elif defined(ARM_MATH_DSP)
const int32_t off0 = rhs_cols - 4;
const int16_t lhs_offset_s16 = lhs_offset;
const int16_t rhs_offset_s16 = rhs_offset;
const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16);
const uint32_t rhs_offset_s16x2 = __PKHBT(rhs_offset_s16, rhs_offset_s16, 16);
for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2)
{
const q7_t *lhs_ptr = &lhs[0];
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = *bias++;
q31_t res01 = *bias++;
int32_t rhs_cols_idx = 0;
q31_t val0, val1, val2, val3, val4, val5;
for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16)
{
// Read 4 x int8 values from the RHS matrix
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val2 = __SXTAB16(rhs_offset_s16x2, val0);
// Read 4 x int8 values from the LHS vector
val1 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val0 = __SXTAB16(rhs_offset_s16x2, __ROR(val0, 8));
val3 = __SXTAB16(lhs_offset_s16x2, val1);
// Read 4 x int8 values from the RHS matrix
val4 = arm_nn_read_q7x4((const q7_t *)rhs_ptr + off0);
val1 = __SXTAB16(lhs_offset_s16x2, __ROR(val1, 8));
// Perform the accumulations
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTAB16(rhs_offset_s16x2, val4);
res00 = __SMLAD(val1, val0, res00);
val4 = __SXTAB16(rhs_offset_s16x2, __ROR(val4, 8));
// Read 4 x int8 values from the RHS matrix
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val1, val4, res01);
val2 = __SXTAB16(rhs_offset_s16x2, val0);
// Read 4 x int8 values from the LHS vector
val1 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val0 = __SXTAB16(rhs_offset_s16x2, __ROR(val0, 8));
val3 = __SXTAB16(lhs_offset_s16x2, val1);
// Read 4 x int8 values from the RHS matrix
val4 = arm_nn_read_q7x4((const q7_t *)rhs_ptr + off0);
val1 = __SXTAB16(lhs_offset_s16x2, __ROR(val1, 8));
// Perform the accumulations
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTAB16(rhs_offset_s16x2, val4);
res00 = __SMLAD(val1, val0, res00);
val4 = __SXTAB16(rhs_offset_s16x2, __ROR(val4, 8));
// Read 4 x int8 values from the RHS matrix
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val1, val4, res01);
val2 = __SXTAB16(rhs_offset_s16x2, val0);
// Read 4 x int8 values from the LHS vector
val1 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val0 = __SXTAB16(rhs_offset_s16x2, __ROR(val0, 8));
val3 = __SXTAB16(lhs_offset_s16x2, val1);
// Read 4 x int8 values from the RHS matrix
val4 = arm_nn_read_q7x4((const q7_t *)rhs_ptr + off0);
val1 = __SXTAB16(lhs_offset_s16x2, __ROR(val1, 8));
// Perform the accumulations
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTAB16(rhs_offset_s16x2, val4);
res00 = __SMLAD(val1, val0, res00);
val4 = __SXTAB16(rhs_offset_s16x2, __ROR(val4, 8));
// Read 4 x int8 values from the RHS matrix
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val1, val4, res01);
val2 = __SXTAB16(rhs_offset_s16x2, val0);
// Read 4 x int8 values from the LHS vector
val1 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val0 = __SXTAB16(rhs_offset_s16x2, __ROR(val0, 8));
val3 = __SXTAB16(lhs_offset_s16x2, val1);
// Read 4 x int8 values from the RHS matrix
val4 = arm_nn_read_q7x4((const q7_t *)rhs_ptr + off0);
val1 = __SXTAB16(lhs_offset_s16x2, __ROR(val1, 8));
// Perform the accumulations
res00 = __SMLAD(val3, val2, res00);
val5 = __SXTAB16(rhs_offset_s16x2, val4);
res00 = __SMLAD(val1, val0, res00);
val4 = __SXTAB16(rhs_offset_s16x2, __ROR(val4, 8));
res01 = __SMLAD(val3, val5, res01);
res01 = __SMLAD(val1, val4, res01);
}
for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value0 = rhs_ptr[0] + rhs_offset;
q31_t rhs_value1 = rhs_ptr[rhs_cols] + rhs_offset;
q31_t lhs_value = lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
*dst++ = (q7_t)res00;
*dst++ = (q7_t)res01;
rhs += 2 * rhs_cols;
}
if (rhs_rows % 2)
{
const q7_t *lhs_ptr = &lhs[0];
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = *bias++;
int32_t rhs_cols_idx = 0;
q31_t val0, val1, val2, val3;
for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16)
{
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = __SXTAB16(rhs_offset_s16x2, val0);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val0 = __SXTAB16(rhs_offset_s16x2, __ROR(val0, 8));
val3 = __SXTAB16(lhs_offset_s16x2, val2);
val2 = __SXTAB16(lhs_offset_s16x2, __ROR(val2, 8));
// Partial accumulations
res00 = __SMLAD(val3, val1, res00);
res00 = __SMLAD(val2, val0, res00);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = __SXTAB16(rhs_offset_s16x2, val0);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val0 = __SXTAB16(rhs_offset_s16x2, __ROR(val0, 8));
val3 = __SXTAB16(lhs_offset_s16x2, val2);
val2 = __SXTAB16(lhs_offset_s16x2, __ROR(val2, 8));
// Partial accumulations
res00 = __SMLAD(val3, val1, res00);
res00 = __SMLAD(val2, val0, res00);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = __SXTAB16(rhs_offset_s16x2, val0);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val0 = __SXTAB16(rhs_offset_s16x2, __ROR(val0, 8));
val3 = __SXTAB16(lhs_offset_s16x2, val2);
val2 = __SXTAB16(lhs_offset_s16x2, __ROR(val2, 8));
// Partial accumulations
res00 = __SMLAD(val3, val1, res00);
res00 = __SMLAD(val2, val0, res00);
val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr);
val1 = __SXTAB16(rhs_offset_s16x2, val0);
val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr);
val0 = __SXTAB16(rhs_offset_s16x2, __ROR(val0, 8));
val3 = __SXTAB16(lhs_offset_s16x2, val2);
val2 = __SXTAB16(lhs_offset_s16x2, __ROR(val2, 8));
// Partial accumulations
res00 = __SMLAD(val3, val1, res00);
res00 = __SMLAD(val2, val0, res00);
}
for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value0 = rhs_ptr[0] + rhs_offset;
q31_t lhs_value = lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value0;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
// Add offset
res00 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
*dst = (q7_t)res00;
}
#else
for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2)
{
const q7_t *lhs_ptr = &lhs[0];
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = *bias++;
q31_t res01 = *bias++;
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value0 = rhs_ptr[0] + rhs_offset;
q31_t rhs_value1 = rhs_ptr[rhs_cols] + rhs_offset;
q31_t lhs_value = lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value0;
res01 += lhs_value * rhs_value1;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift);
// Add offset
res00 += dst_offset;
res01 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
res01 = MAX(res01, activation_min);
res01 = MIN(res01, activation_max);
*dst++ = (q7_t)res00;
*dst++ = (q7_t)res01;
rhs += 2 * rhs_cols;
}
if (rhs_rows % 2)
{
const q7_t *lhs_ptr = &lhs[0];
const q7_t *rhs_ptr = &rhs[0];
q31_t res00 = *bias++;
for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
{
q31_t rhs_value0 = rhs_ptr[0] + rhs_offset;
q31_t lhs_value = lhs_ptr[0] + lhs_offset;
res00 += lhs_value * rhs_value0;
++rhs_ptr;
++lhs_ptr;
}
// Quantize down
res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
// Add offset
res00 += dst_offset;
// Clamp the result
res00 = MAX(res00, activation_min);
res00 = MIN(res00, activation_max);
*dst = (q7_t)res00;
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNBasicMath group
*/
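
This kernel is the core of the s8 fully-connected layer: one activation vector against a transposed weight matrix. A minimal call sketch (hypothetical values; assumes the CMSIS-NN headers and library):

#include <stdio.h>
#include "arm_nnfunctions.h"

void demo_vec_mat_mult(void)
{
    const q7_t  input[3]   = {2, 4, 6};
    const q7_t  weights[6] = {1, 1, 1,  2, 0, 0};   /* 2 rows x 3 cols, row-major */
    const q31_t bias[2]    = {0, 0};
    q7_t        out[2];

    /* multiplier 0.5 in Q31, shift 0: out[0] = 12 * 0.5 = 6, out[1] = 4 * 0.5 = 2 */
    arm_nn_vec_mat_mult_t_s8(input, weights, bias, out,
                             0, 0, 0,        /* lhs, rhs, dst offsets */
                             1 << 30, 0,     /* dst multiplier and shift */
                             3, 2,           /* rhs_cols, rhs_rows */
                             -128, 127);
    printf("%d %d\n", out[0], out[1]);
}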


@ -1,297 +0,0 @@
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nntables.c
* Description: Common tables for activation functions
*
* $Date: 17. January 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @brief tables for various activation functions
*
* This file includes the declaration of common tables.
* Most of them are used for activation functions.
*
* Assumption:
* Unified table: input is 3.x format, i.e., range of [-8, 8)
* sigmoid(8) = 0.9996646498695336
* tanh(8) = 0.9999997749296758
* The accuracy here should be good enough
*
* 2-stage HL table:
*
* The entire input range is divided into two parts:
*
* Low range table: 0x000x xxxx or 0x111x xxxx
* table entry will be the binary number excluding the first
* two digits, i.e., 0x0x xxxx or 0x1x xxxx
*
*
*
* High range table 0x0010 0000 -- 0x0111 1111
* 0x1000 0000 -- 0x1101 1111
*
* For positive numbers, table entry will be
* 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000
* i.e., 0x0000 0000 - 0x0101 1111
*
* the same applies for the negative numbers: the table entry will be
* 0x1000 0000 -- 0x1101 1111 minus 0x0010 0000
* i.e., 0x0110 0000 - 0x1011 1111
*/
const q7_t sigmoidTable_q7[256] = {
0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e,
0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c,
0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67,
0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76,
0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a,
0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c,
0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e,
0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04,
0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06,
0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09,
0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e,
0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21,
0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e,
0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
};
const q15_t sigmoidTable_q15[256] = {
0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8,
0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb,
0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f,
0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2,
0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7,
0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f,
0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03,
0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d,
0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81,
0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17,
0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72,
0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa,
0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc,
0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0,
0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed,
0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4,
0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011,
0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c,
0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e,
0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c,
0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d,
0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce,
0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152,
0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a,
0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388,
0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8,
0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a,
0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70,
0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e,
0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0,
0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76,
0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00,
};
const q15_t sigmoidLTable_q15[128] = {
0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9,
0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, 0x4cd3, 0x4dc8, 0x4ebb,
0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f,
0x56ef, 0x57cd, 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9,
0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, 0x62cc,
0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c,
0x68a6, 0x693d, 0x69d2, 0x6a63, 0x6af1, 0x6b7c, 0x6c05, 0x6c8a,
0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051,
0x0f42, 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273,
0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, 0x162e, 0x16c3,
0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2,
0x1c81, 0x1d34, 0x1dea, 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5,
0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833,
0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64,
0x3053, 0x3145, 0x3238, 0x332d, 0x3424, 0x351b, 0x3615, 0x370f,
0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00,
};
const q15_t sigmoidHTable_q15[192] = {
0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7,
0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f,
0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03,
0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d,
0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81,
0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17,
0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72,
0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa,
0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc,
0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0,
0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed,
0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4,
0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011,
0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c,
0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e,
0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c,
0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d,
0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce,
0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152,
0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a,
0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388,
0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8,
0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a,
0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70,
};
const q7_t tanhTable_q7[256] = {
0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35,
0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e,
0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72,
0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b,
0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e,
0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81,
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82,
0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84,
0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b,
0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b,
0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf,
0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8,
};
const q15_t tanhTable_q15[256] = {
0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae,
0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6,
0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254,
0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb,
0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f,
0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48,
0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc,
0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7,
0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7,
0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd,
0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001,
0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003,
0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007,
0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013,
0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035,
0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f,
0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183,
0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412,
0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6,
0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50,
0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe,
0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 0xf803,
};
const q15_t tanhLTable_q15[128] = {
0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90,
0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, 0x3151, 0x34ae, 0x37f6,
0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd,
0x514d, 0x53a3, 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4,
0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, 0x6b6e,
0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e,
0x73dc, 0x7490, 0x753a, 0x75da, 0x7672, 0x7701, 0x7788, 0x7807,
0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b,
0x849b, 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710,
0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, 0x8ac6, 0x8b70,
0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254,
0x936b, 0x9492, 0x95c9, 0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0,
0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d,
0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0,
0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, 0xd5a8, 0xd941, 0xdcec,
0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00,
};
const q15_t tanhHTable_q15[192] = {
0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f,
0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48,
0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc,
0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7,
0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7,
0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd,
0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001,
0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003,
0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007,
0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013,
0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035,
0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f,
0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183,
0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412,
};
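
The unified 256-entry tables are indexed by the raw q7 byte reinterpreted as unsigned, so two's-complement wraparound places negative inputs in the upper half of the table, matching the layout above. A lookup sketch (an illustrative helper in the style of arm_nn_activations_direct_q7 with 3 integer bits; not a library API):

#include <stdint.h>
#include <stdio.h>

typedef int8_t q7_t;
extern const q7_t sigmoidTable_q7[256];

static q7_t sigmoid_q7(q7_t x)   /* x in q3.4 format, range [-8, 8) */
{
    return sigmoidTable_q7[(uint8_t)x];
}

int main(void)
{
    /* input 0x00 is 0.0, and sigmoid(0) = 0.5, i.e. 0x40 in q0.7 */
    printf("0x%02x\n", (uint8_t)sigmoid_q7(0));
    return 0;
}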


@ -1,122 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_q7_to_q15_no_shift.c
* Description: Converts the elements of the Q7 vector to Q15 vector without left-shift
*
* $Date: May 29, 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup nndata_convert
* @{
*/
/**
* @brief Converts the elements of the Q7 vector to Q15 vector without left-shift
* @param[in] *pSrc points to the Q7 input vector
* @param[out] *pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
*
* \par Description:
*
* The equation used for the conversion process is:
*
* <pre>
* pDst[n] = (q15_t) pSrc[n]; 0 <= n < blockSize.
* </pre>
*
*/
void arm_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t blockSize)
{
const q7_t *pIn = pSrc;
uint32_t blkCnt;
#if defined(ARM_MATH_DSP)
q31_t in;
q31_t in1, in2;
q31_t out1, out2;
/* Loop unrolling */
blkCnt = blockSize >> 2u;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. */
while (blkCnt > 0u)
{
in = arm_nn_read_q7x4_ia(&pIn);
/* rotate in by 8 and extend two q7_t values to q15_t values */
in1 = __SXTB16(__ROR((uint32_t)in, 8));
/* extend remaining two q7_t values to q15_t values */
in2 = __SXTB16(in);
#ifndef ARM_MATH_BIG_ENDIAN
out2 = (int32_t)__PKHTB(in1, in2, 16);
out1 = (int32_t)__PKHBT(in2, in1, 16);
#else
out1 = (int32_t)__PKHTB(in1, in2, 16);
out2 = (int32_t)__PKHBT(in2, in1, 16);
#endif
write_q15x2_ia(&pDst, out1);
write_q15x2_ia(&pDst, out2);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4u;
#else
/* Run the below code on cores without the DSP extension */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* #if defined(ARM_MATH_DSP) */
while (blkCnt > 0u)
{
/* convert from q7 to q15 and then store the results in the destination buffer */
*pDst++ = (q15_t)*pIn++;
/* Decrement the loop counter */
blkCnt--;
}
}
/**
* @} end of nndata_convert group
*/
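A minimal usage sketch for the function above, assuming arm_nnsupportfunctions.h is on the include path; the output is numerically identical to the input, only widened:
#include "arm_nnsupportfunctions.h"

void example_no_shift(void)
{
    const q7_t src[6] = {-128, -1, 0, 1, 64, 127};
    q15_t dst[6];

    /* Widen six Q7 samples to Q15 without changing their value:
     * dst[n] == (q15_t)src[n], so dst[0] == -128 and dst[5] == 127. */
    arm_q7_to_q15_no_shift(src, dst, 6);
}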

View File

@ -1,144 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_q7_to_q15_reordered_no_shift.c
* Description: Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
*
* $Date: May 29, 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup nndata_convert
* @{
*/
/**
* @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
* @param[in] *pSrc points to the Q7 input vector
* @param[out] *pDst points to the Q15 output vector
* @param[in] blockSize length of the input vector
*
* @details
*
* This function does the q7 to q15 expansion with re-ordering
*
* <pre>
* | A1 | A2 | A3 | A4 |
*
* 0 7 8 15 16 23 24 31
* </pre>
*
* is converted into:
*
* <pre>
* | A1 | A3 | and | A2 | A4 |
*
* 0 15 16 31 0 15 16 31
* </pre>
*
*
* This looks strange but is natural considering how sign-extension is done at
* assembly level.
*
* The expansion of the other operand will follow the same rule so that the end
* results are the same.
*
* The tail (i.e., last (N % 4) elements) will still be in original order.
*
*/
void arm_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t blockSize)
{
const q7_t *pIn = pSrc; /* Src pointer */
uint32_t blkCnt; /* loop counter */
#ifndef ARM_MATH_CM0_FAMILY
q31_t in;
q31_t in1, in2;
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop unrolling */
blkCnt = blockSize >> 2u;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0u)
{
/* C = (q15_t) A */
/* convert from q7 to q15 and then store the results in the destination buffer */
in = arm_nn_read_q7x4_ia(&pIn);
/* rotate in by 8 and extend two q7_t values to q15_t values */
in1 = __SXTB16(__ROR((uint32_t)in, 8));
/* extend remaining two q7_t values to q15_t values */
in2 = __SXTB16(in);
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pDst)++ = in2;
*__SIMD32(pDst)++ = in1;
#else
*__SIMD32(pDst)++ = in1;
*__SIMD32(pDst)++ = in2;
#endif
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4u;
#else
/* Run the below code for Cortex-M0 */
/* Loop over blockSize number of values */
blkCnt = blockSize;
#endif /* #ifndef ARM_MATH_CM0_FAMILY */
while (blkCnt > 0u)
{
/* C = (q15_t) A */
/* convert from q7 to q15 and then store the results in the destination buffer */
*pDst++ = (q15_t)*pIn++;
/* Decrement the loop counter */
blkCnt--;
}
}
/**
* @} end of q7_to_x group
*/
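A usage sketch for the reordered variant (same include-path assumption). On a core with the DSP extension, each group of four outputs is interleaved as described above, so the destination should only be fed to kernels that expect that layout:
#include "arm_nnsupportfunctions.h"

void example_reordered(void)
{
    const q7_t src[4] = {1, 2, 3, 4};
    q15_t dst[4];

    arm_q7_to_q15_reordered_no_shift(src, dst, 4);
    /* With the DSP extension: dst == {1, 3, 2, 4} (A1 A3 | A2 A4).
     * Without it, the scalar tail path leaves the order unchanged. */
}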

View File

@ -1,100 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_q7_to_q15_reordered_with_offset.c
* Description: Converts the elements of the Q7 vector to a reordered Q15 vector with an added offset. The re-ordering
* is an artifact of the sign-extension intrinsics (DSP extension).
*
* $Date: May 29, 2020
* $Revision: V.2.0.3
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupSupport
*/
/**
* @addtogroup nndata_convert
* @{
*/
/**
* @brief Converts the elements of the Q7 vector to a reordered Q15 vector with an added offset.
*
* @note Refer header file for details.
*
*/
void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset)
{
#if defined(ARM_MATH_DSP)
uint32_t block_cnt;
/* Run the below code for cores that support SIMD instructions */
q31_t in_q7x4;
q31_t out_q15x2_1;
q31_t out_q15x2_2;
/* Loop unrolling */
block_cnt = block_size >> 2u;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. */
const q31_t offset_q15x2 = (q31_t)__PKHBT(offset, offset, 16);
while (block_cnt > 0u)
{
/* convert from q7 to q15 and then store the results in the destination buffer */
in_q7x4 = arm_nn_read_q7x4_ia(&src);
/* Extract and sign extend each of the four q7 values to q15 */
out_q15x2_1 = __SXTAB16(offset_q15x2, __ROR((uint32_t)in_q7x4, 8));
out_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4);
write_q15x2_ia(&dst, out_q15x2_2);
write_q15x2_ia(&dst, out_q15x2_1);
block_cnt--;
}
/* Handle left over samples */
block_cnt = block_size % 0x4u;
while (block_cnt > 0u)
{
*dst++ = (q15_t)*src++ + offset;
/* Decrement the loop counter */
block_cnt--;
}
#else
(void)src;
(void)dst;
(void)block_size;
(void)offset;
/* Not available */
#endif
}
/**
* @} end of nndata_convert group
*/
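A usage sketch for the with-offset variant (same include-path assumption). The offset is typically used to fold an s8 zero-point into the widened operand; note that a build without the DSP extension compiles this particular function to an empty stub:
#include "arm_nnsupportfunctions.h"

void example_reordered_with_offset(void)
{
    const q7_t src[4] = {-128, 0, 1, 127};
    q15_t dst[4];

    arm_q7_to_q15_reordered_with_offset(src, dst, 4, 128);
    /* With the DSP extension: dst == {0, 129, 128, 255}, i.e. src[n] + 128
     * with the same A1 A3 | A2 A4 interleaving as the reordered variant. */
}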

Some files were not shown because too many files have changed in this diff.