diff --git a/APP_Framework/Framework/knowing/cmsis_5/Core/Include/cmsis_compiler.h b/APP_Framework/Framework/knowing/cmsis_5/Core/Include/cmsis_compiler.h
new file mode 100644
index 000000000..adbf296f1
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/Core/Include/cmsis_compiler.h
@@ -0,0 +1,283 @@
+/**************************************************************************//**
+ * @file     cmsis_compiler.h
+ * @brief    CMSIS compiler generic header file
+ * @version  V5.1.0
+ * @date     09. October 2018
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_COMPILER_H
+#define __CMSIS_COMPILER_H
+
+#include <stdint.h>
+
+/*
+ * Arm Compiler 4/5
+ */
+#if   defined ( __CC_ARM )
+  #include "cmsis_armcc.h"
+
+
+/*
+ * Arm Compiler 6.6 LTM (armclang)
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) && (__ARMCC_VERSION < 6100100)
+  #include "cmsis_armclang_ltm.h"
+
+  /*
+   * Arm Compiler above 6.10.1 (armclang)
+   */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100)
+  #include "cmsis_armclang.h"
+
+
+/*
+ * GNU Compiler
+ */
+#elif defined ( __GNUC__ )
+  #include "cmsis_gcc.h"
+
+
+/*
+ * IAR Compiler
+ */
+#elif defined ( __ICCARM__ )
+  #include <cmsis_iccarm.h>
+
+
+/*
+ * TI Arm Compiler
+ */
+#elif defined ( __TI_ARM__ )
+  #include <cmsis_ccs.h>
+
+  #ifndef __ASM
+    #define __ASM __asm
+  #endif
+  #ifndef __INLINE
+    #define __INLINE inline
+  #endif
+  #ifndef __STATIC_INLINE
+    #define __STATIC_INLINE static inline
+  #endif
+  #ifndef __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE __STATIC_INLINE
+  #endif
+  #ifndef __NO_RETURN
+    #define __NO_RETURN __attribute__((noreturn))
+  #endif
+  #ifndef __USED
+    #define __USED __attribute__((used))
+  #endif
+  #ifndef __WEAK
+    #define __WEAK __attribute__((weak))
+  #endif
+  #ifndef __PACKED
+    #define __PACKED __attribute__((packed))
+  #endif
+  #ifndef __PACKED_STRUCT
+    #define __PACKED_STRUCT struct __attribute__((packed))
+  #endif
+  #ifndef __PACKED_UNION
+    #define __PACKED_UNION union __attribute__((packed))
+  #endif
+  #ifndef __UNALIGNED_UINT32 /* deprecated */
+    struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void*)(addr))->v) = (val))
+  #endif
+  #ifndef __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
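+    /* Illustrative usage sketch (not from the original header): the accessor
+       below reads a possibly unaligned location through a packed struct,
+       avoiding the undefined behaviour of a plain pointer cast, e.g.
+         uint32_t v = __UNALIGNED_UINT32_READ(&buf[1]);   // buf: uint8_t[] (placeholder) */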
+    #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef __ALIGNED
+    #define __ALIGNED(x) __attribute__((aligned(x)))
+  #endif
+  #ifndef __RESTRICT
+    #define __RESTRICT __restrict
+  #endif
+  #ifndef __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER() (void)0
+  #endif
+
+
+/*
+ * TASKING Compiler
+ */
+#elif defined ( __TASKING__ )
+  /*
+   * The CMSIS functions have been implemented as intrinsics in the compiler.
+   * Please use "carm -?i" to get an up to date list of all intrinsics,
+   * Including the CMSIS ones.
+   */
+
+  #ifndef __ASM
+    #define __ASM __asm
+  #endif
+  #ifndef __INLINE
+    #define __INLINE inline
+  #endif
+  #ifndef __STATIC_INLINE
+    #define __STATIC_INLINE static inline
+  #endif
+  #ifndef __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE __STATIC_INLINE
+  #endif
+  #ifndef __NO_RETURN
+    #define __NO_RETURN __attribute__((noreturn))
+  #endif
+  #ifndef __USED
+    #define __USED __attribute__((used))
+  #endif
+  #ifndef __WEAK
+    #define __WEAK __attribute__((weak))
+  #endif
+  #ifndef __PACKED
+    #define __PACKED __packed__
+  #endif
+  #ifndef __PACKED_STRUCT
+    #define __PACKED_STRUCT struct __packed__
+  #endif
+  #ifndef __PACKED_UNION
+    #define __PACKED_UNION union __packed__
+  #endif
+  #ifndef __UNALIGNED_UINT32 /* deprecated */
+    struct __packed__ T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef __ALIGNED
+    #define __ALIGNED(x) __align(x)
+  #endif
+  #ifndef __RESTRICT
+    #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+    #define __RESTRICT
+  #endif
+  #ifndef __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER() (void)0
+  #endif
+
+
+/*
+ * COSMIC Compiler
+ */
+#elif defined ( __CSMC__ )
+   #include <cmsis_csm.h>
+
+  #ifndef __ASM
+    #define __ASM _asm
+  #endif
+  #ifndef __INLINE
+    #define __INLINE inline
+  #endif
+  #ifndef __STATIC_INLINE
+    #define __STATIC_INLINE static inline
+  #endif
+  #ifndef __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE __STATIC_INLINE
+  #endif
+  #ifndef __NO_RETURN
+    // NO RETURN is automatically detected hence no warning here
+    #define __NO_RETURN
+  #endif
+  #ifndef __USED
+    #warning No compiler specific solution for __USED. __USED is ignored.
+ #define __USED + #endif + #ifndef __WEAK + #define __WEAK __weak + #endif + #ifndef __PACKED + #define __PACKED @packed + #endif + #ifndef __PACKED_STRUCT + #define __PACKED_STRUCT @packed struct + #endif + #ifndef __PACKED_UNION + #define __PACKED_UNION @packed union + #endif + #ifndef __UNALIGNED_UINT32 /* deprecated */ + @packed struct T_UINT32 { uint32_t v; }; + #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v) + #endif + #ifndef __UNALIGNED_UINT16_WRITE + __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; }; + #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val)) + #endif + #ifndef __UNALIGNED_UINT16_READ + __PACKED_STRUCT T_UINT16_READ { uint16_t v; }; + #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v) + #endif + #ifndef __UNALIGNED_UINT32_WRITE + __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; }; + #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val)) + #endif + #ifndef __UNALIGNED_UINT32_READ + __PACKED_STRUCT T_UINT32_READ { uint32_t v; }; + #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v) + #endif + #ifndef __ALIGNED + #warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored. + #define __ALIGNED(x) + #endif + #ifndef __RESTRICT + #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored. + #define __RESTRICT + #endif + #ifndef __COMPILER_BARRIER + #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored. + #define __COMPILER_BARRIER() (void)0 + #endif + + +#else + #error Unknown compiler. +#endif + + +#endif /* __CMSIS_COMPILER_H */ + diff --git a/APP_Framework/Framework/knowing/cmsis_5/Core/Include/cmsis_gcc.h b/APP_Framework/Framework/knowing/cmsis_5/Core/Include/cmsis_gcc.h new file mode 100644 index 000000000..dd63ba6e2 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/Core/Include/cmsis_gcc.h @@ -0,0 +1,2209 @@ +/**************************************************************************//** + * @file cmsis_gcc.h + * @brief CMSIS compiler GCC header file + * @version V5.4.0 + * @date 19. March 2021 + ******************************************************************************/ +/* + * Copyright (c) 2009-2021 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __CMSIS_GCC_H +#define __CMSIS_GCC_H + +/* ignore some GCC warnings */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wunused-parameter" + +/* Fallback for __has_builtin */ +#ifndef __has_builtin + #define __has_builtin(x) (0) +#endif + +/* CMSIS compiler specific defines */ +#ifndef __ASM + #define __ASM __asm +#endif +#ifndef __INLINE + #define __INLINE inline +#endif +#ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline +#endif +#ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline +#endif +#ifndef __NO_RETURN + #define __NO_RETURN __attribute__((__noreturn__)) +#endif +#ifndef __USED + #define __USED __attribute__((used)) +#endif +#ifndef __WEAK + #define __WEAK __attribute__((weak)) +#endif +#ifndef __PACKED + #define __PACKED __attribute__((packed, aligned(1))) +#endif +#ifndef __PACKED_STRUCT + #define __PACKED_STRUCT struct __attribute__((packed, aligned(1))) +#endif +#ifndef __PACKED_UNION + #define __PACKED_UNION union __attribute__((packed, aligned(1))) +#endif +#ifndef __UNALIGNED_UINT32 /* deprecated */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + struct __attribute__((packed)) T_UINT32 { uint32_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v) +#endif +#ifndef __UNALIGNED_UINT16_WRITE + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val)) +#endif +#ifndef __UNALIGNED_UINT16_READ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + __PACKED_STRUCT T_UINT16_READ { uint16_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v) +#endif +#ifndef __UNALIGNED_UINT32_WRITE + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val)) +#endif +#ifndef __UNALIGNED_UINT32_READ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + __PACKED_STRUCT T_UINT32_READ { uint32_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v) +#endif +#ifndef __ALIGNED + #define __ALIGNED(x) __attribute__((aligned(x))) +#endif +#ifndef __RESTRICT + #define __RESTRICT __restrict +#endif +#ifndef __COMPILER_BARRIER + #define __COMPILER_BARRIER() __ASM volatile("":::"memory") +#endif + +/* ######################### Startup and Lowlevel Init ######################## */ + +#ifndef __PROGRAM_START + +/** + \brief Initializes data and bss sections + \details This default implementations initialized all data and additional bss + sections relying on .copy.table and .zero.table specified properly + in the used linker script. 
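+
+           An illustrative sketch (not from the original header) of how a
+           minimal GCC startup file hands control to this bootstrap; names
+           such as SystemInit follow the usual CMSIS device-template
+           conventions:
+           \code
+             void Reset_Handler(void)
+             {
+               SystemInit();          /* device low-level init */
+               __PROGRAM_START();     /* copy .data, zero .bss, then _start() */
+             }
+           \endcode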
+
+ */
+__STATIC_FORCEINLINE __NO_RETURN void __cmsis_start(void)
+{
+  extern void _start(void) __NO_RETURN;
+
+  typedef struct {
+    uint32_t const* src;
+    uint32_t* dest;
+    uint32_t  wlen;
+  } __copy_table_t;
+
+  typedef struct {
+    uint32_t* dest;
+    uint32_t  wlen;
+  } __zero_table_t;
+
+  extern const __copy_table_t __copy_table_start__;
+  extern const __copy_table_t __copy_table_end__;
+  extern const __zero_table_t __zero_table_start__;
+  extern const __zero_table_t __zero_table_end__;
+
+  for (__copy_table_t const* pTable = &__copy_table_start__; pTable < &__copy_table_end__; ++pTable) {
+    for(uint32_t i=0u; i<pTable->wlen; ++i) {
+      pTable->dest[i] = pTable->src[i];
+    }
+  }
+
+  for (__zero_table_t const* pTable = &__zero_table_start__; pTable < &__zero_table_end__; ++pTable) {
+    for(uint32_t i=0u; i<pTable->wlen; ++i) {
+      pTable->dest[i] = 0u;
+    }
+  }
+
+  _start();
+}
+
+#define __PROGRAM_START           __cmsis_start
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP              __StackTop
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT             __StackLimit
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE            __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE  __attribute__((used, section(".vectors")))
+#endif
+
+#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
+#ifndef __STACK_SEAL
+#define __STACK_SEAL              __StackSeal
+#endif
+
+#ifndef __TZ_STACK_SEAL_SIZE
+#define __TZ_STACK_SEAL_SIZE      8U
+#endif
+
+#ifndef __TZ_STACK_SEAL_VALUE
+#define __TZ_STACK_SEAL_VALUE     0xFEF5EDA5FEF5EDA5ULL
+#endif
+
+
+__STATIC_FORCEINLINE void __TZ_set_STACKSEAL_S (uint32_t* stackTop) {
+  *((uint64_t *)stackTop) = __TZ_STACK_SEAL_VALUE;
+}
+#endif
+
+
+/* ###########################  Core Function Access  ########################### */
+/** \ingroup  CMSIS_Core_FunctionInterface
+    \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+  @{
+ */
+
+/**
+  \brief   Enable IRQ Interrupts
+  \details Enables IRQ interrupts by clearing special-purpose register PRIMASK.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_irq(void)
+{
+  __ASM volatile ("cpsie i" : : : "memory");
+}
+
+
+/**
+  \brief   Disable IRQ Interrupts
+  \details Disables IRQ interrupts by setting special-purpose register PRIMASK.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_irq(void)
+{
+  __ASM volatile ("cpsid i" : : : "memory");
+}
+
+
+/**
+  \brief   Get Control Register
+  \details Returns the content of the Control Register.
+  \return               Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_CONTROL(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Control Register (non-secure)
+  \details Returns the content of the non-secure Control Register when in secure mode.
+  \return               non-secure Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Control Register
+  \details Writes the given value to the Control Register.
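+           Illustrative sketch (not from the original header): drop Thread
+           mode to unprivileged execution and make the change take effect:
+           \code
+             uint32_t ctrl = __get_CONTROL();
+             ctrl |= 1U;              /* CONTROL.nPRIV = 1: unprivileged Thread mode */
+             __set_CONTROL(ctrl);
+             __ISB();                 /* flush pipeline after writing CONTROL */
+           \endcode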
+ \param [in] control Control Register value to set + */ +__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control) +{ + __ASM volatile ("MSR control, %0" : : "r" (control) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Control Register (non-secure) + \details Writes the given value to the non-secure Control Register when in secure state. + \param [in] control Control Register value to set + */ +__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control) +{ + __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory"); +} +#endif + + +/** + \brief Get IPSR Register + \details Returns the content of the IPSR Register. + \return IPSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_IPSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, ipsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get APSR Register + \details Returns the content of the APSR Register. + \return APSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_APSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, apsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get xPSR Register + \details Returns the content of the xPSR Register. + \return xPSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_xPSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, xpsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get Process Stack Pointer + \details Returns the current value of the Process Stack Pointer (PSP). + \return PSP Register value + */ +__STATIC_FORCEINLINE uint32_t __get_PSP(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, psp" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Process Stack Pointer (non-secure) + \details Returns the current value of the non-secure Process Stack Pointer (PSP) when in secure state. + \return PSP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, psp_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Process Stack Pointer + \details Assigns the given value to the Process Stack Pointer (PSP). + \param [in] topOfProcStack Process Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack) +{ + __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : ); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Process Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Process Stack Pointer (PSP) when in secure state. + \param [in] topOfProcStack Process Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack) +{ + __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : ); +} +#endif + + +/** + \brief Get Main Stack Pointer + \details Returns the current value of the Main Stack Pointer (MSP). + \return MSP Register value + */ +__STATIC_FORCEINLINE uint32_t __get_MSP(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, msp" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Main Stack Pointer (non-secure) + \details Returns the current value of the non-secure Main Stack Pointer (MSP) when in secure state. 
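+           Illustrative sketch (not from the original header; NS_VTOR_BASE is
+           a placeholder for the non-secure vector table address): secure boot
+           code can seed the non-secure main stack before starting the
+           non-secure image:
+           \code
+             uint32_t const *vtor = (uint32_t const *)NS_VTOR_BASE;
+             __TZ_set_MSP_NS(vtor[0]);   /* word 0 holds the NS initial MSP */
+           \endcode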
+ \return MSP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, msp_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Main Stack Pointer + \details Assigns the given value to the Main Stack Pointer (MSP). + \param [in] topOfMainStack Main Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack) +{ + __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : ); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Main Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state. + \param [in] topOfMainStack Main Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack) +{ + __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : ); +} +#endif + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Stack Pointer (non-secure) + \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state. + \return SP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, sp_ns" : "=r" (result) ); + return(result); +} + + +/** + \brief Set Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state. + \param [in] topOfStack Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack) +{ + __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : ); +} +#endif + + +/** + \brief Get Priority Mask + \details Returns the current state of the priority mask bit from the Priority Mask Register. + \return Priority Mask value + */ +__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, primask" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Priority Mask (non-secure) + \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state. + \return Priority Mask value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, primask_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Priority Mask + \details Assigns the given value to the Priority Mask Register. + \param [in] priMask Priority Mask + */ +__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask) +{ + __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Priority Mask (non-secure) + \details Assigns the given value to the non-secure Priority Mask Register when in secure state. + \param [in] priMask Priority Mask + */ +__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask) +{ + __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory"); +} +#endif + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) +/** + \brief Enable FIQ + \details Enables FIQ interrupts by clearing special-purpose register FAULTMASK. + Can only be executed in Privileged modes. 
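+           Illustrative sketch (not from the original header): bracket a
+           region that must not be preempted even by configurable-priority
+           fault handlers:
+           \code
+             __disable_fault_irq();   /* set FAULTMASK */
+             /* critical region */
+             __enable_fault_irq();    /* clear FAULTMASK */
+           \endcode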
+ */ +__STATIC_FORCEINLINE void __enable_fault_irq(void) +{ + __ASM volatile ("cpsie f" : : : "memory"); +} + + +/** + \brief Disable FIQ + \details Disables FIQ interrupts by setting special-purpose register FAULTMASK. + Can only be executed in Privileged modes. + */ +__STATIC_FORCEINLINE void __disable_fault_irq(void) +{ + __ASM volatile ("cpsid f" : : : "memory"); +} + + +/** + \brief Get Base Priority + \details Returns the current value of the Base Priority register. + \return Base Priority register value + */ +__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, basepri" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Base Priority (non-secure) + \details Returns the current value of the non-secure Base Priority register when in secure state. + \return Base Priority register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Base Priority + \details Assigns the given value to the Base Priority register. + \param [in] basePri Base Priority value to set + */ +__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri) +{ + __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Base Priority (non-secure) + \details Assigns the given value to the non-secure Base Priority register when in secure state. + \param [in] basePri Base Priority value to set + */ +__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri) +{ + __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory"); +} +#endif + + +/** + \brief Set Base Priority with condition + \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled, + or the new value increases the BASEPRI priority level. + \param [in] basePri Base Priority value to set + */ +__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri) +{ + __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory"); +} + + +/** + \brief Get Fault Mask + \details Returns the current value of the Fault Mask register. + \return Fault Mask register value + */ +__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, faultmask" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Fault Mask (non-secure) + \details Returns the current value of the non-secure Fault Mask register when in secure state. + \return Fault Mask register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Fault Mask + \details Assigns the given value to the Fault Mask register. + \param [in] faultMask Fault Mask value to set + */ +__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask) +{ + __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Fault Mask (non-secure) + \details Assigns the given value to the non-secure Fault Mask register when in secure state. 
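+           Illustrative sketch for the BASEPRI accessors defined earlier in
+           this section (not from the original header): temporarily mask all
+           interrupts whose priority value is 0x80 or higher, then restore the
+           previous level:
+           \code
+             uint32_t prev = __get_BASEPRI();
+             __set_BASEPRI_MAX(0x80U);   /* only tightens the masking level */
+             /* time-critical region */
+             __set_BASEPRI(prev);
+           \endcode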
+ \param [in] faultMask Fault Mask value to set + */ +__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask) +{ + __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory"); +} +#endif + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) + +/** + \brief Get Process Stack Pointer Limit + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is returned always in non-secure + mode. + + \details Returns the current value of the Process Stack Pointer Limit (PSPLIM). + \return PSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure PSPLIM is RAZ/WI + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, psplim" : "=r" (result) ); + return result; +#endif +} + +#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Process Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is returned always. + + \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state. + \return PSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))) + // without main extensions, the non-secure PSPLIM is RAZ/WI + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, psplim_ns" : "=r" (result) ); + return result; +#endif +} +#endif + + +/** + \brief Set Process Stack Pointer Limit + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored in non-secure + mode. + + \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM). + \param [in] ProcStackPtrLimit Process Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure PSPLIM is RAZ/WI + (void)ProcStackPtrLimit; +#else + __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit)); +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Process Stack Pointer (non-secure) + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored. + + \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state. 
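+           Illustrative sketch (not from the original header; task_stack and
+           STACK_WORDS are placeholders): program the limit before pointing
+           PSP at a task stack, so an overflow raises a fault instead of
+           silently corrupting memory:
+           \code
+             static uint32_t task_stack[STACK_WORDS];
+             __set_PSPLIM((uint32_t)&task_stack[0]);            /* lowest valid address */
+             __set_PSP   ((uint32_t)&task_stack[STACK_WORDS]);  /* initial top of stack */
+           \endcode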
+ \param [in] ProcStackPtrLimit Process Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))) + // without main extensions, the non-secure PSPLIM is RAZ/WI + (void)ProcStackPtrLimit; +#else + __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit)); +#endif +} +#endif + + +/** + \brief Get Main Stack Pointer Limit + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is returned always in non-secure + mode. + + \details Returns the current value of the Main Stack Pointer Limit (MSPLIM). + \return MSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure MSPLIM is RAZ/WI + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, msplim" : "=r" (result) ); + return result; +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Main Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is returned always. + + \details Returns the current value of the non-secure Main Stack Pointer Limit(MSPLIM) when in secure state. + \return MSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))) + // without main extensions, the non-secure MSPLIM is RAZ/WI + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) ); + return result; +#endif +} +#endif + + +/** + \brief Set Main Stack Pointer Limit + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored in non-secure + mode. + + \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM). + \param [in] MainStackPtrLimit Main Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure MSPLIM is RAZ/WI + (void)MainStackPtrLimit; +#else + __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit)); +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Main Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored. + + \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state. 
+ \param [in] MainStackPtrLimit Main Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))) + // without main extensions, the non-secure MSPLIM is RAZ/WI + (void)MainStackPtrLimit; +#else + __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit)); +#endif +} +#endif + +#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */ + + +/** + \brief Get FPSCR + \details Returns the current value of the Floating Point Status/Control register. + \return Floating Point Status/Control register value + */ +__STATIC_FORCEINLINE uint32_t __get_FPSCR(void) +{ +#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \ + (defined (__FPU_USED ) && (__FPU_USED == 1U)) ) +#if __has_builtin(__builtin_arm_get_fpscr) +// Re-enable using built-in when GCC has been fixed +// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2) + /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */ + return __builtin_arm_get_fpscr(); +#else + uint32_t result; + + __ASM volatile ("VMRS %0, fpscr" : "=r" (result) ); + return(result); +#endif +#else + return(0U); +#endif +} + + +/** + \brief Set FPSCR + \details Assigns the given value to the Floating Point Status/Control register. + \param [in] fpscr Floating Point Status/Control value to set + */ +__STATIC_FORCEINLINE void __set_FPSCR(uint32_t fpscr) +{ +#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \ + (defined (__FPU_USED ) && (__FPU_USED == 1U)) ) +#if __has_builtin(__builtin_arm_set_fpscr) +// Re-enable using built-in when GCC has been fixed +// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2) + /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */ + __builtin_arm_set_fpscr(fpscr); +#else + __ASM volatile ("VMSR fpscr, %0" : : "r" (fpscr) : "vfpcc", "memory"); +#endif +#else + (void)fpscr; +#endif +} + + +/*@} end of CMSIS_Core_RegAccFunctions */ + + +/* ########################## Core Instruction Access ######################### */ +/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface + Access to dedicated instructions + @{ +*/ + +/* Define macros for porting to both thumb1 and thumb2. + * For thumb1, use low register (r0-r7), specified by constraint "l" + * Otherwise, use general registers, specified by constraint "r" */ +#if defined (__thumb__) && !defined (__thumb2__) +#define __CMSIS_GCC_OUT_REG(r) "=l" (r) +#define __CMSIS_GCC_RW_REG(r) "+l" (r) +#define __CMSIS_GCC_USE_REG(r) "l" (r) +#else +#define __CMSIS_GCC_OUT_REG(r) "=r" (r) +#define __CMSIS_GCC_RW_REG(r) "+r" (r) +#define __CMSIS_GCC_USE_REG(r) "r" (r) +#endif + +/** + \brief No Operation + \details No Operation does nothing. This instruction can be used for code alignment purposes. + */ +#define __NOP() __ASM volatile ("nop") + +/** + \brief Wait For Interrupt + \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs. + */ +#define __WFI() __ASM volatile ("wfi":::"memory") + + +/** + \brief Wait For Event + \details Wait For Event is a hint instruction that permits the processor to enter + a low-power state until one of a number of events occurs. + */ +#define __WFE() __ASM volatile ("wfe":::"memory") + + +/** + \brief Send Event + \details Send Event is a hint instruction. It causes an event to be signaled to the CPU. 
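+           Illustrative sketch (not from the original header; 'flag' is a
+           placeholder volatile variable set by an interrupt handler or by
+           another core that then executes __SEV()):
+           \code
+             while (flag == 0U) {
+               __WFE();   /* sleep until an event or interrupt arrives */
+             }
+           \endcode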
+ */ +#define __SEV() __ASM volatile ("sev") + + +/** + \brief Instruction Synchronization Barrier + \details Instruction Synchronization Barrier flushes the pipeline in the processor, + so that all instructions following the ISB are fetched from cache or memory, + after the instruction has been completed. + */ +__STATIC_FORCEINLINE void __ISB(void) +{ + __ASM volatile ("isb 0xF":::"memory"); +} + + +/** + \brief Data Synchronization Barrier + \details Acts as a special kind of Data Memory Barrier. + It completes when all explicit memory accesses before this instruction complete. + */ +__STATIC_FORCEINLINE void __DSB(void) +{ + __ASM volatile ("dsb 0xF":::"memory"); +} + + +/** + \brief Data Memory Barrier + \details Ensures the apparent order of the explicit memory operations before + and after the instruction, without ensuring their completion. + */ +__STATIC_FORCEINLINE void __DMB(void) +{ + __ASM volatile ("dmb 0xF":::"memory"); +} + + +/** + \brief Reverse byte order (32 bit) + \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412. + \param [in] value Value to reverse + \return Reversed value + */ +__STATIC_FORCEINLINE uint32_t __REV(uint32_t value) +{ +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) + return __builtin_bswap32(value); +#else + uint32_t result; + + __ASM ("rev %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return result; +#endif +} + + +/** + \brief Reverse byte order (16 bit) + \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856. + \param [in] value Value to reverse + \return Reversed value + */ +__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value) +{ + uint32_t result; + + __ASM ("rev16 %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return result; +} + + +/** + \brief Reverse byte order (16 bit) + \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000. + \param [in] value Value to reverse + \return Reversed value + */ +__STATIC_FORCEINLINE int16_t __REVSH(int16_t value) +{ +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + return (int16_t)__builtin_bswap16(value); +#else + int16_t result; + + __ASM ("revsh %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return result; +#endif +} + + +/** + \brief Rotate Right in unsigned value (32 bit) + \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits. + \param [in] op1 Value to rotate + \param [in] op2 Number of Bits to rotate + \return Rotated value + */ +__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2) +{ + op2 %= 32U; + if (op2 == 0U) + { + return op1; + } + return (op1 >> op2) | (op1 << (32U - op2)); +} + + +/** + \brief Breakpoint + \details Causes the processor to enter Debug state. + Debug tools can use this to investigate system state when the instruction at a particular address is reached. + \param [in] value is ignored by the processor. + If required, a debugger can use it to store additional information about the breakpoint. + */ +#define __BKPT(value) __ASM volatile ("bkpt "#value) + + +/** + \brief Reverse bit order of value + \details Reverses the bit order of the given value. 
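+           For example, __RBIT(0x00000001U) returns 0x80000000U and
+           __RBIT(0xA0000000U) returns 0x00000005U.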
+ \param [in] value Value to reverse + \return Reversed value + */ +__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value) +{ + uint32_t result; + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) + __ASM ("rbit %0, %1" : "=r" (result) : "r" (value) ); +#else + uint32_t s = (4U /*sizeof(v)*/ * 8U) - 1U; /* extra shift needed at end */ + + result = value; /* r will be reversed bits of v; first get LSB of v */ + for (value >>= 1U; value != 0U; value >>= 1U) + { + result <<= 1U; + result |= value & 1U; + s--; + } + result <<= s; /* shift when v's highest bits are zero */ +#endif + return result; +} + + +/** + \brief Count leading zeros + \details Counts the number of leading zeros of a data value. + \param [in] value Value to count the leading zeros + \return number of leading zeros in value + */ +__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value) +{ + /* Even though __builtin_clz produces a CLZ instruction on ARM, formally + __builtin_clz(0) is undefined behaviour, so handle this case specially. + This guarantees ARM-compatible results if happening to compile on a non-ARM + target, and ensures the compiler doesn't decide to activate any + optimisations using the logic "value was passed to __builtin_clz, so it + is non-zero". + ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a + single CLZ instruction. + */ + if (value == 0U) + { + return 32U; + } + return __builtin_clz(value); +} + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) +/** + \brief LDR Exclusive (8 bit) + \details Executes a exclusive LDR instruction for 8 bit value. + \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDREXB(volatile uint8_t *addr) +{ + uint32_t result; + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + __ASM volatile ("ldrexb %0, %1" : "=r" (result) : "Q" (*addr) ); +#else + /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not + accepted by assembler. So has to use following less efficient pattern. + */ + __ASM volatile ("ldrexb %0, [%1]" : "=r" (result) : "r" (addr) : "memory" ); +#endif + return ((uint8_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDR Exclusive (16 bit) + \details Executes a exclusive LDR instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDREXH(volatile uint16_t *addr) +{ + uint32_t result; + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + __ASM volatile ("ldrexh %0, %1" : "=r" (result) : "Q" (*addr) ); +#else + /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not + accepted by assembler. So has to use following less efficient pattern. + */ + __ASM volatile ("ldrexh %0, [%1]" : "=r" (result) : "r" (addr) : "memory" ); +#endif + return ((uint16_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDR Exclusive (32 bit) + \details Executes a exclusive LDR instruction for 32 bit values. 
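+           Illustrative sketch (not from the original header; 'counter' is a
+           placeholder): the classic lock-free increment built from the
+           exclusive pair:
+           \code
+             uint32_t old;
+             do {
+               old = __LDREXW(&counter);
+             } while (__STREXW(old + 1U, &counter) != 0U);   /* retry on lost reservation */
+           \endcode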
+ \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDREXW(volatile uint32_t *addr) +{ + uint32_t result; + + __ASM volatile ("ldrex %0, %1" : "=r" (result) : "Q" (*addr) ); + return(result); +} + + +/** + \brief STR Exclusive (8 bit) + \details Executes a exclusive STR instruction for 8 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STREXB(uint8_t value, volatile uint8_t *addr) +{ + uint32_t result; + + __ASM volatile ("strexb %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) ); + return(result); +} + + +/** + \brief STR Exclusive (16 bit) + \details Executes a exclusive STR instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STREXH(uint16_t value, volatile uint16_t *addr) +{ + uint32_t result; + + __ASM volatile ("strexh %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) ); + return(result); +} + + +/** + \brief STR Exclusive (32 bit) + \details Executes a exclusive STR instruction for 32 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STREXW(uint32_t value, volatile uint32_t *addr) +{ + uint32_t result; + + __ASM volatile ("strex %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" (value) ); + return(result); +} + + +/** + \brief Remove the exclusive lock + \details Removes the exclusive lock which is created by LDREX. + */ +__STATIC_FORCEINLINE void __CLREX(void) +{ + __ASM volatile ("clrex" ::: "memory"); +} + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) +/** + \brief Signed Saturate + \details Saturates a signed value. + \param [in] ARG1 Value to be saturated + \param [in] ARG2 Bit position to saturate to (1..32) + \return Saturated value + */ +#define __SSAT(ARG1, ARG2) \ +__extension__ \ +({ \ + int32_t __RES, __ARG1 = (ARG1); \ + __ASM volatile ("ssat %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \ + __RES; \ + }) + + +/** + \brief Unsigned Saturate + \details Saturates an unsigned value. + \param [in] ARG1 Value to be saturated + \param [in] ARG2 Bit position to saturate to (0..31) + \return Saturated value + */ +#define __USAT(ARG1, ARG2) \ +__extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1); \ + __ASM volatile ("usat %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \ + __RES; \ + }) + + +/** + \brief Rotate Right with Extend (32 bit) + \details Moves each bit of a bitstring right by one bit. + The carry input is shifted in at the left end of the bitstring. 
+ \param [in] value Value to rotate + \return Rotated value + */ +__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value) +{ + uint32_t result; + + __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return(result); +} + + +/** + \brief LDRT Unprivileged (8 bit) + \details Executes a Unprivileged LDRT instruction for 8 bit value. + \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr) +{ + uint32_t result; + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) ); +#else + /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not + accepted by assembler. So has to use following less efficient pattern. + */ + __ASM volatile ("ldrbt %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" ); +#endif + return ((uint8_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDRT Unprivileged (16 bit) + \details Executes a Unprivileged LDRT instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr) +{ + uint32_t result; + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) ); +#else + /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not + accepted by assembler. So has to use following less efficient pattern. + */ + __ASM volatile ("ldrht %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" ); +#endif + return ((uint16_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDRT Unprivileged (32 bit) + \details Executes a Unprivileged LDRT instruction for 32 bit values. + \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) ); + return(result); +} + + +/** + \brief STRT Unprivileged (8 bit) + \details Executes a Unprivileged STRT instruction for 8 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr) +{ + __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) ); +} + + +/** + \brief STRT Unprivileged (16 bit) + \details Executes a Unprivileged STRT instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr) +{ + __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) ); +} + + +/** + \brief STRT Unprivileged (32 bit) + \details Executes a Unprivileged STRT instruction for 32 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr) +{ + __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) ); +} + +#else /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */ + +/** + \brief Signed Saturate + \details Saturates a signed value. 
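+           For example, __SSAT(300, 8) returns 127 and __SSAT(-300, 8)
+           returns -128.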
+ \param [in] value Value to be saturated + \param [in] sat Bit position to saturate to (1..32) + \return Saturated value + */ +__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat) +{ + if ((sat >= 1U) && (sat <= 32U)) + { + const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U); + const int32_t min = -1 - max ; + if (val > max) + { + return max; + } + else if (val < min) + { + return min; + } + } + return val; +} + +/** + \brief Unsigned Saturate + \details Saturates an unsigned value. + \param [in] value Value to be saturated + \param [in] sat Bit position to saturate to (0..31) + \return Saturated value + */ +__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat) +{ + if (sat <= 31U) + { + const uint32_t max = ((1U << sat) - 1U); + if (val > (int32_t)max) + { + return max; + } + else if (val < 0) + { + return 0U; + } + } + return (uint32_t)val; +} + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) +/** + \brief Load-Acquire (8 bit) + \details Executes a LDAB instruction for 8 bit value. + \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint8_t) result); +} + + +/** + \brief Load-Acquire (16 bit) + \details Executes a LDAH instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint16_t) result); +} + + +/** + \brief Load-Acquire (32 bit) + \details Executes a LDA instruction for 32 bit values. + \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return(result); +} + + +/** + \brief Store-Release (8 bit) + \details Executes a STLB instruction for 8 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr) +{ + __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Store-Release (16 bit) + \details Executes a STLH instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr) +{ + __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Store-Release (32 bit) + \details Executes a STL instruction for 32 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr) +{ + __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Load-Acquire Exclusive (8 bit) + \details Executes a LDAB exclusive instruction for 8 bit value. 
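+           Illustrative sketch (not from the original header; 'lock' is a
+           placeholder byte, 0 = free): a minimal spinlock built from the
+           acquire/release exclusive pair:
+           \code
+             do {
+               while (__LDAEXB(&lock) != 0U) { __WFE(); }  /* wait until free */
+             } while (__STLEXB(1U, &lock) != 0U);          /* try to claim it */
+             /* critical section */
+             __STLB(0U, &lock);                            /* release */
+           \endcode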
+ \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDAEXB(volatile uint8_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldaexb %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint8_t) result); +} + + +/** + \brief Load-Acquire Exclusive (16 bit) + \details Executes a LDAH exclusive instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDAEXH(volatile uint16_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldaexh %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint16_t) result); +} + + +/** + \brief Load-Acquire Exclusive (32 bit) + \details Executes a LDA exclusive instruction for 32 bit values. + \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDAEX(volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldaex %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return(result); +} + + +/** + \brief Store-Release Exclusive (8 bit) + \details Executes a STLB exclusive instruction for 8 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STLEXB(uint8_t value, volatile uint8_t *ptr) +{ + uint32_t result; + + __ASM volatile ("stlexb %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); + return(result); +} + + +/** + \brief Store-Release Exclusive (16 bit) + \details Executes a STLH exclusive instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STLEXH(uint16_t value, volatile uint16_t *ptr) +{ + uint32_t result; + + __ASM volatile ("stlexh %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); + return(result); +} + + +/** + \brief Store-Release Exclusive (32 bit) + \details Executes a STL exclusive instruction for 32 bit values. 
+ \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STLEX(uint32_t value, volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("stlex %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); + return(result); +} + +#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */ + +/*@}*/ /* end of group CMSIS_Core_InstructionInterface */ + + +/* ################### Compiler specific Intrinsics ########################### */ +/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics + Access to dedicated SIMD instructions + @{ +*/ + +#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) + +__STATIC_FORCEINLINE uint32_t __SADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + + +__STATIC_FORCEINLINE uint32_t __SSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + + +__STATIC_FORCEINLINE uint32_t __SADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t 
op2) +{ + uint32_t result; + + __ASM ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USAX(uint32_t 
op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USAD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +#define __SSAT16(ARG1, ARG2) \ +__extension__ \ +({ \ + int32_t __RES, __ARG1 = (ARG1); \ + __ASM volatile ("ssat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \ + __RES; \ + }) + +#define __USAT16(ARG1, ARG2) \ +__extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1); \ + __ASM volatile ("usat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \ + __RES; \ + }) + +__STATIC_FORCEINLINE uint32_t __UXTB16(uint32_t op1) +{ + uint32_t result; + + __ASM ("uxtb16 %0, %1" : "=r" (result) : "r" (op1)); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1) +{ + uint32_t result; + + __ASM ("sxtb16 %0, %1" : "=r" (result) : "r" (op1)); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SXTB16_RORn(uint32_t op1, uint32_t rotate) +{ + uint32_t result; + if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U))) { + __ASM volatile ("sxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (rotate) ); + } else { + result = __SXTB16(__ROR(op1, rotate)) ; + } + return result; +} + +__STATIC_FORCEINLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate) +{ + uint32_t result; + if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U))) { + __ASM volatile ("sxtab16 %0, %1, %2, ROR %3" : "=r" (result) : "r" (op1) , "r" (op2) , "i" (rotate)); + } else { + result = __SXTAB16(op1, __ROR(op2, rotate)); + } + return result; +} + + +__STATIC_FORCEINLINE uint32_t __SMUAD (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), 
"r" (op2), "r" (op3) ); + return(result); +} + +__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc) +{ + union llreg_u{ + uint32_t w32[2]; + uint64_t w64; + } llr; + llr.w64 = acc; + +#ifndef __ARMEB__ /* Little endian */ + __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) ); +#else /* Big endian */ + __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) ); +#endif + + return(llr.w64); +} + +__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc) +{ + union llreg_u{ + uint32_t w32[2]; + uint64_t w64; + } llr; + llr.w64 = acc; + +#ifndef __ARMEB__ /* Little endian */ + __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) ); +#else /* Big endian */ + __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) ); +#endif + + return(llr.w64); +} + +__STATIC_FORCEINLINE uint32_t __SMUSD (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +__STATIC_FORCEINLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc) +{ + union llreg_u{ + uint32_t w32[2]; + uint64_t w64; + } llr; + llr.w64 = acc; + +#ifndef __ARMEB__ /* Little endian */ + __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) ); +#else /* Big endian */ + __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) ); +#endif + + return(llr.w64); +} + +__STATIC_FORCEINLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc) +{ + union llreg_u{ + uint32_t w32[2]; + uint64_t w64; + } llr; + llr.w64 = acc; + +#ifndef __ARMEB__ /* Little endian */ + __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) ); +#else /* Big endian */ + __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) ); +#endif + + return(llr.w64); +} + +__STATIC_FORCEINLINE uint32_t __SEL (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE int32_t __QADD( int32_t op1, int32_t op2) +{ + int32_t result; + + __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE int32_t __QSUB( int32_t op1, int32_t op2) +{ + int32_t result; + + 
__ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + + +#define __PKHBT(ARG1,ARG2,ARG3) \ +__extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + __ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \ + __RES; \ + }) + +#define __PKHTB(ARG1,ARG2,ARG3) \ +__extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + if (ARG3 == 0) \ + __ASM ("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \ + else \ + __ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \ + __RES; \ + }) + + +__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3) +{ + int32_t result; + + __ASM ("smmla %0, %1, %2, %3" : "=r" (result): "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +#endif /* (__ARM_FEATURE_DSP == 1) */ +/*@} end of group CMSIS_SIMD_intrinsics */ + + +#pragma GCC diagnostic pop + +#endif /* __CMSIS_GCC_H */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/Core/Include/core_cm4.h b/APP_Framework/Framework/knowing/cmsis_5/Core/Include/core_cm4.h new file mode 100644 index 000000000..e21cd1492 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/Core/Include/core_cm4.h @@ -0,0 +1,2129 @@ +/**************************************************************************//** + * @file core_cm4.h + * @brief CMSIS Cortex-M4 Core Peripheral Access Layer Header File + * @version V5.1.2 + * @date 04. June 2021 + ******************************************************************************/ +/* + * Copyright (c) 2009-2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined ( __ICCARM__ ) + #pragma system_include /* treat file as system include file for MISRA check */ +#elif defined (__clang__) + #pragma clang system_header /* treat file as system include file */ +#endif + +#ifndef __CORE_CM4_H_GENERIC +#define __CORE_CM4_H_GENERIC + +#include + +#ifdef __cplusplus + extern "C" { +#endif + +/** + \page CMSIS_MISRA_Exceptions MISRA-C:2004 Compliance Exceptions + CMSIS violates the following MISRA-C:2004 rules: + + \li Required Rule 8.5, object/function definition in header file.
+ Function definitions in header files are used to allow 'inlining'. + + \li Required Rule 18.4, declaration of union type or object of union type: '{...}'.
+ Unions are used for effective representation of core registers. + + \li Advisory Rule 19.7, Function-like macro defined.
+ Function-like macros are used to allow more efficient code. + */ + + +/******************************************************************************* + * CMSIS definitions + ******************************************************************************/ +/** + \ingroup Cortex_M4 + @{ + */ + +#include "cmsis_version.h" + +/* CMSIS CM4 definitions */ +#define __CM4_CMSIS_VERSION_MAIN (__CM_CMSIS_VERSION_MAIN) /*!< \deprecated [31:16] CMSIS HAL main version */ +#define __CM4_CMSIS_VERSION_SUB (__CM_CMSIS_VERSION_SUB) /*!< \deprecated [15:0] CMSIS HAL sub version */ +#define __CM4_CMSIS_VERSION ((__CM4_CMSIS_VERSION_MAIN << 16U) | \ + __CM4_CMSIS_VERSION_SUB ) /*!< \deprecated CMSIS HAL version number */ + +#define __CORTEX_M (4U) /*!< Cortex-M Core */ + +/** __FPU_USED indicates whether an FPU is used or not. + For this, __FPU_PRESENT has to be checked prior to making use of FPU specific registers and functions. +*/ +#if defined ( __CC_ARM ) + #if defined __TARGET_FPU_VFP + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #if defined __ARM_FP + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __GNUC__ ) + #if defined (__VFP_FP__) && !defined(__SOFTFP__) + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __ICCARM__ ) + #if defined __ARMVFP__ + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __TI_ARM__ ) + #if defined __TI_VFP_SUPPORT__ + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __TASKING__ ) + #if defined __FPU_VFP__ + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __CSMC__ ) + #if ( __CSMC__ & 0x400U) + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#endif + +#include "cmsis_compiler.h" /* CMSIS compiler specific defines */ + + +#ifdef __cplusplus +} +#endif + +#endif /* __CORE_CM4_H_GENERIC */ + +#ifndef __CMSIS_GENERIC + +#ifndef __CORE_CM4_H_DEPENDANT +#define __CORE_CM4_H_DEPENDANT + +#ifdef __cplusplus + extern "C" { +#endif + +/* check device defines and use defaults */ +#if defined 
__CHECK_DEVICE_DEFINES + #ifndef __CM4_REV + #define __CM4_REV 0x0000U + #warning "__CM4_REV not defined in device header file; using default!" + #endif + + #ifndef __FPU_PRESENT + #define __FPU_PRESENT 0U + #warning "__FPU_PRESENT not defined in device header file; using default!" + #endif + + #ifndef __MPU_PRESENT + #define __MPU_PRESENT 0U + #warning "__MPU_PRESENT not defined in device header file; using default!" + #endif + + #ifndef __VTOR_PRESENT + #define __VTOR_PRESENT 1U + #warning "__VTOR_PRESENT not defined in device header file; using default!" + #endif + + #ifndef __NVIC_PRIO_BITS + #define __NVIC_PRIO_BITS 3U + #warning "__NVIC_PRIO_BITS not defined in device header file; using default!" + #endif + + #ifndef __Vendor_SysTickConfig + #define __Vendor_SysTickConfig 0U + #warning "__Vendor_SysTickConfig not defined in device header file; using default!" + #endif +#endif + +/* IO definitions (access restrictions to peripheral registers) */ +/** + \defgroup CMSIS_glob_defs CMSIS Global Defines + + IO Type Qualifiers are used + \li to specify the access to peripheral variables. + \li for automatic generation of peripheral register debug information. +*/ +#ifdef __cplusplus + #define __I volatile /*!< Defines 'read only' permissions */ +#else + #define __I volatile const /*!< Defines 'read only' permissions */ +#endif +#define __O volatile /*!< Defines 'write only' permissions */ +#define __IO volatile /*!< Defines 'read / write' permissions */ + +/* following defines should be used for structure members */ +#define __IM volatile const /*! Defines 'read only' structure member permissions */ +#define __OM volatile /*! Defines 'write only' structure member permissions */ +#define __IOM volatile /*! Defines 'read / write' structure member permissions */ + +/*@} end of group Cortex_M4 */ + + + +/******************************************************************************* + * Register Abstraction + Core Register contain: + - Core Register + - Core NVIC Register + - Core SCB Register + - Core SysTick Register + - Core Debug Register + - Core MPU Register + - Core FPU Register + ******************************************************************************/ +/** + \defgroup CMSIS_core_register Defines and Type Definitions + \brief Type definitions and defines for Cortex-M processor based devices. +*/ + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_CORE Status and Control Registers + \brief Core Register type definitions. + @{ + */ + +/** + \brief Union type to access the Application Program Status Register (APSR). 
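+          The GE bits are set per lane by the parallel add/subtract
+          instructions and consumed by SEL. Illustrative sketch, assuming the
+          DSP intrinsics from cmsis_gcc.h above are available; max16x2 is a
+          hypothetical helper computing a per-halfword signed maximum:
+          \code
+          static inline uint32_t max16x2(uint32_t a, uint32_t b)
+          {
+            (void)__SSUB16(a, b);   // sets APSR.GE where each signed halfword of a >= b
+            return __SEL(a, b);     // picks lanes of a where GE is set, else lanes of b
+          }
+          \endcode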
+ */ +typedef union +{ + struct + { + uint32_t _reserved0:16; /*!< bit: 0..15 Reserved */ + uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */ + uint32_t _reserved1:7; /*!< bit: 20..26 Reserved */ + uint32_t Q:1; /*!< bit: 27 Saturation condition flag */ + uint32_t V:1; /*!< bit: 28 Overflow condition code flag */ + uint32_t C:1; /*!< bit: 29 Carry condition code flag */ + uint32_t Z:1; /*!< bit: 30 Zero condition code flag */ + uint32_t N:1; /*!< bit: 31 Negative condition code flag */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} APSR_Type; + +/* APSR Register Definitions */ +#define APSR_N_Pos 31U /*!< APSR: N Position */ +#define APSR_N_Msk (1UL << APSR_N_Pos) /*!< APSR: N Mask */ + +#define APSR_Z_Pos 30U /*!< APSR: Z Position */ +#define APSR_Z_Msk (1UL << APSR_Z_Pos) /*!< APSR: Z Mask */ + +#define APSR_C_Pos 29U /*!< APSR: C Position */ +#define APSR_C_Msk (1UL << APSR_C_Pos) /*!< APSR: C Mask */ + +#define APSR_V_Pos 28U /*!< APSR: V Position */ +#define APSR_V_Msk (1UL << APSR_V_Pos) /*!< APSR: V Mask */ + +#define APSR_Q_Pos 27U /*!< APSR: Q Position */ +#define APSR_Q_Msk (1UL << APSR_Q_Pos) /*!< APSR: Q Mask */ + +#define APSR_GE_Pos 16U /*!< APSR: GE Position */ +#define APSR_GE_Msk (0xFUL << APSR_GE_Pos) /*!< APSR: GE Mask */ + + +/** + \brief Union type to access the Interrupt Program Status Register (IPSR). + */ +typedef union +{ + struct + { + uint32_t ISR:9; /*!< bit: 0.. 8 Exception number */ + uint32_t _reserved0:23; /*!< bit: 9..31 Reserved */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} IPSR_Type; + +/* IPSR Register Definitions */ +#define IPSR_ISR_Pos 0U /*!< IPSR: ISR Position */ +#define IPSR_ISR_Msk (0x1FFUL /*<< IPSR_ISR_Pos*/) /*!< IPSR: ISR Mask */ + + +/** + \brief Union type to access the Special-Purpose Program Status Registers (xPSR). + */ +typedef union +{ + struct + { + uint32_t ISR:9; /*!< bit: 0.. 
8 Exception number */ + uint32_t _reserved0:1; /*!< bit: 9 Reserved */ + uint32_t ICI_IT_1:6; /*!< bit: 10..15 ICI/IT part 1 */ + uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */ + uint32_t _reserved1:4; /*!< bit: 20..23 Reserved */ + uint32_t T:1; /*!< bit: 24 Thumb bit */ + uint32_t ICI_IT_2:2; /*!< bit: 25..26 ICI/IT part 2 */ + uint32_t Q:1; /*!< bit: 27 Saturation condition flag */ + uint32_t V:1; /*!< bit: 28 Overflow condition code flag */ + uint32_t C:1; /*!< bit: 29 Carry condition code flag */ + uint32_t Z:1; /*!< bit: 30 Zero condition code flag */ + uint32_t N:1; /*!< bit: 31 Negative condition code flag */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} xPSR_Type; + +/* xPSR Register Definitions */ +#define xPSR_N_Pos 31U /*!< xPSR: N Position */ +#define xPSR_N_Msk (1UL << xPSR_N_Pos) /*!< xPSR: N Mask */ + +#define xPSR_Z_Pos 30U /*!< xPSR: Z Position */ +#define xPSR_Z_Msk (1UL << xPSR_Z_Pos) /*!< xPSR: Z Mask */ + +#define xPSR_C_Pos 29U /*!< xPSR: C Position */ +#define xPSR_C_Msk (1UL << xPSR_C_Pos) /*!< xPSR: C Mask */ + +#define xPSR_V_Pos 28U /*!< xPSR: V Position */ +#define xPSR_V_Msk (1UL << xPSR_V_Pos) /*!< xPSR: V Mask */ + +#define xPSR_Q_Pos 27U /*!< xPSR: Q Position */ +#define xPSR_Q_Msk (1UL << xPSR_Q_Pos) /*!< xPSR: Q Mask */ + +#define xPSR_ICI_IT_2_Pos 25U /*!< xPSR: ICI/IT part 2 Position */ +#define xPSR_ICI_IT_2_Msk (3UL << xPSR_ICI_IT_2_Pos) /*!< xPSR: ICI/IT part 2 Mask */ + +#define xPSR_T_Pos 24U /*!< xPSR: T Position */ +#define xPSR_T_Msk (1UL << xPSR_T_Pos) /*!< xPSR: T Mask */ + +#define xPSR_GE_Pos 16U /*!< xPSR: GE Position */ +#define xPSR_GE_Msk (0xFUL << xPSR_GE_Pos) /*!< xPSR: GE Mask */ + +#define xPSR_ICI_IT_1_Pos 10U /*!< xPSR: ICI/IT part 1 Position */ +#define xPSR_ICI_IT_1_Msk (0x3FUL << xPSR_ICI_IT_1_Pos) /*!< xPSR: ICI/IT part 1 Mask */ + +#define xPSR_ISR_Pos 0U /*!< xPSR: ISR Position */ +#define xPSR_ISR_Msk (0x1FFUL /*<< xPSR_ISR_Pos*/) /*!< xPSR: ISR Mask */ + + +/** + \brief Union type to access the Control Registers (CONTROL). + */ +typedef union +{ + struct + { + uint32_t nPRIV:1; /*!< bit: 0 Execution privilege in Thread mode */ + uint32_t SPSEL:1; /*!< bit: 1 Stack to be used */ + uint32_t FPCA:1; /*!< bit: 2 FP extension active flag */ + uint32_t _reserved0:29; /*!< bit: 3..31 Reserved */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} CONTROL_Type; + +/* CONTROL Register Definitions */ +#define CONTROL_FPCA_Pos 2U /*!< CONTROL: FPCA Position */ +#define CONTROL_FPCA_Msk (1UL << CONTROL_FPCA_Pos) /*!< CONTROL: FPCA Mask */ + +#define CONTROL_SPSEL_Pos 1U /*!< CONTROL: SPSEL Position */ +#define CONTROL_SPSEL_Msk (1UL << CONTROL_SPSEL_Pos) /*!< CONTROL: SPSEL Mask */ + +#define CONTROL_nPRIV_Pos 0U /*!< CONTROL: nPRIV Position */ +#define CONTROL_nPRIV_Msk (1UL /*<< CONTROL_nPRIV_Pos*/) /*!< CONTROL: nPRIV Mask */ + +/*@} end of group CMSIS_CORE */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_NVIC Nested Vectored Interrupt Controller (NVIC) + \brief Type definitions for the NVIC Registers + @{ + */ + +/** + \brief Structure type to access the Nested Vectored Interrupt Controller (NVIC). 
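+          The registers are normally accessed through the CMSIS NVIC functions
+          defined later in this header. Illustrative sketch; TIM2_IRQn stands
+          in for a device-specific interrupt number from the device header:
+          \code
+          NVIC_SetPriority(TIM2_IRQn, 5U);  // only __NVIC_PRIO_BITS bits are implemented
+          NVIC_EnableIRQ(TIM2_IRQn);        // sets the corresponding ISER bit
+          \endcode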
+ */ +typedef struct +{ + __IOM uint32_t ISER[8U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */ + uint32_t RESERVED0[24U]; + __IOM uint32_t ICER[8U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */ + uint32_t RESERVED1[24U]; + __IOM uint32_t ISPR[8U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */ + uint32_t RESERVED2[24U]; + __IOM uint32_t ICPR[8U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */ + uint32_t RESERVED3[24U]; + __IOM uint32_t IABR[8U]; /*!< Offset: 0x200 (R/W) Interrupt Active bit Register */ + uint32_t RESERVED4[56U]; + __IOM uint8_t IP[240U]; /*!< Offset: 0x300 (R/W) Interrupt Priority Register (8Bit wide) */ + uint32_t RESERVED5[644U]; + __OM uint32_t STIR; /*!< Offset: 0xE00 ( /W) Software Trigger Interrupt Register */ +} NVIC_Type; + +/* Software Triggered Interrupt Register Definitions */ +#define NVIC_STIR_INTID_Pos 0U /*!< STIR: INTLINESNUM Position */ +#define NVIC_STIR_INTID_Msk (0x1FFUL /*<< NVIC_STIR_INTID_Pos*/) /*!< STIR: INTLINESNUM Mask */ + +/*@} end of group CMSIS_NVIC */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_SCB System Control Block (SCB) + \brief Type definitions for the System Control Block Registers + @{ + */ + +/** + \brief Structure type to access the System Control Block (SCB). + */ +typedef struct +{ + __IM uint32_t CPUID; /*!< Offset: 0x000 (R/ ) CPUID Base Register */ + __IOM uint32_t ICSR; /*!< Offset: 0x004 (R/W) Interrupt Control and State Register */ + __IOM uint32_t VTOR; /*!< Offset: 0x008 (R/W) Vector Table Offset Register */ + __IOM uint32_t AIRCR; /*!< Offset: 0x00C (R/W) Application Interrupt and Reset Control Register */ + __IOM uint32_t SCR; /*!< Offset: 0x010 (R/W) System Control Register */ + __IOM uint32_t CCR; /*!< Offset: 0x014 (R/W) Configuration Control Register */ + __IOM uint8_t SHP[12U]; /*!< Offset: 0x018 (R/W) System Handlers Priority Registers (4-7, 8-11, 12-15) */ + __IOM uint32_t SHCSR; /*!< Offset: 0x024 (R/W) System Handler Control and State Register */ + __IOM uint32_t CFSR; /*!< Offset: 0x028 (R/W) Configurable Fault Status Register */ + __IOM uint32_t HFSR; /*!< Offset: 0x02C (R/W) HardFault Status Register */ + __IOM uint32_t DFSR; /*!< Offset: 0x030 (R/W) Debug Fault Status Register */ + __IOM uint32_t MMFAR; /*!< Offset: 0x034 (R/W) MemManage Fault Address Register */ + __IOM uint32_t BFAR; /*!< Offset: 0x038 (R/W) BusFault Address Register */ + __IOM uint32_t AFSR; /*!< Offset: 0x03C (R/W) Auxiliary Fault Status Register */ + __IM uint32_t PFR[2U]; /*!< Offset: 0x040 (R/ ) Processor Feature Register */ + __IM uint32_t DFR; /*!< Offset: 0x048 (R/ ) Debug Feature Register */ + __IM uint32_t ADR; /*!< Offset: 0x04C (R/ ) Auxiliary Feature Register */ + __IM uint32_t MMFR[4U]; /*!< Offset: 0x050 (R/ ) Memory Model Feature Register */ + __IM uint32_t ISAR[5U]; /*!< Offset: 0x060 (R/ ) Instruction Set Attributes Register */ + uint32_t RESERVED0[5U]; + __IOM uint32_t CPACR; /*!< Offset: 0x088 (R/W) Coprocessor Access Control Register */ +} SCB_Type; + +/* SCB CPUID Register Definitions */ +#define SCB_CPUID_IMPLEMENTER_Pos 24U /*!< SCB CPUID: IMPLEMENTER Position */ +#define SCB_CPUID_IMPLEMENTER_Msk (0xFFUL << SCB_CPUID_IMPLEMENTER_Pos) /*!< SCB CPUID: IMPLEMENTER Mask */ + +#define SCB_CPUID_VARIANT_Pos 20U /*!< SCB CPUID: VARIANT Position */ +#define SCB_CPUID_VARIANT_Msk (0xFUL << SCB_CPUID_VARIANT_Pos) /*!< SCB CPUID: VARIANT Mask */ + +#define SCB_CPUID_ARCHITECTURE_Pos 16U /*!< SCB CPUID: ARCHITECTURE Position */ +#define 
SCB_CPUID_ARCHITECTURE_Msk (0xFUL << SCB_CPUID_ARCHITECTURE_Pos) /*!< SCB CPUID: ARCHITECTURE Mask */ + +#define SCB_CPUID_PARTNO_Pos 4U /*!< SCB CPUID: PARTNO Position */ +#define SCB_CPUID_PARTNO_Msk (0xFFFUL << SCB_CPUID_PARTNO_Pos) /*!< SCB CPUID: PARTNO Mask */ + +#define SCB_CPUID_REVISION_Pos 0U /*!< SCB CPUID: REVISION Position */ +#define SCB_CPUID_REVISION_Msk (0xFUL /*<< SCB_CPUID_REVISION_Pos*/) /*!< SCB CPUID: REVISION Mask */ + +/* SCB Interrupt Control State Register Definitions */ +#define SCB_ICSR_NMIPENDSET_Pos 31U /*!< SCB ICSR: NMIPENDSET Position */ +#define SCB_ICSR_NMIPENDSET_Msk (1UL << SCB_ICSR_NMIPENDSET_Pos) /*!< SCB ICSR: NMIPENDSET Mask */ + +#define SCB_ICSR_PENDSVSET_Pos 28U /*!< SCB ICSR: PENDSVSET Position */ +#define SCB_ICSR_PENDSVSET_Msk (1UL << SCB_ICSR_PENDSVSET_Pos) /*!< SCB ICSR: PENDSVSET Mask */ + +#define SCB_ICSR_PENDSVCLR_Pos 27U /*!< SCB ICSR: PENDSVCLR Position */ +#define SCB_ICSR_PENDSVCLR_Msk (1UL << SCB_ICSR_PENDSVCLR_Pos) /*!< SCB ICSR: PENDSVCLR Mask */ + +#define SCB_ICSR_PENDSTSET_Pos 26U /*!< SCB ICSR: PENDSTSET Position */ +#define SCB_ICSR_PENDSTSET_Msk (1UL << SCB_ICSR_PENDSTSET_Pos) /*!< SCB ICSR: PENDSTSET Mask */ + +#define SCB_ICSR_PENDSTCLR_Pos 25U /*!< SCB ICSR: PENDSTCLR Position */ +#define SCB_ICSR_PENDSTCLR_Msk (1UL << SCB_ICSR_PENDSTCLR_Pos) /*!< SCB ICSR: PENDSTCLR Mask */ + +#define SCB_ICSR_ISRPREEMPT_Pos 23U /*!< SCB ICSR: ISRPREEMPT Position */ +#define SCB_ICSR_ISRPREEMPT_Msk (1UL << SCB_ICSR_ISRPREEMPT_Pos) /*!< SCB ICSR: ISRPREEMPT Mask */ + +#define SCB_ICSR_ISRPENDING_Pos 22U /*!< SCB ICSR: ISRPENDING Position */ +#define SCB_ICSR_ISRPENDING_Msk (1UL << SCB_ICSR_ISRPENDING_Pos) /*!< SCB ICSR: ISRPENDING Mask */ + +#define SCB_ICSR_VECTPENDING_Pos 12U /*!< SCB ICSR: VECTPENDING Position */ +#define SCB_ICSR_VECTPENDING_Msk (0x1FFUL << SCB_ICSR_VECTPENDING_Pos) /*!< SCB ICSR: VECTPENDING Mask */ + +#define SCB_ICSR_RETTOBASE_Pos 11U /*!< SCB ICSR: RETTOBASE Position */ +#define SCB_ICSR_RETTOBASE_Msk (1UL << SCB_ICSR_RETTOBASE_Pos) /*!< SCB ICSR: RETTOBASE Mask */ + +#define SCB_ICSR_VECTACTIVE_Pos 0U /*!< SCB ICSR: VECTACTIVE Position */ +#define SCB_ICSR_VECTACTIVE_Msk (0x1FFUL /*<< SCB_ICSR_VECTACTIVE_Pos*/) /*!< SCB ICSR: VECTACTIVE Mask */ + +/* SCB Vector Table Offset Register Definitions */ +#define SCB_VTOR_TBLOFF_Pos 7U /*!< SCB VTOR: TBLOFF Position */ +#define SCB_VTOR_TBLOFF_Msk (0x1FFFFFFUL << SCB_VTOR_TBLOFF_Pos) /*!< SCB VTOR: TBLOFF Mask */ + +/* SCB Application Interrupt and Reset Control Register Definitions */ +#define SCB_AIRCR_VECTKEY_Pos 16U /*!< SCB AIRCR: VECTKEY Position */ +#define SCB_AIRCR_VECTKEY_Msk (0xFFFFUL << SCB_AIRCR_VECTKEY_Pos) /*!< SCB AIRCR: VECTKEY Mask */ + +#define SCB_AIRCR_VECTKEYSTAT_Pos 16U /*!< SCB AIRCR: VECTKEYSTAT Position */ +#define SCB_AIRCR_VECTKEYSTAT_Msk (0xFFFFUL << SCB_AIRCR_VECTKEYSTAT_Pos) /*!< SCB AIRCR: VECTKEYSTAT Mask */ + +#define SCB_AIRCR_ENDIANESS_Pos 15U /*!< SCB AIRCR: ENDIANESS Position */ +#define SCB_AIRCR_ENDIANESS_Msk (1UL << SCB_AIRCR_ENDIANESS_Pos) /*!< SCB AIRCR: ENDIANESS Mask */ + +#define SCB_AIRCR_PRIGROUP_Pos 8U /*!< SCB AIRCR: PRIGROUP Position */ +#define SCB_AIRCR_PRIGROUP_Msk (7UL << SCB_AIRCR_PRIGROUP_Pos) /*!< SCB AIRCR: PRIGROUP Mask */ + +#define SCB_AIRCR_SYSRESETREQ_Pos 2U /*!< SCB AIRCR: SYSRESETREQ Position */ +#define SCB_AIRCR_SYSRESETREQ_Msk (1UL << SCB_AIRCR_SYSRESETREQ_Pos) /*!< SCB AIRCR: SYSRESETREQ Mask */ + +#define SCB_AIRCR_VECTCLRACTIVE_Pos 1U /*!< SCB AIRCR: VECTCLRACTIVE Position */ +#define 
SCB_AIRCR_VECTCLRACTIVE_Msk (1UL << SCB_AIRCR_VECTCLRACTIVE_Pos) /*!< SCB AIRCR: VECTCLRACTIVE Mask */ + +#define SCB_AIRCR_VECTRESET_Pos 0U /*!< SCB AIRCR: VECTRESET Position */ +#define SCB_AIRCR_VECTRESET_Msk (1UL /*<< SCB_AIRCR_VECTRESET_Pos*/) /*!< SCB AIRCR: VECTRESET Mask */ + +/* SCB System Control Register Definitions */ +#define SCB_SCR_SEVONPEND_Pos 4U /*!< SCB SCR: SEVONPEND Position */ +#define SCB_SCR_SEVONPEND_Msk (1UL << SCB_SCR_SEVONPEND_Pos) /*!< SCB SCR: SEVONPEND Mask */ + +#define SCB_SCR_SLEEPDEEP_Pos 2U /*!< SCB SCR: SLEEPDEEP Position */ +#define SCB_SCR_SLEEPDEEP_Msk (1UL << SCB_SCR_SLEEPDEEP_Pos) /*!< SCB SCR: SLEEPDEEP Mask */ + +#define SCB_SCR_SLEEPONEXIT_Pos 1U /*!< SCB SCR: SLEEPONEXIT Position */ +#define SCB_SCR_SLEEPONEXIT_Msk (1UL << SCB_SCR_SLEEPONEXIT_Pos) /*!< SCB SCR: SLEEPONEXIT Mask */ + +/* SCB Configuration Control Register Definitions */ +#define SCB_CCR_STKALIGN_Pos 9U /*!< SCB CCR: STKALIGN Position */ +#define SCB_CCR_STKALIGN_Msk (1UL << SCB_CCR_STKALIGN_Pos) /*!< SCB CCR: STKALIGN Mask */ + +#define SCB_CCR_BFHFNMIGN_Pos 8U /*!< SCB CCR: BFHFNMIGN Position */ +#define SCB_CCR_BFHFNMIGN_Msk (1UL << SCB_CCR_BFHFNMIGN_Pos) /*!< SCB CCR: BFHFNMIGN Mask */ + +#define SCB_CCR_DIV_0_TRP_Pos 4U /*!< SCB CCR: DIV_0_TRP Position */ +#define SCB_CCR_DIV_0_TRP_Msk (1UL << SCB_CCR_DIV_0_TRP_Pos) /*!< SCB CCR: DIV_0_TRP Mask */ + +#define SCB_CCR_UNALIGN_TRP_Pos 3U /*!< SCB CCR: UNALIGN_TRP Position */ +#define SCB_CCR_UNALIGN_TRP_Msk (1UL << SCB_CCR_UNALIGN_TRP_Pos) /*!< SCB CCR: UNALIGN_TRP Mask */ + +#define SCB_CCR_USERSETMPEND_Pos 1U /*!< SCB CCR: USERSETMPEND Position */ +#define SCB_CCR_USERSETMPEND_Msk (1UL << SCB_CCR_USERSETMPEND_Pos) /*!< SCB CCR: USERSETMPEND Mask */ + +#define SCB_CCR_NONBASETHRDENA_Pos 0U /*!< SCB CCR: NONBASETHRDENA Position */ +#define SCB_CCR_NONBASETHRDENA_Msk (1UL /*<< SCB_CCR_NONBASETHRDENA_Pos*/) /*!< SCB CCR: NONBASETHRDENA Mask */ + +/* SCB System Handler Control and State Register Definitions */ +#define SCB_SHCSR_USGFAULTENA_Pos 18U /*!< SCB SHCSR: USGFAULTENA Position */ +#define SCB_SHCSR_USGFAULTENA_Msk (1UL << SCB_SHCSR_USGFAULTENA_Pos) /*!< SCB SHCSR: USGFAULTENA Mask */ + +#define SCB_SHCSR_BUSFAULTENA_Pos 17U /*!< SCB SHCSR: BUSFAULTENA Position */ +#define SCB_SHCSR_BUSFAULTENA_Msk (1UL << SCB_SHCSR_BUSFAULTENA_Pos) /*!< SCB SHCSR: BUSFAULTENA Mask */ + +#define SCB_SHCSR_MEMFAULTENA_Pos 16U /*!< SCB SHCSR: MEMFAULTENA Position */ +#define SCB_SHCSR_MEMFAULTENA_Msk (1UL << SCB_SHCSR_MEMFAULTENA_Pos) /*!< SCB SHCSR: MEMFAULTENA Mask */ + +#define SCB_SHCSR_SVCALLPENDED_Pos 15U /*!< SCB SHCSR: SVCALLPENDED Position */ +#define SCB_SHCSR_SVCALLPENDED_Msk (1UL << SCB_SHCSR_SVCALLPENDED_Pos) /*!< SCB SHCSR: SVCALLPENDED Mask */ + +#define SCB_SHCSR_BUSFAULTPENDED_Pos 14U /*!< SCB SHCSR: BUSFAULTPENDED Position */ +#define SCB_SHCSR_BUSFAULTPENDED_Msk (1UL << SCB_SHCSR_BUSFAULTPENDED_Pos) /*!< SCB SHCSR: BUSFAULTPENDED Mask */ + +#define SCB_SHCSR_MEMFAULTPENDED_Pos 13U /*!< SCB SHCSR: MEMFAULTPENDED Position */ +#define SCB_SHCSR_MEMFAULTPENDED_Msk (1UL << SCB_SHCSR_MEMFAULTPENDED_Pos) /*!< SCB SHCSR: MEMFAULTPENDED Mask */ + +#define SCB_SHCSR_USGFAULTPENDED_Pos 12U /*!< SCB SHCSR: USGFAULTPENDED Position */ +#define SCB_SHCSR_USGFAULTPENDED_Msk (1UL << SCB_SHCSR_USGFAULTPENDED_Pos) /*!< SCB SHCSR: USGFAULTPENDED Mask */ + +#define SCB_SHCSR_SYSTICKACT_Pos 11U /*!< SCB SHCSR: SYSTICKACT Position */ +#define SCB_SHCSR_SYSTICKACT_Msk (1UL << SCB_SHCSR_SYSTICKACT_Pos) /*!< SCB SHCSR: SYSTICKACT Mask */ + +#define 
SCB_SHCSR_PENDSVACT_Pos 10U /*!< SCB SHCSR: PENDSVACT Position */ +#define SCB_SHCSR_PENDSVACT_Msk (1UL << SCB_SHCSR_PENDSVACT_Pos) /*!< SCB SHCSR: PENDSVACT Mask */ + +#define SCB_SHCSR_MONITORACT_Pos 8U /*!< SCB SHCSR: MONITORACT Position */ +#define SCB_SHCSR_MONITORACT_Msk (1UL << SCB_SHCSR_MONITORACT_Pos) /*!< SCB SHCSR: MONITORACT Mask */ + +#define SCB_SHCSR_SVCALLACT_Pos 7U /*!< SCB SHCSR: SVCALLACT Position */ +#define SCB_SHCSR_SVCALLACT_Msk (1UL << SCB_SHCSR_SVCALLACT_Pos) /*!< SCB SHCSR: SVCALLACT Mask */ + +#define SCB_SHCSR_USGFAULTACT_Pos 3U /*!< SCB SHCSR: USGFAULTACT Position */ +#define SCB_SHCSR_USGFAULTACT_Msk (1UL << SCB_SHCSR_USGFAULTACT_Pos) /*!< SCB SHCSR: USGFAULTACT Mask */ + +#define SCB_SHCSR_BUSFAULTACT_Pos 1U /*!< SCB SHCSR: BUSFAULTACT Position */ +#define SCB_SHCSR_BUSFAULTACT_Msk (1UL << SCB_SHCSR_BUSFAULTACT_Pos) /*!< SCB SHCSR: BUSFAULTACT Mask */ + +#define SCB_SHCSR_MEMFAULTACT_Pos 0U /*!< SCB SHCSR: MEMFAULTACT Position */ +#define SCB_SHCSR_MEMFAULTACT_Msk (1UL /*<< SCB_SHCSR_MEMFAULTACT_Pos*/) /*!< SCB SHCSR: MEMFAULTACT Mask */ + +/* SCB Configurable Fault Status Register Definitions */ +#define SCB_CFSR_USGFAULTSR_Pos 16U /*!< SCB CFSR: Usage Fault Status Register Position */ +#define SCB_CFSR_USGFAULTSR_Msk (0xFFFFUL << SCB_CFSR_USGFAULTSR_Pos) /*!< SCB CFSR: Usage Fault Status Register Mask */ + +#define SCB_CFSR_BUSFAULTSR_Pos 8U /*!< SCB CFSR: Bus Fault Status Register Position */ +#define SCB_CFSR_BUSFAULTSR_Msk (0xFFUL << SCB_CFSR_BUSFAULTSR_Pos) /*!< SCB CFSR: Bus Fault Status Register Mask */ + +#define SCB_CFSR_MEMFAULTSR_Pos 0U /*!< SCB CFSR: Memory Manage Fault Status Register Position */ +#define SCB_CFSR_MEMFAULTSR_Msk (0xFFUL /*<< SCB_CFSR_MEMFAULTSR_Pos*/) /*!< SCB CFSR: Memory Manage Fault Status Register Mask */ + +/* MemManage Fault Status Register (part of SCB Configurable Fault Status Register) */ +#define SCB_CFSR_MMARVALID_Pos (SCB_CFSR_MEMFAULTSR_Pos + 7U) /*!< SCB CFSR (MMFSR): MMARVALID Position */ +#define SCB_CFSR_MMARVALID_Msk (1UL << SCB_CFSR_MMARVALID_Pos) /*!< SCB CFSR (MMFSR): MMARVALID Mask */ + +#define SCB_CFSR_MLSPERR_Pos (SCB_CFSR_MEMFAULTSR_Pos + 5U) /*!< SCB CFSR (MMFSR): MLSPERR Position */ +#define SCB_CFSR_MLSPERR_Msk (1UL << SCB_CFSR_MLSPERR_Pos) /*!< SCB CFSR (MMFSR): MLSPERR Mask */ + +#define SCB_CFSR_MSTKERR_Pos (SCB_CFSR_MEMFAULTSR_Pos + 4U) /*!< SCB CFSR (MMFSR): MSTKERR Position */ +#define SCB_CFSR_MSTKERR_Msk (1UL << SCB_CFSR_MSTKERR_Pos) /*!< SCB CFSR (MMFSR): MSTKERR Mask */ + +#define SCB_CFSR_MUNSTKERR_Pos (SCB_CFSR_MEMFAULTSR_Pos + 3U) /*!< SCB CFSR (MMFSR): MUNSTKERR Position */ +#define SCB_CFSR_MUNSTKERR_Msk (1UL << SCB_CFSR_MUNSTKERR_Pos) /*!< SCB CFSR (MMFSR): MUNSTKERR Mask */ + +#define SCB_CFSR_DACCVIOL_Pos (SCB_CFSR_MEMFAULTSR_Pos + 1U) /*!< SCB CFSR (MMFSR): DACCVIOL Position */ +#define SCB_CFSR_DACCVIOL_Msk (1UL << SCB_CFSR_DACCVIOL_Pos) /*!< SCB CFSR (MMFSR): DACCVIOL Mask */ + +#define SCB_CFSR_IACCVIOL_Pos (SCB_CFSR_MEMFAULTSR_Pos + 0U) /*!< SCB CFSR (MMFSR): IACCVIOL Position */ +#define SCB_CFSR_IACCVIOL_Msk (1UL /*<< SCB_CFSR_IACCVIOL_Pos*/) /*!< SCB CFSR (MMFSR): IACCVIOL Mask */ + +/* BusFault Status Register (part of SCB Configurable Fault Status Register) */ +#define SCB_CFSR_BFARVALID_Pos (SCB_CFSR_BUSFAULTSR_Pos + 7U) /*!< SCB CFSR (BFSR): BFARVALID Position */ +#define SCB_CFSR_BFARVALID_Msk (1UL << SCB_CFSR_BFARVALID_Pos) /*!< SCB CFSR (BFSR): BFARVALID Mask */ + +#define SCB_CFSR_LSPERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 5U) /*!< SCB CFSR (BFSR): LSPERR Position */ 
+#define SCB_CFSR_LSPERR_Msk (1UL << SCB_CFSR_LSPERR_Pos) /*!< SCB CFSR (BFSR): LSPERR Mask */ + +#define SCB_CFSR_STKERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 4U) /*!< SCB CFSR (BFSR): STKERR Position */ +#define SCB_CFSR_STKERR_Msk (1UL << SCB_CFSR_STKERR_Pos) /*!< SCB CFSR (BFSR): STKERR Mask */ + +#define SCB_CFSR_UNSTKERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 3U) /*!< SCB CFSR (BFSR): UNSTKERR Position */ +#define SCB_CFSR_UNSTKERR_Msk (1UL << SCB_CFSR_UNSTKERR_Pos) /*!< SCB CFSR (BFSR): UNSTKERR Mask */ + +#define SCB_CFSR_IMPRECISERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 2U) /*!< SCB CFSR (BFSR): IMPRECISERR Position */ +#define SCB_CFSR_IMPRECISERR_Msk (1UL << SCB_CFSR_IMPRECISERR_Pos) /*!< SCB CFSR (BFSR): IMPRECISERR Mask */ + +#define SCB_CFSR_PRECISERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 1U) /*!< SCB CFSR (BFSR): PRECISERR Position */ +#define SCB_CFSR_PRECISERR_Msk (1UL << SCB_CFSR_PRECISERR_Pos) /*!< SCB CFSR (BFSR): PRECISERR Mask */ + +#define SCB_CFSR_IBUSERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 0U) /*!< SCB CFSR (BFSR): IBUSERR Position */ +#define SCB_CFSR_IBUSERR_Msk (1UL << SCB_CFSR_IBUSERR_Pos) /*!< SCB CFSR (BFSR): IBUSERR Mask */ + +/* UsageFault Status Register (part of SCB Configurable Fault Status Register) */ +#define SCB_CFSR_DIVBYZERO_Pos (SCB_CFSR_USGFAULTSR_Pos + 9U) /*!< SCB CFSR (UFSR): DIVBYZERO Position */ +#define SCB_CFSR_DIVBYZERO_Msk (1UL << SCB_CFSR_DIVBYZERO_Pos) /*!< SCB CFSR (UFSR): DIVBYZERO Mask */ + +#define SCB_CFSR_UNALIGNED_Pos (SCB_CFSR_USGFAULTSR_Pos + 8U) /*!< SCB CFSR (UFSR): UNALIGNED Position */ +#define SCB_CFSR_UNALIGNED_Msk (1UL << SCB_CFSR_UNALIGNED_Pos) /*!< SCB CFSR (UFSR): UNALIGNED Mask */ + +#define SCB_CFSR_NOCP_Pos (SCB_CFSR_USGFAULTSR_Pos + 3U) /*!< SCB CFSR (UFSR): NOCP Position */ +#define SCB_CFSR_NOCP_Msk (1UL << SCB_CFSR_NOCP_Pos) /*!< SCB CFSR (UFSR): NOCP Mask */ + +#define SCB_CFSR_INVPC_Pos (SCB_CFSR_USGFAULTSR_Pos + 2U) /*!< SCB CFSR (UFSR): INVPC Position */ +#define SCB_CFSR_INVPC_Msk (1UL << SCB_CFSR_INVPC_Pos) /*!< SCB CFSR (UFSR): INVPC Mask */ + +#define SCB_CFSR_INVSTATE_Pos (SCB_CFSR_USGFAULTSR_Pos + 1U) /*!< SCB CFSR (UFSR): INVSTATE Position */ +#define SCB_CFSR_INVSTATE_Msk (1UL << SCB_CFSR_INVSTATE_Pos) /*!< SCB CFSR (UFSR): INVSTATE Mask */ + +#define SCB_CFSR_UNDEFINSTR_Pos (SCB_CFSR_USGFAULTSR_Pos + 0U) /*!< SCB CFSR (UFSR): UNDEFINSTR Position */ +#define SCB_CFSR_UNDEFINSTR_Msk (1UL << SCB_CFSR_UNDEFINSTR_Pos) /*!< SCB CFSR (UFSR): UNDEFINSTR Mask */ + +/* SCB Hard Fault Status Register Definitions */ +#define SCB_HFSR_DEBUGEVT_Pos 31U /*!< SCB HFSR: DEBUGEVT Position */ +#define SCB_HFSR_DEBUGEVT_Msk (1UL << SCB_HFSR_DEBUGEVT_Pos) /*!< SCB HFSR: DEBUGEVT Mask */ + +#define SCB_HFSR_FORCED_Pos 30U /*!< SCB HFSR: FORCED Position */ +#define SCB_HFSR_FORCED_Msk (1UL << SCB_HFSR_FORCED_Pos) /*!< SCB HFSR: FORCED Mask */ + +#define SCB_HFSR_VECTTBL_Pos 1U /*!< SCB HFSR: VECTTBL Position */ +#define SCB_HFSR_VECTTBL_Msk (1UL << SCB_HFSR_VECTTBL_Pos) /*!< SCB HFSR: VECTTBL Mask */ + +/* SCB Debug Fault Status Register Definitions */ +#define SCB_DFSR_EXTERNAL_Pos 4U /*!< SCB DFSR: EXTERNAL Position */ +#define SCB_DFSR_EXTERNAL_Msk (1UL << SCB_DFSR_EXTERNAL_Pos) /*!< SCB DFSR: EXTERNAL Mask */ + +#define SCB_DFSR_VCATCH_Pos 3U /*!< SCB DFSR: VCATCH Position */ +#define SCB_DFSR_VCATCH_Msk (1UL << SCB_DFSR_VCATCH_Pos) /*!< SCB DFSR: VCATCH Mask */ + +#define SCB_DFSR_DWTTRAP_Pos 2U /*!< SCB DFSR: DWTTRAP Position */ +#define SCB_DFSR_DWTTRAP_Msk (1UL << SCB_DFSR_DWTTRAP_Pos) /*!< SCB DFSR: DWTTRAP Mask */ + +#define SCB_DFSR_BKPT_Pos 
1U /*!< SCB DFSR: BKPT Position */ +#define SCB_DFSR_BKPT_Msk (1UL << SCB_DFSR_BKPT_Pos) /*!< SCB DFSR: BKPT Mask */ + +#define SCB_DFSR_HALTED_Pos 0U /*!< SCB DFSR: HALTED Position */ +#define SCB_DFSR_HALTED_Msk (1UL /*<< SCB_DFSR_HALTED_Pos*/) /*!< SCB DFSR: HALTED Mask */ + +/*@} end of group CMSIS_SCB */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_SCnSCB System Controls not in SCB (SCnSCB) + \brief Type definitions for the System Control and ID Register not in the SCB + @{ + */ + +/** + \brief Structure type to access the System Control and ID Register not in the SCB. + */ +typedef struct +{ + uint32_t RESERVED0[1U]; + __IM uint32_t ICTR; /*!< Offset: 0x004 (R/ ) Interrupt Controller Type Register */ + __IOM uint32_t ACTLR; /*!< Offset: 0x008 (R/W) Auxiliary Control Register */ +} SCnSCB_Type; + +/* Interrupt Controller Type Register Definitions */ +#define SCnSCB_ICTR_INTLINESNUM_Pos 0U /*!< ICTR: INTLINESNUM Position */ +#define SCnSCB_ICTR_INTLINESNUM_Msk (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/) /*!< ICTR: INTLINESNUM Mask */ + +/* Auxiliary Control Register Definitions */ +#define SCnSCB_ACTLR_DISOOFP_Pos 9U /*!< ACTLR: DISOOFP Position */ +#define SCnSCB_ACTLR_DISOOFP_Msk (1UL << SCnSCB_ACTLR_DISOOFP_Pos) /*!< ACTLR: DISOOFP Mask */ + +#define SCnSCB_ACTLR_DISFPCA_Pos 8U /*!< ACTLR: DISFPCA Position */ +#define SCnSCB_ACTLR_DISFPCA_Msk (1UL << SCnSCB_ACTLR_DISFPCA_Pos) /*!< ACTLR: DISFPCA Mask */ + +#define SCnSCB_ACTLR_DISFOLD_Pos 2U /*!< ACTLR: DISFOLD Position */ +#define SCnSCB_ACTLR_DISFOLD_Msk (1UL << SCnSCB_ACTLR_DISFOLD_Pos) /*!< ACTLR: DISFOLD Mask */ + +#define SCnSCB_ACTLR_DISDEFWBUF_Pos 1U /*!< ACTLR: DISDEFWBUF Position */ +#define SCnSCB_ACTLR_DISDEFWBUF_Msk (1UL << SCnSCB_ACTLR_DISDEFWBUF_Pos) /*!< ACTLR: DISDEFWBUF Mask */ + +#define SCnSCB_ACTLR_DISMCYCINT_Pos 0U /*!< ACTLR: DISMCYCINT Position */ +#define SCnSCB_ACTLR_DISMCYCINT_Msk (1UL /*<< SCnSCB_ACTLR_DISMCYCINT_Pos*/) /*!< ACTLR: DISMCYCINT Mask */ + +/*@} end of group CMSIS_SCnotSCB */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_SysTick System Tick Timer (SysTick) + \brief Type definitions for the System Timer Registers. + @{ + */ + +/** + \brief Structure type to access the System Timer (SysTick). 
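+          When __Vendor_SysTickConfig is 0, the CMSIS SysTick_Config() function
+          (defined later in this header) programs LOAD, clears VAL and enables
+          the counter with interrupts. Illustrative sketch, assuming the device
+          provides SystemCoreClock:
+          \code
+          if (SysTick_Config(SystemCoreClock / 1000U) != 0U)  // 1 ms tick
+          {
+            // error: reload value exceeds the 24-bit LOAD field
+          }
+          \endcode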
+ */ +typedef struct +{ + __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) SysTick Control and Status Register */ + __IOM uint32_t LOAD; /*!< Offset: 0x004 (R/W) SysTick Reload Value Register */ + __IOM uint32_t VAL; /*!< Offset: 0x008 (R/W) SysTick Current Value Register */ + __IM uint32_t CALIB; /*!< Offset: 0x00C (R/ ) SysTick Calibration Register */ +} SysTick_Type; + +/* SysTick Control / Status Register Definitions */ +#define SysTick_CTRL_COUNTFLAG_Pos 16U /*!< SysTick CTRL: COUNTFLAG Position */ +#define SysTick_CTRL_COUNTFLAG_Msk (1UL << SysTick_CTRL_COUNTFLAG_Pos) /*!< SysTick CTRL: COUNTFLAG Mask */ + +#define SysTick_CTRL_CLKSOURCE_Pos 2U /*!< SysTick CTRL: CLKSOURCE Position */ +#define SysTick_CTRL_CLKSOURCE_Msk (1UL << SysTick_CTRL_CLKSOURCE_Pos) /*!< SysTick CTRL: CLKSOURCE Mask */ + +#define SysTick_CTRL_TICKINT_Pos 1U /*!< SysTick CTRL: TICKINT Position */ +#define SysTick_CTRL_TICKINT_Msk (1UL << SysTick_CTRL_TICKINT_Pos) /*!< SysTick CTRL: TICKINT Mask */ + +#define SysTick_CTRL_ENABLE_Pos 0U /*!< SysTick CTRL: ENABLE Position */ +#define SysTick_CTRL_ENABLE_Msk (1UL /*<< SysTick_CTRL_ENABLE_Pos*/) /*!< SysTick CTRL: ENABLE Mask */ + +/* SysTick Reload Register Definitions */ +#define SysTick_LOAD_RELOAD_Pos 0U /*!< SysTick LOAD: RELOAD Position */ +#define SysTick_LOAD_RELOAD_Msk (0xFFFFFFUL /*<< SysTick_LOAD_RELOAD_Pos*/) /*!< SysTick LOAD: RELOAD Mask */ + +/* SysTick Current Register Definitions */ +#define SysTick_VAL_CURRENT_Pos 0U /*!< SysTick VAL: CURRENT Position */ +#define SysTick_VAL_CURRENT_Msk (0xFFFFFFUL /*<< SysTick_VAL_CURRENT_Pos*/) /*!< SysTick VAL: CURRENT Mask */ + +/* SysTick Calibration Register Definitions */ +#define SysTick_CALIB_NOREF_Pos 31U /*!< SysTick CALIB: NOREF Position */ +#define SysTick_CALIB_NOREF_Msk (1UL << SysTick_CALIB_NOREF_Pos) /*!< SysTick CALIB: NOREF Mask */ + +#define SysTick_CALIB_SKEW_Pos 30U /*!< SysTick CALIB: SKEW Position */ +#define SysTick_CALIB_SKEW_Msk (1UL << SysTick_CALIB_SKEW_Pos) /*!< SysTick CALIB: SKEW Mask */ + +#define SysTick_CALIB_TENMS_Pos 0U /*!< SysTick CALIB: TENMS Position */ +#define SysTick_CALIB_TENMS_Msk (0xFFFFFFUL /*<< SysTick_CALIB_TENMS_Pos*/) /*!< SysTick CALIB: TENMS Mask */ + +/*@} end of group CMSIS_SysTick */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_ITM Instrumentation Trace Macrocell (ITM) + \brief Type definitions for the Instrumentation Trace Macrocell (ITM) + @{ + */ + +/** + \brief Structure type to access the Instrumentation Trace Macrocell Register (ITM). 
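+          Stimulus port 0 is commonly used for printf-style output through the
+          CMSIS ITM_SendChar() function (defined later in this header); the
+          debugger must enable the ITM and port 0 in TER first. Illustrative
+          sketch; swo_puts is a hypothetical helper:
+          \code
+          static void swo_puts(const char *s)
+          {
+            while (*s != '\0') { (void)ITM_SendChar((uint32_t)*s++); }
+          }
+          \endcode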
+ */ +typedef struct +{ + __OM union + { + __OM uint8_t u8; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 8-bit */ + __OM uint16_t u16; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 16-bit */ + __OM uint32_t u32; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 32-bit */ + } PORT [32U]; /*!< Offset: 0x000 ( /W) ITM Stimulus Port Registers */ + uint32_t RESERVED0[864U]; + __IOM uint32_t TER; /*!< Offset: 0xE00 (R/W) ITM Trace Enable Register */ + uint32_t RESERVED1[15U]; + __IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */ + uint32_t RESERVED2[15U]; + __IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */ + uint32_t RESERVED3[32U]; + uint32_t RESERVED4[43U]; + __OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */ + __IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */ + uint32_t RESERVED5[6U]; + __IM uint32_t PID4; /*!< Offset: 0xFD0 (R/ ) ITM Peripheral Identification Register #4 */ + __IM uint32_t PID5; /*!< Offset: 0xFD4 (R/ ) ITM Peripheral Identification Register #5 */ + __IM uint32_t PID6; /*!< Offset: 0xFD8 (R/ ) ITM Peripheral Identification Register #6 */ + __IM uint32_t PID7; /*!< Offset: 0xFDC (R/ ) ITM Peripheral Identification Register #7 */ + __IM uint32_t PID0; /*!< Offset: 0xFE0 (R/ ) ITM Peripheral Identification Register #0 */ + __IM uint32_t PID1; /*!< Offset: 0xFE4 (R/ ) ITM Peripheral Identification Register #1 */ + __IM uint32_t PID2; /*!< Offset: 0xFE8 (R/ ) ITM Peripheral Identification Register #2 */ + __IM uint32_t PID3; /*!< Offset: 0xFEC (R/ ) ITM Peripheral Identification Register #3 */ + __IM uint32_t CID0; /*!< Offset: 0xFF0 (R/ ) ITM Component Identification Register #0 */ + __IM uint32_t CID1; /*!< Offset: 0xFF4 (R/ ) ITM Component Identification Register #1 */ + __IM uint32_t CID2; /*!< Offset: 0xFF8 (R/ ) ITM Component Identification Register #2 */ + __IM uint32_t CID3; /*!< Offset: 0xFFC (R/ ) ITM Component Identification Register #3 */ +} ITM_Type; + +/* ITM Trace Privilege Register Definitions */ +#define ITM_TPR_PRIVMASK_Pos 0U /*!< ITM TPR: PRIVMASK Position */ +#define ITM_TPR_PRIVMASK_Msk (0xFFFFFFFFUL /*<< ITM_TPR_PRIVMASK_Pos*/) /*!< ITM TPR: PRIVMASK Mask */ + +/* ITM Trace Control Register Definitions */ +#define ITM_TCR_BUSY_Pos 23U /*!< ITM TCR: BUSY Position */ +#define ITM_TCR_BUSY_Msk (1UL << ITM_TCR_BUSY_Pos) /*!< ITM TCR: BUSY Mask */ + +#define ITM_TCR_TraceBusID_Pos 16U /*!< ITM TCR: ATBID Position */ +#define ITM_TCR_TraceBusID_Msk (0x7FUL << ITM_TCR_TraceBusID_Pos) /*!< ITM TCR: ATBID Mask */ + +#define ITM_TCR_GTSFREQ_Pos 10U /*!< ITM TCR: Global timestamp frequency Position */ +#define ITM_TCR_GTSFREQ_Msk (3UL << ITM_TCR_GTSFREQ_Pos) /*!< ITM TCR: Global timestamp frequency Mask */ + +#define ITM_TCR_TSPrescale_Pos 8U /*!< ITM TCR: TSPrescale Position */ +#define ITM_TCR_TSPrescale_Msk (3UL << ITM_TCR_TSPrescale_Pos) /*!< ITM TCR: TSPrescale Mask */ + +#define ITM_TCR_SWOENA_Pos 4U /*!< ITM TCR: SWOENA Position */ +#define ITM_TCR_SWOENA_Msk (1UL << ITM_TCR_SWOENA_Pos) /*!< ITM TCR: SWOENA Mask */ + +#define ITM_TCR_DWTENA_Pos 3U /*!< ITM TCR: DWTENA Position */ +#define ITM_TCR_DWTENA_Msk (1UL << ITM_TCR_DWTENA_Pos) /*!< ITM TCR: DWTENA Mask */ + +#define ITM_TCR_SYNCENA_Pos 2U /*!< ITM TCR: SYNCENA Position */ +#define ITM_TCR_SYNCENA_Msk (1UL << ITM_TCR_SYNCENA_Pos) /*!< ITM TCR: SYNCENA Mask */ + +#define ITM_TCR_TSENA_Pos 1U /*!< ITM TCR: TSENA Position */ +#define ITM_TCR_TSENA_Msk (1UL << ITM_TCR_TSENA_Pos) /*!< ITM TCR: TSENA Mask */ + +#define 
ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */ +#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */ + +/* ITM Lock Status Register Definitions */ +#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */ +#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */ + +#define ITM_LSR_Access_Pos 1U /*!< ITM LSR: Access Position */ +#define ITM_LSR_Access_Msk (1UL << ITM_LSR_Access_Pos) /*!< ITM LSR: Access Mask */ + +#define ITM_LSR_Present_Pos 0U /*!< ITM LSR: Present Position */ +#define ITM_LSR_Present_Msk (1UL /*<< ITM_LSR_Present_Pos*/) /*!< ITM LSR: Present Mask */ + +/*@}*/ /* end of group CMSIS_ITM */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_DWT Data Watchpoint and Trace (DWT) + \brief Type definitions for the Data Watchpoint and Trace (DWT) + @{ + */ + +/** + \brief Structure type to access the Data Watchpoint and Trace Register (DWT). + */ +typedef struct +{ + __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) Control Register */ + __IOM uint32_t CYCCNT; /*!< Offset: 0x004 (R/W) Cycle Count Register */ + __IOM uint32_t CPICNT; /*!< Offset: 0x008 (R/W) CPI Count Register */ + __IOM uint32_t EXCCNT; /*!< Offset: 0x00C (R/W) Exception Overhead Count Register */ + __IOM uint32_t SLEEPCNT; /*!< Offset: 0x010 (R/W) Sleep Count Register */ + __IOM uint32_t LSUCNT; /*!< Offset: 0x014 (R/W) LSU Count Register */ + __IOM uint32_t FOLDCNT; /*!< Offset: 0x018 (R/W) Folded-instruction Count Register */ + __IM uint32_t PCSR; /*!< Offset: 0x01C (R/ ) Program Counter Sample Register */ + __IOM uint32_t COMP0; /*!< Offset: 0x020 (R/W) Comparator Register 0 */ + __IOM uint32_t MASK0; /*!< Offset: 0x024 (R/W) Mask Register 0 */ + __IOM uint32_t FUNCTION0; /*!< Offset: 0x028 (R/W) Function Register 0 */ + uint32_t RESERVED0[1U]; + __IOM uint32_t COMP1; /*!< Offset: 0x030 (R/W) Comparator Register 1 */ + __IOM uint32_t MASK1; /*!< Offset: 0x034 (R/W) Mask Register 1 */ + __IOM uint32_t FUNCTION1; /*!< Offset: 0x038 (R/W) Function Register 1 */ + uint32_t RESERVED1[1U]; + __IOM uint32_t COMP2; /*!< Offset: 0x040 (R/W) Comparator Register 2 */ + __IOM uint32_t MASK2; /*!< Offset: 0x044 (R/W) Mask Register 2 */ + __IOM uint32_t FUNCTION2; /*!< Offset: 0x048 (R/W) Function Register 2 */ + uint32_t RESERVED2[1U]; + __IOM uint32_t COMP3; /*!< Offset: 0x050 (R/W) Comparator Register 3 */ + __IOM uint32_t MASK3; /*!< Offset: 0x054 (R/W) Mask Register 3 */ + __IOM uint32_t FUNCTION3; /*!< Offset: 0x058 (R/W) Function Register 3 */ +} DWT_Type; + +/* DWT Control Register Definitions */ +#define DWT_CTRL_NUMCOMP_Pos 28U /*!< DWT CTRL: NUMCOMP Position */ +#define DWT_CTRL_NUMCOMP_Msk (0xFUL << DWT_CTRL_NUMCOMP_Pos) /*!< DWT CTRL: NUMCOMP Mask */ + +#define DWT_CTRL_NOTRCPKT_Pos 27U /*!< DWT CTRL: NOTRCPKT Position */ +#define DWT_CTRL_NOTRCPKT_Msk (0x1UL << DWT_CTRL_NOTRCPKT_Pos) /*!< DWT CTRL: NOTRCPKT Mask */ + +#define DWT_CTRL_NOEXTTRIG_Pos 26U /*!< DWT CTRL: NOEXTTRIG Position */ +#define DWT_CTRL_NOEXTTRIG_Msk (0x1UL << DWT_CTRL_NOEXTTRIG_Pos) /*!< DWT CTRL: NOEXTTRIG Mask */ + +#define DWT_CTRL_NOCYCCNT_Pos 25U /*!< DWT CTRL: NOCYCCNT Position */ +#define DWT_CTRL_NOCYCCNT_Msk (0x1UL << DWT_CTRL_NOCYCCNT_Pos) /*!< DWT CTRL: NOCYCCNT Mask */ + +#define DWT_CTRL_NOPRFCNT_Pos 24U /*!< DWT CTRL: NOPRFCNT Position */ +#define DWT_CTRL_NOPRFCNT_Msk (0x1UL << DWT_CTRL_NOPRFCNT_Pos) /*!< DWT CTRL: NOPRFCNT Mask */ + +#define DWT_CTRL_CYCEVTENA_Pos 22U /*!< DWT CTRL: CYCEVTENA Position */ +#define 
DWT_CTRL_CYCEVTENA_Msk (0x1UL << DWT_CTRL_CYCEVTENA_Pos) /*!< DWT CTRL: CYCEVTENA Mask */ + +#define DWT_CTRL_FOLDEVTENA_Pos 21U /*!< DWT CTRL: FOLDEVTENA Position */ +#define DWT_CTRL_FOLDEVTENA_Msk (0x1UL << DWT_CTRL_FOLDEVTENA_Pos) /*!< DWT CTRL: FOLDEVTENA Mask */ + +#define DWT_CTRL_LSUEVTENA_Pos 20U /*!< DWT CTRL: LSUEVTENA Position */ +#define DWT_CTRL_LSUEVTENA_Msk (0x1UL << DWT_CTRL_LSUEVTENA_Pos) /*!< DWT CTRL: LSUEVTENA Mask */ + +#define DWT_CTRL_SLEEPEVTENA_Pos 19U /*!< DWT CTRL: SLEEPEVTENA Position */ +#define DWT_CTRL_SLEEPEVTENA_Msk (0x1UL << DWT_CTRL_SLEEPEVTENA_Pos) /*!< DWT CTRL: SLEEPEVTENA Mask */ + +#define DWT_CTRL_EXCEVTENA_Pos 18U /*!< DWT CTRL: EXCEVTENA Position */ +#define DWT_CTRL_EXCEVTENA_Msk (0x1UL << DWT_CTRL_EXCEVTENA_Pos) /*!< DWT CTRL: EXCEVTENA Mask */ + +#define DWT_CTRL_CPIEVTENA_Pos 17U /*!< DWT CTRL: CPIEVTENA Position */ +#define DWT_CTRL_CPIEVTENA_Msk (0x1UL << DWT_CTRL_CPIEVTENA_Pos) /*!< DWT CTRL: CPIEVTENA Mask */ + +#define DWT_CTRL_EXCTRCENA_Pos 16U /*!< DWT CTRL: EXCTRCENA Position */ +#define DWT_CTRL_EXCTRCENA_Msk (0x1UL << DWT_CTRL_EXCTRCENA_Pos) /*!< DWT CTRL: EXCTRCENA Mask */ + +#define DWT_CTRL_PCSAMPLENA_Pos 12U /*!< DWT CTRL: PCSAMPLENA Position */ +#define DWT_CTRL_PCSAMPLENA_Msk (0x1UL << DWT_CTRL_PCSAMPLENA_Pos) /*!< DWT CTRL: PCSAMPLENA Mask */ + +#define DWT_CTRL_SYNCTAP_Pos 10U /*!< DWT CTRL: SYNCTAP Position */ +#define DWT_CTRL_SYNCTAP_Msk (0x3UL << DWT_CTRL_SYNCTAP_Pos) /*!< DWT CTRL: SYNCTAP Mask */ + +#define DWT_CTRL_CYCTAP_Pos 9U /*!< DWT CTRL: CYCTAP Position */ +#define DWT_CTRL_CYCTAP_Msk (0x1UL << DWT_CTRL_CYCTAP_Pos) /*!< DWT CTRL: CYCTAP Mask */ + +#define DWT_CTRL_POSTINIT_Pos 5U /*!< DWT CTRL: POSTINIT Position */ +#define DWT_CTRL_POSTINIT_Msk (0xFUL << DWT_CTRL_POSTINIT_Pos) /*!< DWT CTRL: POSTINIT Mask */ + +#define DWT_CTRL_POSTPRESET_Pos 1U /*!< DWT CTRL: POSTPRESET Position */ +#define DWT_CTRL_POSTPRESET_Msk (0xFUL << DWT_CTRL_POSTPRESET_Pos) /*!< DWT CTRL: POSTPRESET Mask */ + +#define DWT_CTRL_CYCCNTENA_Pos 0U /*!< DWT CTRL: CYCCNTENA Position */ +#define DWT_CTRL_CYCCNTENA_Msk (0x1UL /*<< DWT_CTRL_CYCCNTENA_Pos*/) /*!< DWT CTRL: CYCCNTENA Mask */ + +/* DWT CPI Count Register Definitions */ +#define DWT_CPICNT_CPICNT_Pos 0U /*!< DWT CPICNT: CPICNT Position */ +#define DWT_CPICNT_CPICNT_Msk (0xFFUL /*<< DWT_CPICNT_CPICNT_Pos*/) /*!< DWT CPICNT: CPICNT Mask */ + +/* DWT Exception Overhead Count Register Definitions */ +#define DWT_EXCCNT_EXCCNT_Pos 0U /*!< DWT EXCCNT: EXCCNT Position */ +#define DWT_EXCCNT_EXCCNT_Msk (0xFFUL /*<< DWT_EXCCNT_EXCCNT_Pos*/) /*!< DWT EXCCNT: EXCCNT Mask */ + +/* DWT Sleep Count Register Definitions */ +#define DWT_SLEEPCNT_SLEEPCNT_Pos 0U /*!< DWT SLEEPCNT: SLEEPCNT Position */ +#define DWT_SLEEPCNT_SLEEPCNT_Msk (0xFFUL /*<< DWT_SLEEPCNT_SLEEPCNT_Pos*/) /*!< DWT SLEEPCNT: SLEEPCNT Mask */ + +/* DWT LSU Count Register Definitions */ +#define DWT_LSUCNT_LSUCNT_Pos 0U /*!< DWT LSUCNT: LSUCNT Position */ +#define DWT_LSUCNT_LSUCNT_Msk (0xFFUL /*<< DWT_LSUCNT_LSUCNT_Pos*/) /*!< DWT LSUCNT: LSUCNT Mask */ + +/* DWT Folded-instruction Count Register Definitions */ +#define DWT_FOLDCNT_FOLDCNT_Pos 0U /*!< DWT FOLDCNT: FOLDCNT Position */ +#define DWT_FOLDCNT_FOLDCNT_Msk (0xFFUL /*<< DWT_FOLDCNT_FOLDCNT_Pos*/) /*!< DWT FOLDCNT: FOLDCNT Mask */ + +/* DWT Comparator Mask Register Definitions */ +#define DWT_MASK_MASK_Pos 0U /*!< DWT MASK: MASK Position */ +#define DWT_MASK_MASK_Msk (0x1FUL /*<< DWT_MASK_MASK_Pos*/) /*!< DWT MASK: MASK Mask */ + +/* DWT Comparator Function Register 
Definitions */ +#define DWT_FUNCTION_MATCHED_Pos 24U /*!< DWT FUNCTION: MATCHED Position */ +#define DWT_FUNCTION_MATCHED_Msk (0x1UL << DWT_FUNCTION_MATCHED_Pos) /*!< DWT FUNCTION: MATCHED Mask */ + +#define DWT_FUNCTION_DATAVADDR1_Pos 16U /*!< DWT FUNCTION: DATAVADDR1 Position */ +#define DWT_FUNCTION_DATAVADDR1_Msk (0xFUL << DWT_FUNCTION_DATAVADDR1_Pos) /*!< DWT FUNCTION: DATAVADDR1 Mask */ + +#define DWT_FUNCTION_DATAVADDR0_Pos 12U /*!< DWT FUNCTION: DATAVADDR0 Position */ +#define DWT_FUNCTION_DATAVADDR0_Msk (0xFUL << DWT_FUNCTION_DATAVADDR0_Pos) /*!< DWT FUNCTION: DATAVADDR0 Mask */ + +#define DWT_FUNCTION_DATAVSIZE_Pos 10U /*!< DWT FUNCTION: DATAVSIZE Position */ +#define DWT_FUNCTION_DATAVSIZE_Msk (0x3UL << DWT_FUNCTION_DATAVSIZE_Pos) /*!< DWT FUNCTION: DATAVSIZE Mask */ + +#define DWT_FUNCTION_LNK1ENA_Pos 9U /*!< DWT FUNCTION: LNK1ENA Position */ +#define DWT_FUNCTION_LNK1ENA_Msk (0x1UL << DWT_FUNCTION_LNK1ENA_Pos) /*!< DWT FUNCTION: LNK1ENA Mask */ + +#define DWT_FUNCTION_DATAVMATCH_Pos 8U /*!< DWT FUNCTION: DATAVMATCH Position */ +#define DWT_FUNCTION_DATAVMATCH_Msk (0x1UL << DWT_FUNCTION_DATAVMATCH_Pos) /*!< DWT FUNCTION: DATAVMATCH Mask */ + +#define DWT_FUNCTION_CYCMATCH_Pos 7U /*!< DWT FUNCTION: CYCMATCH Position */ +#define DWT_FUNCTION_CYCMATCH_Msk (0x1UL << DWT_FUNCTION_CYCMATCH_Pos) /*!< DWT FUNCTION: CYCMATCH Mask */ + +#define DWT_FUNCTION_EMITRANGE_Pos 5U /*!< DWT FUNCTION: EMITRANGE Position */ +#define DWT_FUNCTION_EMITRANGE_Msk (0x1UL << DWT_FUNCTION_EMITRANGE_Pos) /*!< DWT FUNCTION: EMITRANGE Mask */ + +#define DWT_FUNCTION_FUNCTION_Pos 0U /*!< DWT FUNCTION: FUNCTION Position */ +#define DWT_FUNCTION_FUNCTION_Msk (0xFUL /*<< DWT_FUNCTION_FUNCTION_Pos*/) /*!< DWT FUNCTION: FUNCTION Mask */ + +/*@}*/ /* end of group CMSIS_DWT */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_TPI Trace Port Interface (TPI) + \brief Type definitions for the Trace Port Interface (TPI) + @{ + */ + +/** + \brief Structure type to access the Trace Port Interface Register (TPI). 
+ */ +typedef struct +{ + __IM uint32_t SSPSR; /*!< Offset: 0x000 (R/ ) Supported Parallel Port Size Register */ + __IOM uint32_t CSPSR; /*!< Offset: 0x004 (R/W) Current Parallel Port Size Register */ + uint32_t RESERVED0[2U]; + __IOM uint32_t ACPR; /*!< Offset: 0x010 (R/W) Asynchronous Clock Prescaler Register */ + uint32_t RESERVED1[55U]; + __IOM uint32_t SPPR; /*!< Offset: 0x0F0 (R/W) Selected Pin Protocol Register */ + uint32_t RESERVED2[131U]; + __IM uint32_t FFSR; /*!< Offset: 0x300 (R/ ) Formatter and Flush Status Register */ + __IOM uint32_t FFCR; /*!< Offset: 0x304 (R/W) Formatter and Flush Control Register */ + __IM uint32_t FSCR; /*!< Offset: 0x308 (R/ ) Formatter Synchronization Counter Register */ + uint32_t RESERVED3[759U]; + __IM uint32_t TRIGGER; /*!< Offset: 0xEE8 (R/ ) TRIGGER Register */ + __IM uint32_t FIFO0; /*!< Offset: 0xEEC (R/ ) Integration ETM Data */ + __IM uint32_t ITATBCTR2; /*!< Offset: 0xEF0 (R/ ) ITATBCTR2 */ + uint32_t RESERVED4[1U]; + __IM uint32_t ITATBCTR0; /*!< Offset: 0xEF8 (R/ ) ITATBCTR0 */ + __IM uint32_t FIFO1; /*!< Offset: 0xEFC (R/ ) Integration ITM Data */ + __IOM uint32_t ITCTRL; /*!< Offset: 0xF00 (R/W) Integration Mode Control */ + uint32_t RESERVED5[39U]; + __IOM uint32_t CLAIMSET; /*!< Offset: 0xFA0 (R/W) Claim tag set */ + __IOM uint32_t CLAIMCLR; /*!< Offset: 0xFA4 (R/W) Claim tag clear */ + uint32_t RESERVED7[8U]; + __IM uint32_t DEVID; /*!< Offset: 0xFC8 (R/ ) TPIU_DEVID */ + __IM uint32_t DEVTYPE; /*!< Offset: 0xFCC (R/ ) TPIU_DEVTYPE */ +} TPI_Type; + +/* TPI Asynchronous Clock Prescaler Register Definitions */ +#define TPI_ACPR_PRESCALER_Pos 0U /*!< TPI ACPR: PRESCALER Position */ +#define TPI_ACPR_PRESCALER_Msk (0x1FFFUL /*<< TPI_ACPR_PRESCALER_Pos*/) /*!< TPI ACPR: PRESCALER Mask */ + +/* TPI Selected Pin Protocol Register Definitions */ +#define TPI_SPPR_TXMODE_Pos 0U /*!< TPI SPPR: TXMODE Position */ +#define TPI_SPPR_TXMODE_Msk (0x3UL /*<< TPI_SPPR_TXMODE_Pos*/) /*!< TPI SPPR: TXMODE Mask */ + +/* TPI Formatter and Flush Status Register Definitions */ +#define TPI_FFSR_FtNonStop_Pos 3U /*!< TPI FFSR: FtNonStop Position */ +#define TPI_FFSR_FtNonStop_Msk (0x1UL << TPI_FFSR_FtNonStop_Pos) /*!< TPI FFSR: FtNonStop Mask */ + +#define TPI_FFSR_TCPresent_Pos 2U /*!< TPI FFSR: TCPresent Position */ +#define TPI_FFSR_TCPresent_Msk (0x1UL << TPI_FFSR_TCPresent_Pos) /*!< TPI FFSR: TCPresent Mask */ + +#define TPI_FFSR_FtStopped_Pos 1U /*!< TPI FFSR: FtStopped Position */ +#define TPI_FFSR_FtStopped_Msk (0x1UL << TPI_FFSR_FtStopped_Pos) /*!< TPI FFSR: FtStopped Mask */ + +#define TPI_FFSR_FlInProg_Pos 0U /*!< TPI FFSR: FlInProg Position */ +#define TPI_FFSR_FlInProg_Msk (0x1UL /*<< TPI_FFSR_FlInProg_Pos*/) /*!< TPI FFSR: FlInProg Mask */ + +/* TPI Formatter and Flush Control Register Definitions */ +#define TPI_FFCR_TrigIn_Pos 8U /*!< TPI FFCR: TrigIn Position */ +#define TPI_FFCR_TrigIn_Msk (0x1UL << TPI_FFCR_TrigIn_Pos) /*!< TPI FFCR: TrigIn Mask */ + +#define TPI_FFCR_EnFCont_Pos 1U /*!< TPI FFCR: EnFCont Position */ +#define TPI_FFCR_EnFCont_Msk (0x1UL << TPI_FFCR_EnFCont_Pos) /*!< TPI FFCR: EnFCont Mask */ + +/* TPI TRIGGER Register Definitions */ +#define TPI_TRIGGER_TRIGGER_Pos 0U /*!< TPI TRIGGER: TRIGGER Position */ +#define TPI_TRIGGER_TRIGGER_Msk (0x1UL /*<< TPI_TRIGGER_TRIGGER_Pos*/) /*!< TPI TRIGGER: TRIGGER Mask */ + +/* TPI Integration ETM Data Register Definitions (FIFO0) */ +#define TPI_FIFO0_ITM_ATVALID_Pos 29U /*!< TPI FIFO0: ITM_ATVALID Position */ +#define TPI_FIFO0_ITM_ATVALID_Msk (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos) 
/*!< TPI FIFO0: ITM_ATVALID Mask */ + +#define TPI_FIFO0_ITM_bytecount_Pos 27U /*!< TPI FIFO0: ITM_bytecount Position */ +#define TPI_FIFO0_ITM_bytecount_Msk (0x3UL << TPI_FIFO0_ITM_bytecount_Pos) /*!< TPI FIFO0: ITM_bytecount Mask */ + +#define TPI_FIFO0_ETM_ATVALID_Pos 26U /*!< TPI FIFO0: ETM_ATVALID Position */ +#define TPI_FIFO0_ETM_ATVALID_Msk (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */ + +#define TPI_FIFO0_ETM_bytecount_Pos 24U /*!< TPI FIFO0: ETM_bytecount Position */ +#define TPI_FIFO0_ETM_bytecount_Msk (0x3UL << TPI_FIFO0_ETM_bytecount_Pos) /*!< TPI FIFO0: ETM_bytecount Mask */ + +#define TPI_FIFO0_ETM2_Pos 16U /*!< TPI FIFO0: ETM2 Position */ +#define TPI_FIFO0_ETM2_Msk (0xFFUL << TPI_FIFO0_ETM2_Pos) /*!< TPI FIFO0: ETM2 Mask */ + +#define TPI_FIFO0_ETM1_Pos 8U /*!< TPI FIFO0: ETM1 Position */ +#define TPI_FIFO0_ETM1_Msk (0xFFUL << TPI_FIFO0_ETM1_Pos) /*!< TPI FIFO0: ETM1 Mask */ + +#define TPI_FIFO0_ETM0_Pos 0U /*!< TPI FIFO0: ETM0 Position */ +#define TPI_FIFO0_ETM0_Msk (0xFFUL /*<< TPI_FIFO0_ETM0_Pos*/) /*!< TPI FIFO0: ETM0 Mask */ + +/* TPI ITATBCTR2 Register Definitions */ +#define TPI_ITATBCTR2_ATREADY2_Pos 0U /*!< TPI ITATBCTR2: ATREADY2 Position */ +#define TPI_ITATBCTR2_ATREADY2_Msk (0x1UL /*<< TPI_ITATBCTR2_ATREADY2_Pos*/) /*!< TPI ITATBCTR2: ATREADY2 Mask */ + +#define TPI_ITATBCTR2_ATREADY1_Pos 0U /*!< TPI ITATBCTR2: ATREADY1 Position */ +#define TPI_ITATBCTR2_ATREADY1_Msk (0x1UL /*<< TPI_ITATBCTR2_ATREADY1_Pos*/) /*!< TPI ITATBCTR2: ATREADY1 Mask */ + +/* TPI Integration ITM Data Register Definitions (FIFO1) */ +#define TPI_FIFO1_ITM_ATVALID_Pos 29U /*!< TPI FIFO1: ITM_ATVALID Position */ +#define TPI_FIFO1_ITM_ATVALID_Msk (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */ + +#define TPI_FIFO1_ITM_bytecount_Pos 27U /*!< TPI FIFO1: ITM_bytecount Position */ +#define TPI_FIFO1_ITM_bytecount_Msk (0x3UL << TPI_FIFO1_ITM_bytecount_Pos) /*!< TPI FIFO1: ITM_bytecount Mask */ + +#define TPI_FIFO1_ETM_ATVALID_Pos 26U /*!< TPI FIFO1: ETM_ATVALID Position */ +#define TPI_FIFO1_ETM_ATVALID_Msk (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */ + +#define TPI_FIFO1_ETM_bytecount_Pos 24U /*!< TPI FIFO1: ETM_bytecount Position */ +#define TPI_FIFO1_ETM_bytecount_Msk (0x3UL << TPI_FIFO1_ETM_bytecount_Pos) /*!< TPI FIFO1: ETM_bytecount Mask */ + +#define TPI_FIFO1_ITM2_Pos 16U /*!< TPI FIFO1: ITM2 Position */ +#define TPI_FIFO1_ITM2_Msk (0xFFUL << TPI_FIFO1_ITM2_Pos) /*!< TPI FIFO1: ITM2 Mask */ + +#define TPI_FIFO1_ITM1_Pos 8U /*!< TPI FIFO1: ITM1 Position */ +#define TPI_FIFO1_ITM1_Msk (0xFFUL << TPI_FIFO1_ITM1_Pos) /*!< TPI FIFO1: ITM1 Mask */ + +#define TPI_FIFO1_ITM0_Pos 0U /*!< TPI FIFO1: ITM0 Position */ +#define TPI_FIFO1_ITM0_Msk (0xFFUL /*<< TPI_FIFO1_ITM0_Pos*/) /*!< TPI FIFO1: ITM0 Mask */ + +/* TPI ITATBCTR0 Register Definitions */ +#define TPI_ITATBCTR0_ATREADY2_Pos 0U /*!< TPI ITATBCTR0: ATREADY2 Position */ +#define TPI_ITATBCTR0_ATREADY2_Msk (0x1UL /*<< TPI_ITATBCTR0_ATREADY2_Pos*/) /*!< TPI ITATBCTR0: ATREADY2 Mask */ + +#define TPI_ITATBCTR0_ATREADY1_Pos 0U /*!< TPI ITATBCTR0: ATREADY1 Position */ +#define TPI_ITATBCTR0_ATREADY1_Msk (0x1UL /*<< TPI_ITATBCTR0_ATREADY1_Pos*/) /*!< TPI ITATBCTR0: ATREADY1 Mask */ + +/* TPI Integration Mode Control Register Definitions */ +#define TPI_ITCTRL_Mode_Pos 0U /*!< TPI ITCTRL: Mode Position */ +#define TPI_ITCTRL_Mode_Msk (0x3UL /*<< TPI_ITCTRL_Mode_Pos*/) /*!< TPI ITCTRL: Mode Mask */ + +/* TPI DEVID Register Definitions */ +#define 
TPI_DEVID_NRZVALID_Pos 11U /*!< TPI DEVID: NRZVALID Position */ +#define TPI_DEVID_NRZVALID_Msk (0x1UL << TPI_DEVID_NRZVALID_Pos) /*!< TPI DEVID: NRZVALID Mask */ + +#define TPI_DEVID_MANCVALID_Pos 10U /*!< TPI DEVID: MANCVALID Position */ +#define TPI_DEVID_MANCVALID_Msk (0x1UL << TPI_DEVID_MANCVALID_Pos) /*!< TPI DEVID: MANCVALID Mask */ + +#define TPI_DEVID_PTINVALID_Pos 9U /*!< TPI DEVID: PTINVALID Position */ +#define TPI_DEVID_PTINVALID_Msk (0x1UL << TPI_DEVID_PTINVALID_Pos) /*!< TPI DEVID: PTINVALID Mask */ + +#define TPI_DEVID_MinBufSz_Pos 6U /*!< TPI DEVID: MinBufSz Position */ +#define TPI_DEVID_MinBufSz_Msk (0x7UL << TPI_DEVID_MinBufSz_Pos) /*!< TPI DEVID: MinBufSz Mask */ + +#define TPI_DEVID_AsynClkIn_Pos 5U /*!< TPI DEVID: AsynClkIn Position */ +#define TPI_DEVID_AsynClkIn_Msk (0x1UL << TPI_DEVID_AsynClkIn_Pos) /*!< TPI DEVID: AsynClkIn Mask */ + +#define TPI_DEVID_NrTraceInput_Pos 0U /*!< TPI DEVID: NrTraceInput Position */ +#define TPI_DEVID_NrTraceInput_Msk (0x1FUL /*<< TPI_DEVID_NrTraceInput_Pos*/) /*!< TPI DEVID: NrTraceInput Mask */ + +/* TPI DEVTYPE Register Definitions */ +#define TPI_DEVTYPE_SubType_Pos 4U /*!< TPI DEVTYPE: SubType Position */ +#define TPI_DEVTYPE_SubType_Msk (0xFUL /*<< TPI_DEVTYPE_SubType_Pos*/) /*!< TPI DEVTYPE: SubType Mask */ + +#define TPI_DEVTYPE_MajorType_Pos 0U /*!< TPI DEVTYPE: MajorType Position */ +#define TPI_DEVTYPE_MajorType_Msk (0xFUL << TPI_DEVTYPE_MajorType_Pos) /*!< TPI DEVTYPE: MajorType Mask */ + +/*@}*/ /* end of group CMSIS_TPI */ + + +#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U) +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_MPU Memory Protection Unit (MPU) + \brief Type definitions for the Memory Protection Unit (MPU) + @{ + */ + +/** + \brief Structure type to access the Memory Protection Unit (MPU). 
+ */
+typedef struct
+{
+  __IM  uint32_t TYPE;                   /*!< Offset: 0x000 (R/ )  MPU Type Register */
+  __IOM uint32_t CTRL;                   /*!< Offset: 0x004 (R/W)  MPU Control Register */
+  __IOM uint32_t RNR;                    /*!< Offset: 0x008 (R/W)  MPU Region Number Register */
+  __IOM uint32_t RBAR;                   /*!< Offset: 0x00C (R/W)  MPU Region Base Address Register */
+  __IOM uint32_t RASR;                   /*!< Offset: 0x010 (R/W)  MPU Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A1;                /*!< Offset: 0x014 (R/W)  MPU Alias 1 Region Base Address Register */
+  __IOM uint32_t RASR_A1;                /*!< Offset: 0x018 (R/W)  MPU Alias 1 Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A2;                /*!< Offset: 0x01C (R/W)  MPU Alias 2 Region Base Address Register */
+  __IOM uint32_t RASR_A2;                /*!< Offset: 0x020 (R/W)  MPU Alias 2 Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A3;                /*!< Offset: 0x024 (R/W)  MPU Alias 3 Region Base Address Register */
+  __IOM uint32_t RASR_A3;                /*!< Offset: 0x028 (R/W)  MPU Alias 3 Region Attribute and Size Register */
+} MPU_Type;
+
+#define MPU_TYPE_RALIASES                  4U
+
+/* MPU Type Register Definitions */
+#define MPU_TYPE_IREGION_Pos              16U                                            /*!< MPU TYPE: IREGION Position */
+#define MPU_TYPE_IREGION_Msk              (0xFFUL << MPU_TYPE_IREGION_Pos)               /*!< MPU TYPE: IREGION Mask */
+
+#define MPU_TYPE_DREGION_Pos               8U                                            /*!< MPU TYPE: DREGION Position */
+#define MPU_TYPE_DREGION_Msk              (0xFFUL << MPU_TYPE_DREGION_Pos)               /*!< MPU TYPE: DREGION Mask */
+
+#define MPU_TYPE_SEPARATE_Pos              0U                                            /*!< MPU TYPE: SEPARATE Position */
+#define MPU_TYPE_SEPARATE_Msk             (1UL /*<< MPU_TYPE_SEPARATE_Pos*/)             /*!< MPU TYPE: SEPARATE Mask */
+
+/* MPU Control Register Definitions */
+#define MPU_CTRL_PRIVDEFENA_Pos            2U                                            /*!< MPU CTRL: PRIVDEFENA Position */
+#define MPU_CTRL_PRIVDEFENA_Msk           (1UL << MPU_CTRL_PRIVDEFENA_Pos)               /*!< MPU CTRL: PRIVDEFENA Mask */
+
+#define MPU_CTRL_HFNMIENA_Pos              1U                                            /*!< MPU CTRL: HFNMIENA Position */
+#define MPU_CTRL_HFNMIENA_Msk             (1UL << MPU_CTRL_HFNMIENA_Pos)                 /*!< MPU CTRL: HFNMIENA Mask */
+
+#define MPU_CTRL_ENABLE_Pos                0U                                            /*!< MPU CTRL: ENABLE Position */
+#define MPU_CTRL_ENABLE_Msk               (1UL /*<< MPU_CTRL_ENABLE_Pos*/)               /*!< MPU CTRL: ENABLE Mask */
+
+/* MPU Region Number Register Definitions */
+#define MPU_RNR_REGION_Pos                 0U                                            /*!< MPU RNR: REGION Position */
+#define MPU_RNR_REGION_Msk                (0xFFUL /*<< MPU_RNR_REGION_Pos*/)             /*!< MPU RNR: REGION Mask */
+
+/* MPU Region Base Address Register Definitions */
+#define MPU_RBAR_ADDR_Pos                  5U                                            /*!< MPU RBAR: ADDR Position */
+#define MPU_RBAR_ADDR_Msk                 (0x7FFFFFFUL << MPU_RBAR_ADDR_Pos)             /*!< MPU RBAR: ADDR Mask */
+
+#define MPU_RBAR_VALID_Pos                 4U                                            /*!< MPU RBAR: VALID Position */
+#define MPU_RBAR_VALID_Msk                (1UL << MPU_RBAR_VALID_Pos)                    /*!< MPU RBAR: VALID Mask */
+
+#define MPU_RBAR_REGION_Pos                0U                                            /*!< MPU RBAR: REGION Position */
+#define MPU_RBAR_REGION_Msk               (0xFUL /*<< MPU_RBAR_REGION_Pos*/)             /*!< MPU RBAR: REGION Mask */
+
+/* MPU Region Attribute and Size Register Definitions */
+#define MPU_RASR_ATTRS_Pos                16U                                            /*!< MPU RASR: MPU Region Attribute field Position */
+#define MPU_RASR_ATTRS_Msk                (0xFFFFUL << MPU_RASR_ATTRS_Pos)               /*!< MPU RASR: MPU Region Attribute field Mask */
+
+#define MPU_RASR_XN_Pos                   28U                                            /*!< MPU RASR: ATTRS.XN Position */
+#define MPU_RASR_XN_Msk                   (1UL << MPU_RASR_XN_Pos)                       /*!< MPU RASR: ATTRS.XN Mask */
+
+#define MPU_RASR_AP_Pos                   24U                                            /*!< MPU RASR: ATTRS.AP Position */
+#define MPU_RASR_AP_Msk                   (0x7UL << MPU_RASR_AP_Pos)                     /*!< MPU RASR: ATTRS.AP Mask */
+
+#define MPU_RASR_TEX_Pos                  19U                                            /*!< MPU RASR: ATTRS.TEX Position */
+#define MPU_RASR_TEX_Msk                  (0x7UL << MPU_RASR_TEX_Pos)                    /*!< MPU RASR: ATTRS.TEX Mask */
+
+#define MPU_RASR_S_Pos                    18U                                            /*!< MPU RASR: ATTRS.S Position */
+#define MPU_RASR_S_Msk                    (1UL << MPU_RASR_S_Pos)                        /*!< MPU RASR: ATTRS.S Mask */
+
+#define MPU_RASR_C_Pos                    17U                                            /*!< MPU RASR: ATTRS.C Position */
+#define MPU_RASR_C_Msk                    (1UL << MPU_RASR_C_Pos)                        /*!< MPU RASR: ATTRS.C Mask */
+
+#define MPU_RASR_B_Pos                    16U                                            /*!< MPU RASR: ATTRS.B Position */
+#define MPU_RASR_B_Msk                    (1UL << MPU_RASR_B_Pos)                        /*!< MPU RASR: ATTRS.B Mask */
+
+#define MPU_RASR_SRD_Pos                   8U                                            /*!< MPU RASR: Sub-Region Disable Position */
+#define MPU_RASR_SRD_Msk                  (0xFFUL << MPU_RASR_SRD_Pos)                   /*!< MPU RASR: Sub-Region Disable Mask */
+
+#define MPU_RASR_SIZE_Pos                  1U                                            /*!< MPU RASR: Region Size Field Position */
+#define MPU_RASR_SIZE_Msk                 (0x1FUL << MPU_RASR_SIZE_Pos)                  /*!< MPU RASR: Region Size Field Mask */
+
+#define MPU_RASR_ENABLE_Pos                0U                                            /*!< MPU RASR: Region enable bit Position */
+#define MPU_RASR_ENABLE_Msk               (1UL /*<< MPU_RASR_ENABLE_Pos*/)               /*!< MPU RASR: Region enable bit Mask */
+
+/*@} end of group CMSIS_MPU */
+#endif /* defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U) */
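+
+/*
+  Example (illustrative sketch, not part of the original header): programming
+  MPU region 0 as a 32 KB read-only, execute-never region using the raw
+  registers defined above. The base address 0x20000000UL and the attribute
+  choices are assumptions made for this example; real code would sit inside
+  the __MPU_PRESENT guard and would normally use the ARM_MPU_* helpers from
+  "mpu_armv7.h" (included further below) instead.
+
+    MPU->RNR  = 0U;                                             // select region 0
+    MPU->RBAR = (0x20000000UL & MPU_RBAR_ADDR_Msk);             // region base address
+    MPU->RASR = (1UL   << MPU_RASR_XN_Pos)   |                  // execute never
+                (0x6UL << MPU_RASR_AP_Pos)   |                  // read-only for all
+                (14UL  << MPU_RASR_SIZE_Pos) |                  // 2^(14+1) = 32 KB
+                MPU_RASR_ENABLE_Msk;                            // enable region
+    MPU->CTRL = MPU_CTRL_PRIVDEFENA_Msk | MPU_CTRL_ENABLE_Msk;  // enable the MPU
+    __DSB();                                                    // ensure the new
+    __ISB();                                                    // setup takes effect
+*/
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_FPU     Floating Point Unit (FPU)
+  \brief    Type definitions for the Floating Point Unit (FPU)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Floating Point Unit (FPU).
+ */
+typedef struct
+{
+        uint32_t RESERVED0[1U];
+  __IOM uint32_t FPCCR;                  /*!< Offset: 0x004 (R/W)  Floating-Point Context Control Register */
+  __IOM uint32_t FPCAR;                  /*!< Offset: 0x008 (R/W)  Floating-Point Context Address Register */
+  __IOM uint32_t FPDSCR;                 /*!< Offset: 0x00C (R/W)  Floating-Point Default Status Control Register */
+  __IM  uint32_t MVFR0;                  /*!< Offset: 0x010 (R/ )  Media and FP Feature Register 0 */
+  __IM  uint32_t MVFR1;                  /*!< Offset: 0x014 (R/ )  Media and FP Feature Register 1 */
+  __IM  uint32_t MVFR2;                  /*!< Offset: 0x018 (R/ )  Media and FP Feature Register 2 */
+} FPU_Type;
+
+/* Floating-Point Context Control Register Definitions */
+#define FPU_FPCCR_ASPEN_Pos               31U                                            /*!< FPCCR: ASPEN bit Position */
+#define FPU_FPCCR_ASPEN_Msk               (1UL << FPU_FPCCR_ASPEN_Pos)                   /*!< FPCCR: ASPEN bit Mask */
+
+#define FPU_FPCCR_LSPEN_Pos               30U                                            /*!< FPCCR: LSPEN Position */
+#define FPU_FPCCR_LSPEN_Msk               (1UL << FPU_FPCCR_LSPEN_Pos)                   /*!< FPCCR: LSPEN bit Mask */
+
+#define FPU_FPCCR_MONRDY_Pos               8U                                            /*!< FPCCR: MONRDY Position */
+#define FPU_FPCCR_MONRDY_Msk              (1UL << FPU_FPCCR_MONRDY_Pos)                  /*!< FPCCR: MONRDY bit Mask */
+
+#define FPU_FPCCR_BFRDY_Pos                6U                                            /*!< FPCCR: BFRDY Position */
+#define FPU_FPCCR_BFRDY_Msk               (1UL << FPU_FPCCR_BFRDY_Pos)                   /*!< FPCCR: BFRDY bit Mask */
+
+#define FPU_FPCCR_MMRDY_Pos                5U                                            /*!< FPCCR: MMRDY Position */
+#define FPU_FPCCR_MMRDY_Msk               (1UL << FPU_FPCCR_MMRDY_Pos)                   /*!< FPCCR: MMRDY bit Mask */
+
+#define FPU_FPCCR_HFRDY_Pos                4U                                            /*!< FPCCR: HFRDY Position */
+#define FPU_FPCCR_HFRDY_Msk               (1UL << FPU_FPCCR_HFRDY_Pos)                   /*!< FPCCR: HFRDY bit Mask */
+
+#define FPU_FPCCR_THREAD_Pos               3U                                            /*!< FPCCR: processor mode bit Position */
+#define FPU_FPCCR_THREAD_Msk              (1UL << FPU_FPCCR_THREAD_Pos)                  /*!< FPCCR: processor mode active bit Mask */
+
+#define FPU_FPCCR_USER_Pos                 1U                                            /*!< FPCCR: privilege level bit Position */
+#define FPU_FPCCR_USER_Msk                (1UL << FPU_FPCCR_USER_Pos)                    /*!< FPCCR: privilege level bit Mask */
+
+#define FPU_FPCCR_LSPACT_Pos               0U                                            /*!< FPCCR: Lazy state preservation active bit Position */
+#define FPU_FPCCR_LSPACT_Msk              (1UL /*<< FPU_FPCCR_LSPACT_Pos*/)              /*!< FPCCR: Lazy state preservation active bit Mask */
+
+/* Floating-Point Context Address Register Definitions */
+#define FPU_FPCAR_ADDRESS_Pos              3U                                            /*!< FPCAR: ADDRESS bit Position */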
+#define FPU_FPCAR_ADDRESS_Msk             (0x1FFFFFFFUL << FPU_FPCAR_ADDRESS_Pos)        /*!< FPCAR: ADDRESS bit Mask */
+
+/* Floating-Point Default Status Control Register Definitions */
+#define FPU_FPDSCR_AHP_Pos                26U                                            /*!< FPDSCR: AHP bit Position */
+#define FPU_FPDSCR_AHP_Msk                (1UL << FPU_FPDSCR_AHP_Pos)                    /*!< FPDSCR: AHP bit Mask */
+
+#define FPU_FPDSCR_DN_Pos                 25U                                            /*!< FPDSCR: DN bit Position */
+#define FPU_FPDSCR_DN_Msk                 (1UL << FPU_FPDSCR_DN_Pos)                     /*!< FPDSCR: DN bit Mask */
+
+#define FPU_FPDSCR_FZ_Pos                 24U                                            /*!< FPDSCR: FZ bit Position */
+#define FPU_FPDSCR_FZ_Msk                 (1UL << FPU_FPDSCR_FZ_Pos)                     /*!< FPDSCR: FZ bit Mask */
+
+#define FPU_FPDSCR_RMode_Pos              22U                                            /*!< FPDSCR: RMode bit Position */
+#define FPU_FPDSCR_RMode_Msk              (3UL << FPU_FPDSCR_RMode_Pos)                  /*!< FPDSCR: RMode bit Mask */
+
+/* Media and FP Feature Register 0 Definitions */
+#define FPU_MVFR0_FP_rounding_modes_Pos   28U                                            /*!< MVFR0: FP rounding modes bits Position */
+#define FPU_MVFR0_FP_rounding_modes_Msk   (0xFUL << FPU_MVFR0_FP_rounding_modes_Pos)     /*!< MVFR0: FP rounding modes bits Mask */
+
+#define FPU_MVFR0_Short_vectors_Pos       24U                                            /*!< MVFR0: Short vectors bits Position */
+#define FPU_MVFR0_Short_vectors_Msk       (0xFUL << FPU_MVFR0_Short_vectors_Pos)         /*!< MVFR0: Short vectors bits Mask */
+
+#define FPU_MVFR0_Square_root_Pos         20U                                            /*!< MVFR0: Square root bits Position */
+#define FPU_MVFR0_Square_root_Msk         (0xFUL << FPU_MVFR0_Square_root_Pos)           /*!< MVFR0: Square root bits Mask */
+
+#define FPU_MVFR0_Divide_Pos              16U                                            /*!< MVFR0: Divide bits Position */
+#define FPU_MVFR0_Divide_Msk              (0xFUL << FPU_MVFR0_Divide_Pos)                /*!< MVFR0: Divide bits Mask */
+
+#define FPU_MVFR0_FP_excep_trapping_Pos   12U                                            /*!< MVFR0: FP exception trapping bits Position */
+#define FPU_MVFR0_FP_excep_trapping_Msk   (0xFUL << FPU_MVFR0_FP_excep_trapping_Pos)     /*!< MVFR0: FP exception trapping bits Mask */
+
+#define FPU_MVFR0_Double_precision_Pos     8U                                            /*!< MVFR0: Double-precision bits Position */
+#define FPU_MVFR0_Double_precision_Msk    (0xFUL << FPU_MVFR0_Double_precision_Pos)      /*!< MVFR0: Double-precision bits Mask */
+
+#define FPU_MVFR0_Single_precision_Pos     4U                                            /*!< MVFR0: Single-precision bits Position */
+#define FPU_MVFR0_Single_precision_Msk    (0xFUL << FPU_MVFR0_Single_precision_Pos)      /*!< MVFR0: Single-precision bits Mask */
+
+#define FPU_MVFR0_A_SIMD_registers_Pos     0U                                            /*!< MVFR0: A_SIMD registers bits Position */
+#define FPU_MVFR0_A_SIMD_registers_Msk    (0xFUL /*<< FPU_MVFR0_A_SIMD_registers_Pos*/)  /*!< MVFR0: A_SIMD registers bits Mask */
+
+/* Media and FP Feature Register 1 Definitions */
+#define FPU_MVFR1_FP_fused_MAC_Pos        28U                                            /*!< MVFR1: FP fused MAC bits Position */
+#define FPU_MVFR1_FP_fused_MAC_Msk        (0xFUL << FPU_MVFR1_FP_fused_MAC_Pos)          /*!< MVFR1: FP fused MAC bits Mask */
+
+#define FPU_MVFR1_FP_HPFP_Pos             24U                                            /*!< MVFR1: FP HPFP bits Position */
+#define FPU_MVFR1_FP_HPFP_Msk             (0xFUL << FPU_MVFR1_FP_HPFP_Pos)               /*!< MVFR1: FP HPFP bits Mask */
+
+#define FPU_MVFR1_D_NaN_mode_Pos           4U                                            /*!< MVFR1: D_NaN mode bits Position */
+#define FPU_MVFR1_D_NaN_mode_Msk          (0xFUL << FPU_MVFR1_D_NaN_mode_Pos)            /*!< MVFR1: D_NaN mode bits Mask */
+
+#define FPU_MVFR1_FtZ_mode_Pos             0U                                            /*!< MVFR1: FtZ mode bits Position */
+#define FPU_MVFR1_FtZ_mode_Msk            (0xFUL /*<< FPU_MVFR1_FtZ_mode_Pos*/)          /*!< MVFR1: FtZ mode bits Mask */
+
+/* Media and FP Feature Register 2 Definitions */
+
+#define FPU_MVFR2_VFP_Misc_Pos             4U                                            /*!< MVFR2: VFP Misc bits Position */
+#define FPU_MVFR2_VFP_Misc_Msk            (0xFUL << FPU_MVFR2_VFP_Misc_Pos)              /*!< MVFR2: VFP Misc bits Mask */
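+
+/*
+  Example (illustrative sketch, not part of the original header): selecting
+  default-NaN and flush-to-zero behaviour for floating-point contexts. FPDSCR
+  only holds the default FPSCR value loaded when a new FP context is created;
+  it does not alter the currently active FPSCR.
+
+    FPU->FPDSCR |= FPU_FPDSCR_DN_Msk | FPU_FPDSCR_FZ_Msk;
+*/
+
+/*@} end of group 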
CMSIS_FPU */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_CoreDebug Core Debug Registers (CoreDebug) + \brief Type definitions for the Core Debug Registers + @{ + */ + +/** + \brief Structure type to access the Core Debug Register (CoreDebug). + */ +typedef struct +{ + __IOM uint32_t DHCSR; /*!< Offset: 0x000 (R/W) Debug Halting Control and Status Register */ + __OM uint32_t DCRSR; /*!< Offset: 0x004 ( /W) Debug Core Register Selector Register */ + __IOM uint32_t DCRDR; /*!< Offset: 0x008 (R/W) Debug Core Register Data Register */ + __IOM uint32_t DEMCR; /*!< Offset: 0x00C (R/W) Debug Exception and Monitor Control Register */ +} CoreDebug_Type; + +/* Debug Halting Control and Status Register Definitions */ +#define CoreDebug_DHCSR_DBGKEY_Pos 16U /*!< CoreDebug DHCSR: DBGKEY Position */ +#define CoreDebug_DHCSR_DBGKEY_Msk (0xFFFFUL << CoreDebug_DHCSR_DBGKEY_Pos) /*!< CoreDebug DHCSR: DBGKEY Mask */ + +#define CoreDebug_DHCSR_S_RESET_ST_Pos 25U /*!< CoreDebug DHCSR: S_RESET_ST Position */ +#define CoreDebug_DHCSR_S_RESET_ST_Msk (1UL << CoreDebug_DHCSR_S_RESET_ST_Pos) /*!< CoreDebug DHCSR: S_RESET_ST Mask */ + +#define CoreDebug_DHCSR_S_RETIRE_ST_Pos 24U /*!< CoreDebug DHCSR: S_RETIRE_ST Position */ +#define CoreDebug_DHCSR_S_RETIRE_ST_Msk (1UL << CoreDebug_DHCSR_S_RETIRE_ST_Pos) /*!< CoreDebug DHCSR: S_RETIRE_ST Mask */ + +#define CoreDebug_DHCSR_S_LOCKUP_Pos 19U /*!< CoreDebug DHCSR: S_LOCKUP Position */ +#define CoreDebug_DHCSR_S_LOCKUP_Msk (1UL << CoreDebug_DHCSR_S_LOCKUP_Pos) /*!< CoreDebug DHCSR: S_LOCKUP Mask */ + +#define CoreDebug_DHCSR_S_SLEEP_Pos 18U /*!< CoreDebug DHCSR: S_SLEEP Position */ +#define CoreDebug_DHCSR_S_SLEEP_Msk (1UL << CoreDebug_DHCSR_S_SLEEP_Pos) /*!< CoreDebug DHCSR: S_SLEEP Mask */ + +#define CoreDebug_DHCSR_S_HALT_Pos 17U /*!< CoreDebug DHCSR: S_HALT Position */ +#define CoreDebug_DHCSR_S_HALT_Msk (1UL << CoreDebug_DHCSR_S_HALT_Pos) /*!< CoreDebug DHCSR: S_HALT Mask */ + +#define CoreDebug_DHCSR_S_REGRDY_Pos 16U /*!< CoreDebug DHCSR: S_REGRDY Position */ +#define CoreDebug_DHCSR_S_REGRDY_Msk (1UL << CoreDebug_DHCSR_S_REGRDY_Pos) /*!< CoreDebug DHCSR: S_REGRDY Mask */ + +#define CoreDebug_DHCSR_C_SNAPSTALL_Pos 5U /*!< CoreDebug DHCSR: C_SNAPSTALL Position */ +#define CoreDebug_DHCSR_C_SNAPSTALL_Msk (1UL << CoreDebug_DHCSR_C_SNAPSTALL_Pos) /*!< CoreDebug DHCSR: C_SNAPSTALL Mask */ + +#define CoreDebug_DHCSR_C_MASKINTS_Pos 3U /*!< CoreDebug DHCSR: C_MASKINTS Position */ +#define CoreDebug_DHCSR_C_MASKINTS_Msk (1UL << CoreDebug_DHCSR_C_MASKINTS_Pos) /*!< CoreDebug DHCSR: C_MASKINTS Mask */ + +#define CoreDebug_DHCSR_C_STEP_Pos 2U /*!< CoreDebug DHCSR: C_STEP Position */ +#define CoreDebug_DHCSR_C_STEP_Msk (1UL << CoreDebug_DHCSR_C_STEP_Pos) /*!< CoreDebug DHCSR: C_STEP Mask */ + +#define CoreDebug_DHCSR_C_HALT_Pos 1U /*!< CoreDebug DHCSR: C_HALT Position */ +#define CoreDebug_DHCSR_C_HALT_Msk (1UL << CoreDebug_DHCSR_C_HALT_Pos) /*!< CoreDebug DHCSR: C_HALT Mask */ + +#define CoreDebug_DHCSR_C_DEBUGEN_Pos 0U /*!< CoreDebug DHCSR: C_DEBUGEN Position */ +#define CoreDebug_DHCSR_C_DEBUGEN_Msk (1UL /*<< CoreDebug_DHCSR_C_DEBUGEN_Pos*/) /*!< CoreDebug DHCSR: C_DEBUGEN Mask */ + +/* Debug Core Register Selector Register Definitions */ +#define CoreDebug_DCRSR_REGWnR_Pos 16U /*!< CoreDebug DCRSR: REGWnR Position */ +#define CoreDebug_DCRSR_REGWnR_Msk (1UL << CoreDebug_DCRSR_REGWnR_Pos) /*!< CoreDebug DCRSR: REGWnR Mask */ + +#define CoreDebug_DCRSR_REGSEL_Pos 0U /*!< CoreDebug DCRSR: REGSEL Position */ +#define CoreDebug_DCRSR_REGSEL_Msk (0x1FUL /*<< 
CoreDebug_DCRSR_REGSEL_Pos*/)        /*!< CoreDebug DCRSR: REGSEL Mask */
+
+/* Debug Exception and Monitor Control Register Definitions */
+#define CoreDebug_DEMCR_TRCENA_Pos         24U                                            /*!< CoreDebug DEMCR: TRCENA Position */
+#define CoreDebug_DEMCR_TRCENA_Msk         (1UL << CoreDebug_DEMCR_TRCENA_Pos)            /*!< CoreDebug DEMCR: TRCENA Mask */
+
+#define CoreDebug_DEMCR_MON_REQ_Pos        19U                                            /*!< CoreDebug DEMCR: MON_REQ Position */
+#define CoreDebug_DEMCR_MON_REQ_Msk        (1UL << CoreDebug_DEMCR_MON_REQ_Pos)           /*!< CoreDebug DEMCR: MON_REQ Mask */
+
+#define CoreDebug_DEMCR_MON_STEP_Pos       18U                                            /*!< CoreDebug DEMCR: MON_STEP Position */
+#define CoreDebug_DEMCR_MON_STEP_Msk       (1UL << CoreDebug_DEMCR_MON_STEP_Pos)          /*!< CoreDebug DEMCR: MON_STEP Mask */
+
+#define CoreDebug_DEMCR_MON_PEND_Pos       17U                                            /*!< CoreDebug DEMCR: MON_PEND Position */
+#define CoreDebug_DEMCR_MON_PEND_Msk       (1UL << CoreDebug_DEMCR_MON_PEND_Pos)          /*!< CoreDebug DEMCR: MON_PEND Mask */
+
+#define CoreDebug_DEMCR_MON_EN_Pos         16U                                            /*!< CoreDebug DEMCR: MON_EN Position */
+#define CoreDebug_DEMCR_MON_EN_Msk         (1UL << CoreDebug_DEMCR_MON_EN_Pos)            /*!< CoreDebug DEMCR: MON_EN Mask */
+
+#define CoreDebug_DEMCR_VC_HARDERR_Pos     10U                                            /*!< CoreDebug DEMCR: VC_HARDERR Position */
+#define CoreDebug_DEMCR_VC_HARDERR_Msk     (1UL << CoreDebug_DEMCR_VC_HARDERR_Pos)        /*!< CoreDebug DEMCR: VC_HARDERR Mask */
+
+#define CoreDebug_DEMCR_VC_INTERR_Pos       9U                                            /*!< CoreDebug DEMCR: VC_INTERR Position */
+#define CoreDebug_DEMCR_VC_INTERR_Msk      (1UL << CoreDebug_DEMCR_VC_INTERR_Pos)         /*!< CoreDebug DEMCR: VC_INTERR Mask */
+
+#define CoreDebug_DEMCR_VC_BUSERR_Pos       8U                                            /*!< CoreDebug DEMCR: VC_BUSERR Position */
+#define CoreDebug_DEMCR_VC_BUSERR_Msk      (1UL << CoreDebug_DEMCR_VC_BUSERR_Pos)         /*!< CoreDebug DEMCR: VC_BUSERR Mask */
+
+#define CoreDebug_DEMCR_VC_STATERR_Pos      7U                                            /*!< CoreDebug DEMCR: VC_STATERR Position */
+#define CoreDebug_DEMCR_VC_STATERR_Msk     (1UL << CoreDebug_DEMCR_VC_STATERR_Pos)        /*!< CoreDebug DEMCR: VC_STATERR Mask */
+
+#define CoreDebug_DEMCR_VC_CHKERR_Pos       6U                                            /*!< CoreDebug DEMCR: VC_CHKERR Position */
+#define CoreDebug_DEMCR_VC_CHKERR_Msk      (1UL << CoreDebug_DEMCR_VC_CHKERR_Pos)         /*!< CoreDebug DEMCR: VC_CHKERR Mask */
+
+#define CoreDebug_DEMCR_VC_NOCPERR_Pos      5U                                            /*!< CoreDebug DEMCR: VC_NOCPERR Position */
+#define CoreDebug_DEMCR_VC_NOCPERR_Msk     (1UL << CoreDebug_DEMCR_VC_NOCPERR_Pos)        /*!< CoreDebug DEMCR: VC_NOCPERR Mask */
+
+#define CoreDebug_DEMCR_VC_MMERR_Pos        4U                                            /*!< CoreDebug DEMCR: VC_MMERR Position */
+#define CoreDebug_DEMCR_VC_MMERR_Msk       (1UL << CoreDebug_DEMCR_VC_MMERR_Pos)          /*!< CoreDebug DEMCR: VC_MMERR Mask */
+
+#define CoreDebug_DEMCR_VC_CORERESET_Pos    0U                                            /*!< CoreDebug DEMCR: VC_CORERESET Position */
+#define CoreDebug_DEMCR_VC_CORERESET_Msk   (1UL /*<< CoreDebug_DEMCR_VC_CORERESET_Pos*/)  /*!< CoreDebug DEMCR: VC_CORERESET Mask */
+
+/*@} end of group CMSIS_CoreDebug */
+
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_core_bitfield     Core register bit field macros
+  \brief      Macros for use with bit field definitions (xxx_Pos, xxx_Msk).
+  @{
+ */
+
+/**
+  \brief   Mask and shift a bit field value for use in a register bit range.
+  \param[in] field  Name of the register bit field.
+  \param[in] value  Value of the bit field. This parameter is interpreted as a uint32_t type.
+  \return           Masked and shifted value.
+*/
+#define _VAL2FLD(field, value)    (((uint32_t)(value) << field ## _Pos) & field ## _Msk)
+
+/**
+  \brief   Mask and shift a register value to extract a bit field value.
+  \param[in] field  Name of the register bit field.
+  \param[in] value  Value of register. This parameter is interpreted as a uint32_t type.
+  \return           Masked and shifted bit field value.
+*/
+#define _FLD2VAL(field, value)    (((uint32_t)(value) & field ## _Msk) >> field ## _Pos)
+
+/*@} end of group CMSIS_core_bitfield */
+
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_core_base     Core Definitions
+  \brief      Definitions for base addresses, unions, and structures.
+  @{
+ */
+
+/* Memory mapping of Core Hardware */
+#define SCS_BASE            (0xE000E000UL)                            /*!< System Control Space Base Address */
+#define ITM_BASE            (0xE0000000UL)                            /*!< ITM Base Address */
+#define DWT_BASE            (0xE0001000UL)                            /*!< DWT Base Address */
+#define TPI_BASE            (0xE0040000UL)                            /*!< TPI Base Address */
+#define CoreDebug_BASE      (0xE000EDF0UL)                            /*!< Core Debug Base Address */
+#define SysTick_BASE        (SCS_BASE +  0x0010UL)                    /*!< SysTick Base Address */
+#define NVIC_BASE           (SCS_BASE +  0x0100UL)                    /*!< NVIC Base Address */
+#define SCB_BASE            (SCS_BASE +  0x0D00UL)                    /*!< System Control Block Base Address */
+
+#define SCnSCB              ((SCnSCB_Type    *)     SCS_BASE      )   /*!< System control Register not in SCB */
+#define SCB                 ((SCB_Type       *)     SCB_BASE      )   /*!< SCB configuration struct */
+#define SysTick             ((SysTick_Type   *)     SysTick_BASE  )   /*!< SysTick configuration struct */
+#define NVIC                ((NVIC_Type      *)     NVIC_BASE     )   /*!< NVIC configuration struct */
+#define ITM                 ((ITM_Type       *)     ITM_BASE      )   /*!< ITM configuration struct */
+#define DWT                 ((DWT_Type       *)     DWT_BASE      )   /*!< DWT configuration struct */
+#define TPI                 ((TPI_Type       *)     TPI_BASE      )   /*!< TPI configuration struct */
+#define CoreDebug           ((CoreDebug_Type *)     CoreDebug_BASE)   /*!< Core Debug configuration struct */
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+  #define MPU_BASE          (SCS_BASE +  0x0D90UL)                    /*!< Memory Protection Unit */
+  #define MPU               ((MPU_Type       *)     MPU_BASE      )   /*!< Memory Protection Unit */
+#endif
+
+#define FPU_BASE            (SCS_BASE +  0x0F30UL)                    /*!< Floating Point Unit */
+#define FPU                 ((FPU_Type       *)     FPU_BASE      )   /*!< Floating Point Unit */
+
+/*@} */
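+
+/*
+  Example (illustrative sketch, not part of the original header): combining the
+  core register pointers defined above with the _VAL2FLD/_FLD2VAL helpers. The
+  SCB_AIRCR_* positions and masks are defined earlier in this header.
+
+    uint32_t prigroup = _FLD2VAL(SCB_AIRCR_PRIGROUP, SCB->AIRCR);   // extract PRIGROUP
+
+    SCB->AIRCR = _VAL2FLD(SCB_AIRCR_VECTKEY, 0x5FAUL) |             // unlock key
+                 _VAL2FLD(SCB_AIRCR_PRIGROUP, prigroup);            // keep grouping
+                                                                    // (other AIRCR bits cleared for brevity)
+*/
+
+
+
+/*******************************************************************************
+ *                Hardware Abstraction Layer
+  Core Function Interface contains:
+  - Core NVIC Functions
+  - Core SysTick Functions
+  - Core Debug Functions
+  - Core Register Access Functions
+ ******************************************************************************/
+/**
+  \defgroup CMSIS_Core_FunctionInterface Functions and Instructions Reference
+*/
+
+
+
+/* ##########################   NVIC functions  #################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_NVICFunctions NVIC Functions
+  \brief    Functions that manage interrupts and exceptions via the NVIC.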
+  @{
+ */
+
+#ifdef CMSIS_NVIC_VIRTUAL
+  #ifndef CMSIS_NVIC_VIRTUAL_HEADER_FILE
+    #define CMSIS_NVIC_VIRTUAL_HEADER_FILE "cmsis_nvic_virtual.h"
+  #endif
+  #include CMSIS_NVIC_VIRTUAL_HEADER_FILE
+#else
+  #define NVIC_SetPriorityGrouping    __NVIC_SetPriorityGrouping
+  #define NVIC_GetPriorityGrouping    __NVIC_GetPriorityGrouping
+  #define NVIC_EnableIRQ              __NVIC_EnableIRQ
+  #define NVIC_GetEnableIRQ           __NVIC_GetEnableIRQ
+  #define NVIC_DisableIRQ             __NVIC_DisableIRQ
+  #define NVIC_GetPendingIRQ          __NVIC_GetPendingIRQ
+  #define NVIC_SetPendingIRQ          __NVIC_SetPendingIRQ
+  #define NVIC_ClearPendingIRQ        __NVIC_ClearPendingIRQ
+  #define NVIC_GetActive              __NVIC_GetActive
+  #define NVIC_SetPriority            __NVIC_SetPriority
+  #define NVIC_GetPriority            __NVIC_GetPriority
+  #define NVIC_SystemReset            __NVIC_SystemReset
+#endif /* CMSIS_NVIC_VIRTUAL */
+
+#ifdef CMSIS_VECTAB_VIRTUAL
+  #ifndef CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+    #define CMSIS_VECTAB_VIRTUAL_HEADER_FILE "cmsis_vectab_virtual.h"
+  #endif
+  #include CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+#else
+  #define NVIC_SetVector              __NVIC_SetVector
+  #define NVIC_GetVector              __NVIC_GetVector
+#endif  /* (CMSIS_VECTAB_VIRTUAL) */
+
+#define NVIC_USER_IRQ_OFFSET          16
+
+
+/* The following EXC_RETURN values are saved to the LR on exception entry */
+#define EXC_RETURN_HANDLER         (0xFFFFFFF1UL)     /* return to Handler mode, uses MSP after return                               */
+#define EXC_RETURN_THREAD_MSP      (0xFFFFFFF9UL)     /* return to Thread mode, uses MSP after return                                */
+#define EXC_RETURN_THREAD_PSP      (0xFFFFFFFDUL)     /* return to Thread mode, uses PSP after return                                */
+#define EXC_RETURN_HANDLER_FPU     (0xFFFFFFE1UL)     /* return to Handler mode, uses MSP after return, restore floating-point state */
+#define EXC_RETURN_THREAD_MSP_FPU  (0xFFFFFFE9UL)     /* return to Thread mode, uses MSP after return, restore floating-point state  */
+#define EXC_RETURN_THREAD_PSP_FPU  (0xFFFFFFEDUL)     /* return to Thread mode, uses PSP after return, restore floating-point state  */
+
+
+/**
+  \brief   Set Priority Grouping
+  \details Sets the priority grouping field using the required unlock sequence.
+           The parameter PriorityGroup is assigned to the SCB->AIRCR [10:8] PRIGROUP field.
+           Only values from 0..7 are used.
+           In case of a conflict between priority grouping and available
+           priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+  \param [in]      PriorityGroup  Priority grouping field.
+ */
+__STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
+{
+  uint32_t reg_value;
+  uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL);             /* only values 0..7 are used          */
+
+  reg_value  =  SCB->AIRCR;                                                   /* read old register configuration    */
+  reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change               */
+  reg_value  =  (reg_value                                   |
+                ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+                (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos)  );              /* Insert write key and priority group */
+  SCB->AIRCR =  reg_value;
+}
+
+
+/**
+  \brief   Get Priority Grouping
+  \details Reads the priority grouping field from the NVIC Interrupt Controller.
+  \return                Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field).
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriorityGrouping(void)
+{
+  return ((uint32_t)((SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos));
+}
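+
+/*
+  Example (illustrative sketch, not part of the original header): typical
+  interrupt setup using the NVIC functions declared above and defined below.
+  TIM2_IRQn stands in for a device-specific interrupt number from the vendor
+  device header; a 4-bit NVIC (__NVIC_PRIO_BITS == 4) is assumed.
+
+    NVIC_SetPriorityGrouping(3U);                       // 4 preempt bits, 0 sub bits
+    uint32_t prio = NVIC_EncodePriority(3U, 1U, 0U);    // preempt level 1, sub 0
+    NVIC_SetPriority(TIM2_IRQn, prio);
+    NVIC_EnableIRQ(TIM2_IRQn);
+*/
+
+
+/**
+  \brief   Enable Interrupt
+  \details Enables a device specific interrupt in the NVIC interrupt controller.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative.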
+ */ +__STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + __COMPILER_BARRIER(); + NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); + __COMPILER_BARRIER(); + } +} + + +/** + \brief Get Interrupt Enable status + \details Returns a device specific interrupt enable status from the NVIC interrupt controller. + \param [in] IRQn Device specific interrupt number. + \return 0 Interrupt is not enabled. + \return 1 Interrupt is enabled. + \note IRQn must not be negative. + */ +__STATIC_INLINE uint32_t __NVIC_GetEnableIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + return((uint32_t)(((NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL)); + } + else + { + return(0U); + } +} + + +/** + \brief Disable Interrupt + \details Disables a device specific interrupt in the NVIC interrupt controller. + \param [in] IRQn Device specific interrupt number. + \note IRQn must not be negative. + */ +__STATIC_INLINE void __NVIC_DisableIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + NVIC->ICER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); + __DSB(); + __ISB(); + } +} + + +/** + \brief Get Pending Interrupt + \details Reads the NVIC pending register and returns the pending bit for the specified device specific interrupt. + \param [in] IRQn Device specific interrupt number. + \return 0 Interrupt status is not pending. + \return 1 Interrupt status is pending. + \note IRQn must not be negative. + */ +__STATIC_INLINE uint32_t __NVIC_GetPendingIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + return((uint32_t)(((NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL)); + } + else + { + return(0U); + } +} + + +/** + \brief Set Pending Interrupt + \details Sets the pending bit of a device specific interrupt in the NVIC pending register. + \param [in] IRQn Device specific interrupt number. + \note IRQn must not be negative. + */ +__STATIC_INLINE void __NVIC_SetPendingIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); + } +} + + +/** + \brief Clear Pending Interrupt + \details Clears the pending bit of a device specific interrupt in the NVIC pending register. + \param [in] IRQn Device specific interrupt number. + \note IRQn must not be negative. + */ +__STATIC_INLINE void __NVIC_ClearPendingIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + NVIC->ICPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); + } +} + + +/** + \brief Get Active Interrupt + \details Reads the active register in the NVIC and returns the active bit for the device specific interrupt. + \param [in] IRQn Device specific interrupt number. + \return 0 Interrupt status is not active. + \return 1 Interrupt status is active. + \note IRQn must not be negative. + */ +__STATIC_INLINE uint32_t __NVIC_GetActive(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + return((uint32_t)(((NVIC->IABR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL)); + } + else + { + return(0U); + } +} + + +/** + \brief Set Interrupt Priority + \details Sets the priority of a device specific interrupt or a processor exception. + The interrupt number can be positive to specify a device specific interrupt, + or negative to specify a processor exception. + \param [in] IRQn Interrupt number. 
+ \param [in] priority Priority to set. + \note The priority cannot be set for every processor exception. + */ +__STATIC_INLINE void __NVIC_SetPriority(IRQn_Type IRQn, uint32_t priority) +{ + if ((int32_t)(IRQn) >= 0) + { + NVIC->IP[((uint32_t)IRQn)] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL); + } + else + { + SCB->SHP[(((uint32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL); + } +} + + +/** + \brief Get Interrupt Priority + \details Reads the priority of a device specific interrupt or a processor exception. + The interrupt number can be positive to specify a device specific interrupt, + or negative to specify a processor exception. + \param [in] IRQn Interrupt number. + \return Interrupt Priority. + Value is aligned automatically to the implemented priority bits of the microcontroller. + */ +__STATIC_INLINE uint32_t __NVIC_GetPriority(IRQn_Type IRQn) +{ + + if ((int32_t)(IRQn) >= 0) + { + return(((uint32_t)NVIC->IP[((uint32_t)IRQn)] >> (8U - __NVIC_PRIO_BITS))); + } + else + { + return(((uint32_t)SCB->SHP[(((uint32_t)IRQn) & 0xFUL)-4UL] >> (8U - __NVIC_PRIO_BITS))); + } +} + + +/** + \brief Encode Priority + \details Encodes the priority for an interrupt with the given priority group, + preemptive priority value, and subpriority value. + In case of a conflict between priority grouping and available + priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set. + \param [in] PriorityGroup Used priority group. + \param [in] PreemptPriority Preemptive priority value (starting from 0). + \param [in] SubPriority Subpriority value (starting from 0). + \return Encoded priority. Value can be used in the function \ref NVIC_SetPriority(). + */ +__STATIC_INLINE uint32_t NVIC_EncodePriority (uint32_t PriorityGroup, uint32_t PreemptPriority, uint32_t SubPriority) +{ + uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */ + uint32_t PreemptPriorityBits; + uint32_t SubPriorityBits; + + PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp); + SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS)); + + return ( + ((PreemptPriority & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL)) << SubPriorityBits) | + ((SubPriority & (uint32_t)((1UL << (SubPriorityBits )) - 1UL))) + ); +} + + +/** + \brief Decode Priority + \details Decodes an interrupt priority value with a given priority group to + preemptive priority value and subpriority value. + In case of a conflict between priority grouping and available + priority bits (__NVIC_PRIO_BITS) the smallest possible priority group is set. + \param [in] Priority Priority value, which can be retrieved with the function \ref NVIC_GetPriority(). + \param [in] PriorityGroup Used priority group. + \param [out] pPreemptPriority Preemptive priority value (starting from 0). + \param [out] pSubPriority Subpriority value (starting from 0). + */ +__STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGroup, uint32_t* const pPreemptPriority, uint32_t* const pSubPriority) +{ + uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */ + uint32_t PreemptPriorityBits; + uint32_t SubPriorityBits; + + PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? 
(uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp);
+  SubPriorityBits     = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS));
+
+  *pPreemptPriority = (Priority >> SubPriorityBits) & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL);
+  *pSubPriority     = (Priority                   ) & (uint32_t)((1UL << (SubPriorityBits    )) - 1UL);
+}
+
+
+/**
+  \brief   Set Interrupt Vector
+  \details Sets an interrupt vector in the SRAM-based interrupt vector table.
+           The interrupt number can be positive to specify a device specific interrupt,
+           or negative to specify a processor exception.
+           VTOR must have been relocated to SRAM beforehand.
+  \param [in]   IRQn      Interrupt number
+  \param [in]   vector    Address of interrupt handler function
+ */
+__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
+{
+  uint32_t *vectors = (uint32_t *)SCB->VTOR;
+  vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;
+  /* ARM Application Note 321 states that the M4 does not require the architectural barrier */
+}
+
+
+/**
+  \brief   Get Interrupt Vector
+  \details Reads an interrupt vector from interrupt vector table.
+           The interrupt number can be positive to specify a device specific interrupt,
+           or negative to specify a processor exception.
+  \param [in]   IRQn      Interrupt number.
+  \return                 Address of interrupt handler function
+ */
+__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
+{
+  uint32_t *vectors = (uint32_t *)SCB->VTOR;
+  return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];
+}
+
+
+/**
+  \brief   System Reset
+  \details Initiates a system reset request to reset the MCU.
+ */
+__NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
+{
+  __DSB();                                                          /* Ensure all outstanding memory accesses, including
+                                                                       buffered writes, are completed before reset */
+  SCB->AIRCR  = (uint32_t)((0x5FAUL << SCB_AIRCR_VECTKEY_Pos)    |
+                           (SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) |
+                            SCB_AIRCR_SYSRESETREQ_Msk    );         /* Keep priority group unchanged */
+  __DSB();                                                          /* Ensure completion of memory access */
+
+  for(;;)                                                           /* wait until reset */
+  {
+    __NOP();
+  }
+}
+
+/*@} end of CMSIS_Core_NVICFunctions */
+
+
+/* ##########################  MPU functions  #################################### */
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+
+#include "mpu_armv7.h"
+
+#endif
+
+
+/* ##########################  FPU functions  #################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_FpuFunctions FPU Functions
+  \brief    Function that provides FPU type.
+  @{
+ */
+
+/**
+  \brief   get FPU type
+  \details returns the FPU type
+  \returns
+   - \b 0: No FPU
+   - \b 1: Single precision FPU
+   - \b 2: Double + Single precision FPU
+ */
+__STATIC_INLINE uint32_t SCB_GetFPUType(void)
+{
+  uint32_t mvfr0;
+
+  mvfr0 = FPU->MVFR0;
+  if ((mvfr0 & (FPU_MVFR0_Single_precision_Msk | FPU_MVFR0_Double_precision_Msk)) == 0x020U)
+  {
+    return 1U;           /* Single precision FPU */
+  }
+  else
+  {
+    return 0U;           /* No FPU */
+  }
+}
+
+
+/*@} end of CMSIS_Core_FpuFunctions */
+
+
+
+/* ##################################    SysTick function  ############################################ */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_SysTickFunctions SysTick Functions
+  \brief    Functions that configure the System Tick Timer.
+  @{
+ */
+
+#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U)
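+
+/*
+  Example (illustrative sketch, not part of the original header): a 1 ms tick
+  using SysTick_Config(), which is defined just below. SystemCoreClock is the
+  core clock frequency variable provided by the device's system code.
+
+    if (SysTick_Config(SystemCoreClock / 1000U) != 0U)   // interrupt every 1 ms
+    {
+      // reload value out of range: handle the error
+    }
+
+    void SysTick_Handler(void)                           // called on every tick
+    {
+      // periodic work
+    }
+*/
+
+/**
+  \brief   System Tick Configuration
+  \details Initializes the System Timer and its interrupt, and starts the System Tick Timer.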
+           Counter is in free running mode to generate periodic interrupts.
+  \param [in]  ticks  Number of ticks between two interrupts.
+  \return          0  Function succeeded.
+  \return          1  Function failed.
+  \note    When the variable __Vendor_SysTickConfig is set to 1, the
+           function SysTick_Config is not included. In this case, the file device.h
+           must contain a vendor-specific implementation of this function.
+ */
+__STATIC_INLINE uint32_t SysTick_Config(uint32_t ticks)
+{
+  if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk)
+  {
+    return (1UL);                                                   /* Reload value impossible */
+  }
+
+  SysTick->LOAD  = (uint32_t)(ticks - 1UL);                         /* set reload register */
+  NVIC_SetPriority (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for Systick Interrupt */
+  SysTick->VAL   = 0UL;                                             /* Load the SysTick Counter Value */
+  SysTick->CTRL  = SysTick_CTRL_CLKSOURCE_Msk |
+                   SysTick_CTRL_TICKINT_Msk   |
+                   SysTick_CTRL_ENABLE_Msk;                         /* Enable SysTick IRQ and SysTick Timer */
+  return (0UL);                                                     /* Function successful */
+}
+
+#endif
+
+/*@} end of CMSIS_Core_SysTickFunctions */
+
+
+
+/* ##################################### Debug In/Output function ########################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_core_DebugFunctions ITM Functions
+  \brief    Functions that access the ITM debug interface.
+  @{
+ */
+
+extern volatile int32_t ITM_RxBuffer;                              /*!< External variable to receive characters. */
+#define                 ITM_RXBUFFER_EMPTY  ((int32_t)0x5AA55AA5U) /*!< Value identifying \ref ITM_RxBuffer is ready for next character. */
+
+
+/**
+  \brief   ITM Send Character
+  \details Transmits a character via the ITM channel 0, and
+           \li Just returns when no debugger is connected that has claimed the output.
+           \li Is blocking when a debugger is connected, but the previous character sent has not been transmitted.
+  \param [in]     ch  Character to transmit.
+  \returns            Character to transmit.
+ */
+__STATIC_INLINE uint32_t ITM_SendChar (uint32_t ch)
+{
+  if (((ITM->TCR & ITM_TCR_ITMENA_Msk) != 0UL) &&      /* ITM enabled */
+      ((ITM->TER & 1UL               ) != 0UL)   )     /* ITM Port #0 enabled */
+  {
+    while (ITM->PORT[0U].u32 == 0UL)
+    {
+      __NOP();
+    }
+    ITM->PORT[0U].u8 = (uint8_t)ch;
+  }
+  return (ch);
+}
+
+
+/**
+  \brief   ITM Receive Character
+  \details Inputs a character via the external variable \ref ITM_RxBuffer.
+  \return             Received character.
+  \return         -1  No character pending.
+ */
+__STATIC_INLINE int32_t ITM_ReceiveChar (void)
+{
+  int32_t ch = -1;                           /* no character available */
+
+  if (ITM_RxBuffer != ITM_RXBUFFER_EMPTY)
+  {
+    ch = ITM_RxBuffer;
+    ITM_RxBuffer = ITM_RXBUFFER_EMPTY;       /* ready for next character */
+  }
+
+  return (ch);
+}
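+
+/*
+  Example (illustrative sketch, not part of the original header): retargeting
+  character output to ITM stimulus port 0 so printf() output appears in the
+  debugger's SWO/ITM console. Retargeting is toolchain specific; the
+  newlib-style _write() shown here is one common variant.
+
+    int _write(int fd, const char *buf, int len)
+    {
+      (void)fd;
+      for (int i = 0; i < len; i++)
+      {
+        ITM_SendChar((uint32_t)buf[i]);
+      }
+      return len;
+    }
+*/
+
+
+/**
+  \brief   ITM Check Character
+  \details Checks whether a character is pending for reading in the variable \ref ITM_RxBuffer.
+  \return          0  No character available.
+  \return          1  Character available.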
+ */ +__STATIC_INLINE int32_t ITM_CheckChar (void) +{ + + if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY) + { + return (0); /* no character available */ + } + else + { + return (1); /* character available */ + } +} + +/*@} end of CMSIS_core_DebugFunctions */ + + + + +#ifdef __cplusplus +} +#endif + +#endif /* __CORE_CM4_H_DEPENDANT */ + +#endif /* __CMSIS_GENERIC */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_common_tables.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_common_tables.h new file mode 100644 index 000000000..91d2be0a2 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_common_tables.h @@ -0,0 +1,529 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_common_tables.h + * Description: Extern declaration for common tables + * + * @version V1.9.0 + * @date 23 April 2021 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ARM_COMMON_TABLES_H +#define _ARM_COMMON_TABLES_H + +#include "arm_math_types.h" +#include "dsp/fast_math_functions.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + /* Double Precision Float CFFT twiddles */ + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024) + extern const uint16_t armBitRevTable[1024]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_16) + extern const uint64_t twiddleCoefF64_16[32]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_32) + extern const uint64_t twiddleCoefF64_32[64]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_64) + extern const uint64_t twiddleCoefF64_64[128]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_128) + extern const uint64_t twiddleCoefF64_128[256]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_256) + extern const uint64_t twiddleCoefF64_256[512]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || 
defined(ARM_TABLE_TWIDDLECOEF_F64_512) + extern const uint64_t twiddleCoefF64_512[1024]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_1024) + extern const uint64_t twiddleCoefF64_1024[2048]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_2048) + extern const uint64_t twiddleCoefF64_2048[4096]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F64_4096) + extern const uint64_t twiddleCoefF64_4096[8192]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_16) + extern const float32_t twiddleCoef_16[32]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_32) + extern const float32_t twiddleCoef_32[64]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_64) + extern const float32_t twiddleCoef_64[128]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_128) + extern const float32_t twiddleCoef_128[256]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_256) + extern const float32_t twiddleCoef_256[512]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_512) + extern const float32_t twiddleCoef_512[1024]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_1024) + extern const float32_t twiddleCoef_1024[2048]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048) + extern const float32_t twiddleCoef_2048[4096]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) + extern const float32_t twiddleCoef_4096[8192]; + #define twiddleCoef twiddleCoef_4096 + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + /* Q31 */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16) + extern const q31_t twiddleCoef_16_q31[24]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_32) + extern const q31_t twiddleCoef_32_q31[48]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || 
defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_64) + extern const q31_t twiddleCoef_64_q31[96]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_128) + extern const q31_t twiddleCoef_128_q31[192]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_256) + extern const q31_t twiddleCoef_256_q31[384]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_512) + extern const q31_t twiddleCoef_512_q31[768]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) + extern const q31_t twiddleCoef_1024_q31[1536]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) + extern const q31_t twiddleCoef_2048_q31[3072]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) + extern const q31_t twiddleCoef_4096_q31[6144]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_16) + extern const q15_t twiddleCoef_16_q15[24]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_32) + extern const q15_t twiddleCoef_32_q15[48]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_64) + extern const q15_t twiddleCoef_64_q15[96]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_128) + extern const q15_t twiddleCoef_128_q15[192]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_256) + extern const q15_t twiddleCoef_256_q15[384]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_512) + extern const q15_t twiddleCoef_512_q15[768]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) + extern const q15_t twiddleCoef_1024_q15[1536]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) + extern const q15_t twiddleCoef_2048_q15[3072]; + #endif /* 
!defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) + extern const q15_t twiddleCoef_4096_q15[6144]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + /* Double Precision Float RFFT twiddles */ + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32) + extern const uint64_t twiddleCoefF64_rfft_32[32]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64) + extern const uint64_t twiddleCoefF64_rfft_64[64]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128) + extern const uint64_t twiddleCoefF64_rfft_128[128]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256) + extern const uint64_t twiddleCoefF64_rfft_256[256]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512) + extern const uint64_t twiddleCoefF64_rfft_512[512]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024) + extern const uint64_t twiddleCoefF64_rfft_1024[1024]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048) + extern const uint64_t twiddleCoefF64_rfft_2048[2048]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096) + extern const uint64_t twiddleCoefF64_rfft_4096[4096]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32) + extern const float32_t twiddleCoef_rfft_32[32]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64) + extern const float32_t twiddleCoef_rfft_64[64]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128) + extern const float32_t twiddleCoef_rfft_128[128]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256) + extern const float32_t twiddleCoef_rfft_256[256]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512) + extern const float32_t twiddleCoef_rfft_512[512]; + #endif /* 
!defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024) + extern const float32_t twiddleCoef_rfft_1024[1024]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048) + extern const float32_t twiddleCoef_rfft_2048[2048]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096) + extern const float32_t twiddleCoef_rfft_4096[4096]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + + /* Double precision floating-point bit reversal tables */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_16) + #define ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH ((uint16_t)12) + extern const uint16_t armBitRevIndexTableF64_16[ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_32) + #define ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH ((uint16_t)24) + extern const uint16_t armBitRevIndexTableF64_32[ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_64) + #define ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH ((uint16_t)56) + extern const uint16_t armBitRevIndexTableF64_64[ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_128) + #define ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH ((uint16_t)112) + extern const uint16_t armBitRevIndexTableF64_128[ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_256) + #define ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH ((uint16_t)240) + extern const uint16_t armBitRevIndexTableF64_256[ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_512) + #define ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH ((uint16_t)480) + extern const uint16_t armBitRevIndexTableF64_512[ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_1024) + #define ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH ((uint16_t)992) + extern const uint16_t armBitRevIndexTableF64_1024[ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_2048) + #define ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH ((uint16_t)1984) + extern const uint16_t 
armBitRevIndexTableF64_2048[ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT64_4096) + #define ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH ((uint16_t)4032) + extern const uint16_t armBitRevIndexTableF64_4096[ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + /* floating-point bit reversal tables */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_16) + #define ARMBITREVINDEXTABLE_16_TABLE_LENGTH ((uint16_t)20) + extern const uint16_t armBitRevIndexTable16[ARMBITREVINDEXTABLE_16_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_32) + #define ARMBITREVINDEXTABLE_32_TABLE_LENGTH ((uint16_t)48) + extern const uint16_t armBitRevIndexTable32[ARMBITREVINDEXTABLE_32_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_64) + #define ARMBITREVINDEXTABLE_64_TABLE_LENGTH ((uint16_t)56) + extern const uint16_t armBitRevIndexTable64[ARMBITREVINDEXTABLE_64_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_128) + #define ARMBITREVINDEXTABLE_128_TABLE_LENGTH ((uint16_t)208) + extern const uint16_t armBitRevIndexTable128[ARMBITREVINDEXTABLE_128_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_256) + #define ARMBITREVINDEXTABLE_256_TABLE_LENGTH ((uint16_t)440) + extern const uint16_t armBitRevIndexTable256[ARMBITREVINDEXTABLE_256_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_512) + #define ARMBITREVINDEXTABLE_512_TABLE_LENGTH ((uint16_t)448) + extern const uint16_t armBitRevIndexTable512[ARMBITREVINDEXTABLE_512_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_1024) + #define ARMBITREVINDEXTABLE_1024_TABLE_LENGTH ((uint16_t)1800) + extern const uint16_t armBitRevIndexTable1024[ARMBITREVINDEXTABLE_1024_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_2048) + #define ARMBITREVINDEXTABLE_2048_TABLE_LENGTH ((uint16_t)3808) + extern const uint16_t armBitRevIndexTable2048[ARMBITREVINDEXTABLE_2048_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FLT_4096) + #define ARMBITREVINDEXTABLE_4096_TABLE_LENGTH ((uint16_t)4032) + extern const uint16_t armBitRevIndexTable4096[ARMBITREVINDEXTABLE_4096_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) 
|| defined(ARM_ALL_FFT_TABLES) */ + + + /* fixed-point bit reversal tables */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16) + #define ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH ((uint16_t)12) + extern const uint16_t armBitRevIndexTable_fixed_16[ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_32) + #define ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH ((uint16_t)24) + extern const uint16_t armBitRevIndexTable_fixed_32[ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64) + #define ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH ((uint16_t)56) + extern const uint16_t armBitRevIndexTable_fixed_64[ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_128) + #define ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH ((uint16_t)112) + extern const uint16_t armBitRevIndexTable_fixed_128[ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256) + #define ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH ((uint16_t)240) + extern const uint16_t armBitRevIndexTable_fixed_256[ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_512) + #define ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH ((uint16_t)480) + extern const uint16_t armBitRevIndexTable_fixed_512[ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024) + #define ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH ((uint16_t)992) + extern const uint16_t armBitRevIndexTable_fixed_1024[ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_2048) + #define ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH ((uint16_t)1984) + extern const uint16_t armBitRevIndexTable_fixed_2048[ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096) + #define ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH ((uint16_t)4032) + extern const uint16_t armBitRevIndexTable_fixed_4096[ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_F32) + extern const float32_t realCoefA[8192]; + extern const float32_t realCoefB[8192]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || 
defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q31) + extern const q31_t realCoefAQ31[8192]; + extern const q31_t realCoefBQ31[8192]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q15) + extern const q15_t realCoefAQ15[8192]; + extern const q15_t realCoefBQ15[8192]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_128) + extern const float32_t Weights_128[256]; + extern const float32_t cos_factors_128[128]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_512) + extern const float32_t Weights_512[1024]; + extern const float32_t cos_factors_512[512]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_2048) + extern const float32_t Weights_2048[4096]; + extern const float32_t cos_factors_2048[2048]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_8192) + extern const float32_t Weights_8192[16384]; + extern const float32_t cos_factors_8192[8192]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_128) + extern const q15_t WeightsQ15_128[256]; + extern const q15_t cos_factorsQ15_128[128]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_512) + extern const q15_t WeightsQ15_512[1024]; + extern const q15_t cos_factorsQ15_512[512]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_2048) + extern const q15_t WeightsQ15_2048[4096]; + extern const q15_t cos_factorsQ15_2048[2048]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_8192) + extern const q15_t WeightsQ15_8192[16384]; + extern const q15_t cos_factorsQ15_8192[8192]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_128) + extern const q31_t WeightsQ31_128[256]; + extern const q31_t cos_factorsQ31_128[128]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_512) + extern const q31_t WeightsQ31_512[1024]; + extern const q31_t cos_factorsQ31_512[512]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_2048) + extern const q31_t WeightsQ31_2048[4096]; + extern const q31_t cos_factorsQ31_2048[2048]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_8192) + extern const q31_t WeightsQ31_8192[16384]; + extern const q31_t cos_factorsQ31_8192[8192]; + #endif + +#endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_TABLES) */ + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FAST_ALLOW_TABLES) + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_RECIP_Q15) + extern const q15_t armRecipTableQ15[64]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_RECIP_Q31) + extern const q31_t armRecipTableQ31[64]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */ + + /* Tables for Fast Math Sine and Cosine */ + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || 
defined(ARM_TABLE_SIN_F32) + extern const float32_t sinTable_f32[FAST_MATH_TABLE_SIZE + 1]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_SIN_Q31) + extern const q31_t sinTable_q31[FAST_MATH_TABLE_SIZE + 1]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_SIN_Q15) + extern const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */ + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE) + extern const q31_t sqrtTable_Q31[256]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */ + #endif + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE) + extern const q15_t sqrtTable_Q15[256]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */ + #endif + +#endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FAST_TABLES) */ + +#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) + extern const float32_t exp_tab[8]; + extern const float32_t __logf_lut_f32[8]; +#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) +extern const unsigned char hwLUT[256]; +#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */ + +#ifdef __cplusplus +} +#endif + +#endif /* ARM_COMMON_TABLES_H */ + diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_common_tables_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_common_tables_f16.h new file mode 100644 index 000000000..f40c1a4ea --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_common_tables_f16.h @@ -0,0 +1,132 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_common_tables_f16.h + * Description: Extern declaration for common tables + * + * @version V1.9.0 + * @date 23 April 2021 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _ARM_COMMON_TABLES_F16_H +#define _ARM_COMMON_TABLES_F16_H + +#include "arm_math_types_f16.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + + /* F16 */ + #if !defined(__CC_ARM) && defined(ARM_FLOAT16_SUPPORTED) + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_16) + extern const float16_t twiddleCoefF16_16[32]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_32) + extern const float16_t twiddleCoefF16_32[64]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_64) + extern const float16_t twiddleCoefF16_64[128]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_128) + extern const float16_t twiddleCoefF16_128[256]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_256) + extern const float16_t twiddleCoefF16_256[512]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_512) + extern const float16_t twiddleCoefF16_512[1024]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) + extern const float16_t twiddleCoefF16_1024[2048]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048) + extern const float16_t twiddleCoefF16_2048[4096]; + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) + extern const float16_t twiddleCoefF16_4096[8192]; + #define twiddleCoefF16 twiddleCoefF16_4096 + #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */ + + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32) + extern const float16_t twiddleCoefF16_rfft_32[32]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64) + extern const float16_t twiddleCoefF16_rfft_64[64]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128) + extern const float16_t twiddleCoefF16_rfft_128[128]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256) + extern const float16_t twiddleCoefF16_rfft_256[256]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512) + extern const float16_t twiddleCoefF16_rfft_512[512]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024) + extern const 
float16_t twiddleCoefF16_rfft_1024[1024]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048) + extern const float16_t twiddleCoefF16_rfft_2048[2048]; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096) + extern const float16_t twiddleCoefF16_rfft_4096[4096]; + #endif + + #endif /* ARMAC5 */ + +#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */ + +#if !defined(__CC_ARM) && defined(ARM_FLOAT16_SUPPORTED) + +#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) + extern const float16_t exp_tab_f16[8]; + extern const float16_t __logf_lut_f16[8]; +#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */ +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _ARM_COMMON_TABLES_F16_H */ + + diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_const_structs.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_const_structs.h new file mode 100644 index 000000000..15e7726f7 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_const_structs.h @@ -0,0 +1,86 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_const_structs.h + * Description: Constant structs that are initialized for user convenience. + * For example, some can be given as arguments to the arm_cfft_f32() function. + * + * @version V1.9.0 + * @date 23 April 2021 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _ARM_CONST_STRUCTS_H +#define _ARM_CONST_STRUCTS_H + +#include "arm_math_types.h" +#include "arm_common_tables.h" +#include "dsp/transform_functions.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len16; + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len32; + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len64; + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len128; + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len256; + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len512; + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len1024; + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len2048; + extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096; + + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len16; + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len32; + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len64; + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len128; + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len256; + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len512; + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024; + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048; + extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096; + + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len16; + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len32; + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len64; + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len128; + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len256; + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len512; + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024; + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048; + extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096; + + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len16; + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len32; + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len64; + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len128; + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len256; + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len512; + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024; + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048; + extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_const_structs_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_const_structs_f16.h new file mode 100644 index 000000000..584941e6b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_const_structs_f16.h @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_const_structs_f16.h + * Description: Constant structs that are initialized for user convenience. + * For example, some can be given as arguments to the arm_cfft_f16() function. + * + * @version V1.9.0 + * @date 23 April 2021 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ARM_CONST_STRUCTS_F16_H +#define _ARM_CONST_STRUCTS_F16_H + +#include "arm_math_types_f16.h" +#include "arm_common_tables.h" +#include "arm_common_tables_f16.h" +#include "dsp/transform_functions_f16.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if !defined(__CC_ARM) && defined(ARM_FLOAT16_SUPPORTED) + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len16; + #endif + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len32; + #endif + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len64; + #endif + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len128; + #endif + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len256; + #endif + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len512; + #endif + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len1024; + #endif + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len2048; + #endif + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096)) + extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len4096; + #endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_helium_utils.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_helium_utils.h new file mode 100644 index 000000000..54a9db59b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_helium_utils.h @@ -0,0 +1,753 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_helium_utils.h + * Description: Utility functions for Helium development + * + * @version V1.9.0 + * @date 23 April 2021 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM 
Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ARM_UTILS_HELIUM_H_ +#define _ARM_UTILS_HELIUM_H_ + + +#ifdef __cplusplus +extern "C" +{ +#endif +/*************************************** + +Definitions available for MVEF and MVEI + +***************************************/ +#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)) && !defined(ARM_MATH_AUTOVECTORIZE) + +#define INACTIVELANE 0 /* inactive lane content */ + + +#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) */ + +/*************************************** + +Definitions available for MVEF only + +***************************************/ +#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE) + +__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in) +{ + float32_t acc; + + acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) + + vgetq_lane(in, 2) + vgetq_lane(in, 3); + + return acc; +} + + + + +/* newton initial guess */ +#define INVSQRT_MAGIC_F32 0x5f3759df +#define INV_NEWTON_INIT_F32 0x7EF127EA + + +#define INVSQRT_NEWTON_MVE_F32(invSqrt, xHalf, xStart)\ +{ \ + float32x4_t tmp; \ + \ + /* tmp = xhalf * x * x */ \ + tmp = vmulq(xStart, xStart); \ + tmp = vmulq(tmp, xHalf); \ + /* (1.5f - xhalf * x * x) */ \ + tmp = vsubq(vdupq_n_f32(1.5f), tmp); \ + /* x = x*(1.5f-xhalf*x*x); */ \ + invSqrt = vmulq(tmp, xStart); \ +} +#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) */ + + +/*************************************** + +Definitions available for f16 datatype with HW acceleration only + +***************************************/ +#if defined(ARM_FLOAT16_SUPPORTED) +#if defined (ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +__STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in) +{ + float16x8_t tmpVec; + _Float16 acc; + + tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in); + in = vaddq_f16(tmpVec, in); + tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in); + in = vaddq_f16(tmpVec, in); + acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4); + + return acc; +} + +__STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16( + float16x8_t vecIn) +{ + float16x8_t vecTmp, vecOut; + uint32_t tmp; + + vecTmp = (float16x8_t) vrev64q_s32((int32x4_t) vecIn); + // TO TRACK : using canonical addition leads to unefficient code generation for f16 + // vecTmp = vecTmp + vecAccCpx0; + /* + * Compute + * re0+re1 | im0+im1 | re0+re1 | im0+im1 + * re2+re3 | im2+im3 | re2+re3 | im2+im3 + */ + vecTmp = vaddq_f16(vecTmp, vecIn); + vecOut = vecTmp; + /* + * shift left, random tmp insertion in bottom + */ + vecOut = vreinterpretq_f16_s32(vshlcq_s32(vreinterpretq_s32_f16(vecOut) , &tmp, 32)); + /* + * Compute: + * DONTCARE | DONTCARE | re0+re1+re0+re1 |im0+im1+im0+im1 + * re0+re1+re2+re3 | im0+im1+im2+im3 | re2+re3+re2+re3 |im2+im3+im2+im3 + */ + vecOut = vaddq_f16(vecOut, vecTmp); + /* + 
* Cmplx sum is in 4rd & 5th f16 elt + * return full vector + */ + return vecOut; +} + + +#define mve_cmplx_sum_intra_r_i_f16(vec, Re, Im) \ +{ \ + float16x8_t vecOut = __mve_cmplx_sum_intra_vec_f16(vec); \ + Re = vgetq_lane(vecOut, 4); \ + Im = vgetq_lane(vecOut, 5); \ +} + +__STATIC_FORCEINLINE void mve_cmplx_sum_intra_vec_f16( + float16x8_t vecIn, + float16_t *pOut) +{ + float16x8_t vecOut = __mve_cmplx_sum_intra_vec_f16(vecIn); + /* + * Cmplx sum is in 4rd & 5th f16 elt + * use 32-bit extraction + */ + *(float32_t *) pOut = ((float32x4_t) vecOut)[2]; +} + + +#define INVSQRT_MAGIC_F16 0x59ba /* ( 0x1ba = 0x3759df >> 13) */ + +/* canonical version of INVSQRT_NEWTON_MVE_F16 leads to bad performance */ +#define INVSQRT_NEWTON_MVE_F16(invSqrt, xHalf, xStart) \ +{ \ + float16x8_t tmp; \ + \ + /* tmp = xhalf * x * x */ \ + tmp = vmulq(xStart, xStart); \ + tmp = vmulq(tmp, xHalf); \ + /* (1.5f - xhalf * x * x) */ \ + tmp = vsubq(vdupq_n_f16((float16_t)1.5), tmp); \ + /* x = x*(1.5f-xhalf*x*x); */ \ + invSqrt = vmulq(tmp, xStart); \ +} + +#endif +#endif + +/*************************************** + +Definitions available for MVEI and MVEF only + +***************************************/ +#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)) && !defined(ARM_MATH_AUTOVECTORIZE) +/* Following functions are used to transpose matrix in f32 and q31 cases */ +__STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve( + uint32_t * pDataSrc, + uint32_t * pDataDest) +{ + static const uint32x4_t vecOffs = { 0, 2, 1, 3 }; + /* + * + * | 0 1 | => | 0 2 | + * | 2 3 | | 1 3 | + * + */ + uint32x4_t vecIn = vldrwq_u32((uint32_t const *)pDataSrc); + vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs, vecIn); + + return (ARM_MATH_SUCCESS); +} + +__STATIC_INLINE arm_status arm_mat_trans_32bit_3x3_mve( + uint32_t * pDataSrc, + uint32_t * pDataDest) +{ + const uint32x4_t vecOffs1 = { 0, 3, 6, 1}; + const uint32x4_t vecOffs2 = { 4, 7, 2, 5}; + /* + * + * | 0 1 2 | | 0 3 6 | 4 x 32 flattened version | 0 3 6 1 | + * | 3 4 5 | => | 1 4 7 | => | 4 7 2 5 | + * | 6 7 8 | | 2 5 8 | (row major) | 8 . . . 
| + * + */ + uint32x4_t vecIn1 = vldrwq_u32((uint32_t const *) pDataSrc); + uint32x4_t vecIn2 = vldrwq_u32((uint32_t const *) &pDataSrc[4]); + + vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs1, vecIn1); + vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs2, vecIn2); + + pDataDest[8] = pDataSrc[8]; + + return (ARM_MATH_SUCCESS); +} + +__STATIC_INLINE arm_status arm_mat_trans_32bit_4x4_mve(uint32_t * pDataSrc, uint32_t * pDataDest) +{ + /* + * 4x4 Matrix transposition + * is 4 x de-interleave operation + * + * 0 1 2 3 0 4 8 12 + * 4 5 6 7 1 5 9 13 + * 8 9 10 11 2 6 10 14 + * 12 13 14 15 3 7 11 15 + */ + + uint32x4x4_t vecIn; + + vecIn = vld4q((uint32_t const *) pDataSrc); + vstrwq(pDataDest, vecIn.val[0]); + pDataDest += 4; + vstrwq(pDataDest, vecIn.val[1]); + pDataDest += 4; + vstrwq(pDataDest, vecIn.val[2]); + pDataDest += 4; + vstrwq(pDataDest, vecIn.val[3]); + + return (ARM_MATH_SUCCESS); +} + + +__STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve( + uint16_t srcRows, + uint16_t srcCols, + uint32_t * pDataSrc, + uint32_t * pDataDest) +{ + uint32x4_t vecOffs; + uint32_t i; + uint32_t blkCnt; + uint32_t const *pDataC; + uint32_t *pDataDestR; + uint32x4_t vecIn; + + vecOffs = vidupq_u32((uint32_t)0, 1); + vecOffs = vecOffs * srcCols; + + i = srcCols; + do + { + pDataC = (uint32_t const *) pDataSrc; + pDataDestR = pDataDest; + + blkCnt = srcRows >> 2; + while (blkCnt > 0U) + { + vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs); + vstrwq(pDataDestR, vecIn); + pDataDestR += 4; + pDataC = pDataC + srcCols * 4; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* + * tail + */ + blkCnt = srcRows & 3; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp32q(blkCnt); + vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs); + vstrwq_p(pDataDestR, vecIn, p0); + } + + pDataSrc += 1; + pDataDest += srcRows; + } + while (--i); + + return (ARM_MATH_SUCCESS); +} + +__STATIC_INLINE arm_status arm_mat_cmplx_trans_32bit( + uint16_t srcRows, + uint16_t srcCols, + uint32_t *pDataSrc, + uint16_t dstRows, + uint16_t dstCols, + uint32_t *pDataDest) +{ + uint32_t i; + uint32_t const *pDataC; + uint32_t *pDataRow; + uint32_t *pDataDestR, *pDataDestRow; + uint32x4_t vecOffsRef, vecOffsCur; + uint32_t blkCnt; + uint32x4_t vecIn; + +#ifdef ARM_MATH_MATRIX_CHECK + /* + * Check for matrix mismatch condition + */ + if ((srcRows != dstCols) || (srcCols != dstRows)) + { + /* + * Set status as ARM_MATH_SIZE_MISMATCH + */ + return ARM_MATH_SIZE_MISMATCH; + } +#else + (void)dstRows; + (void)dstCols; +#endif + + /* 2x2, 3x3 and 4x4 specialization to be added */ + + vecOffsRef[0] = 0; + vecOffsRef[1] = 1; + vecOffsRef[2] = srcCols << 1; + vecOffsRef[3] = (srcCols << 1) + 1; + + pDataRow = pDataSrc; + pDataDestRow = pDataDest; + i = srcCols; + do + { + pDataC = (uint32_t const *) pDataRow; + pDataDestR = pDataDestRow; + vecOffsCur = vecOffsRef; + + blkCnt = (srcRows * CMPLX_DIM) >> 2; + while (blkCnt > 0U) + { + vecIn = vldrwq_gather_shifted_offset(pDataC, vecOffsCur); + vstrwq(pDataDestR, vecIn); + pDataDestR += 4; + vecOffsCur = vaddq(vecOffsCur, (srcCols << 2)); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (srcRows * CMPLX_DIM) & 3; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp32q(blkCnt); + vecIn = vldrwq_gather_shifted_offset(pDataC, vecOffsCur); + vstrwq_p(pDataDestR, vecIn, p0); + } + + pDataRow += CMPLX_DIM; + pDataDestRow += (srcRows * CMPLX_DIM); + } + while (--i); + + return 
(ARM_MATH_SUCCESS); +} + +__STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest) +{ + pDataDest[0] = pDataSrc[0]; + pDataDest[3] = pDataSrc[3]; + pDataDest[2] = pDataSrc[1]; + pDataDest[1] = pDataSrc[2]; + + return (ARM_MATH_SUCCESS); +} + +__STATIC_INLINE arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest) +{ + static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 }; + uint16x8_t vecOffs1; + uint16x8_t vecIn1; + /* + * + * | 0 1 2 | | 0 3 6 | 8 x 16 flattened version | 0 3 6 1 4 7 2 5 | + * | 3 4 5 | => | 1 4 7 | => | 8 . . . . . . . | + * | 6 7 8 | | 2 5 8 | (row major) + * + */ + vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33); + vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc); + + vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1); + + pDataDest[8] = pDataSrc[8]; + + return (ARM_MATH_SUCCESS); +} + + +__STATIC_INLINE arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest) +{ + static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 }; + static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 }; + uint16x8_t vecOffs1, vecOffs2; + uint16x8_t vecIn1, vecIn2; + uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc; + + /* + * 4x4 Matrix transposition + * + * | 0 1 2 3 | | 0 4 8 12 | 8 x 16 flattened version + * | 4 5 6 7 | => | 1 5 9 13 | => [0 4 8 12 1 5 9 13] + * | 8 9 10 11 | | 2 6 10 14 | [2 6 10 14 3 7 11 15] + * | 12 13 14 15 | | 3 7 11 15 | + */ + + vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1); + vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2); + vecIn1 = vldrhq_u16(pDataSrcVec); + pDataSrcVec += 8; + vecIn2 = vldrhq_u16(pDataSrcVec); + + vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1); + vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2); + + + return (ARM_MATH_SUCCESS); +} + + + +__STATIC_INLINE arm_status arm_mat_trans_16bit_generic( + uint16_t srcRows, + uint16_t srcCols, + uint16_t * pDataSrc, + uint16_t * pDataDest) +{ + uint16x8_t vecOffs; + uint32_t i; + uint32_t blkCnt; + uint16_t const *pDataC; + uint16_t *pDataDestR; + uint16x8_t vecIn; + + vecOffs = vidupq_u16((uint32_t)0, 1); + vecOffs = vecOffs * srcCols; + + i = srcCols; + while(i > 0U) + { + pDataC = (uint16_t const *) pDataSrc; + pDataDestR = pDataDest; + + blkCnt = srcRows >> 3; + while (blkCnt > 0U) + { + vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs); + vstrhq_u16(pDataDestR, vecIn); + pDataDestR += 8; + pDataC = pDataC + srcCols * 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* + * tail + */ + blkCnt = srcRows & 7; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp16q(blkCnt); + vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs); + vstrhq_p_u16(pDataDestR, vecIn, p0); + } + pDataSrc += 1; + pDataDest += srcRows; + i--; + } + + return (ARM_MATH_SUCCESS); +} + + +__STATIC_INLINE arm_status arm_mat_cmplx_trans_16bit( + uint16_t srcRows, + uint16_t srcCols, + uint16_t *pDataSrc, + uint16_t dstRows, + uint16_t dstCols, + uint16_t *pDataDest) +{ + static const uint16_t loadCmplxCol[8] = { 0, 0, 1, 1, 2, 2, 3, 3 }; + int i; + uint16x8_t vecOffsRef, vecOffsCur; + uint16_t const *pDataC; + uint16_t *pDataRow; + uint16_t *pDataDestR, *pDataDestRow; + uint32_t blkCnt; + uint16x8_t vecIn; + +#ifdef ARM_MATH_MATRIX_CHECK + /* + * Check for matrix mismatch condition + */ + if ((srcRows != dstCols) || (srcCols != dstRows)) + { + /* + * Set status as ARM_MATH_SIZE_MISMATCH + */ + return 
ARM_MATH_SIZE_MISMATCH; + } +#else + (void)dstRows; + (void)dstCols; +#endif + + /* + * 2x2, 3x3 and 4x4 specialization to be added + */ + + + /* + * build [0, 1, 2xcol, 2xcol+1, 4xcol, 4xcol+1, 6xcol, 6xcol+1] + */ + vecOffsRef = vldrhq_u16((uint16_t const *) loadCmplxCol); + vecOffsRef = vmulq(vecOffsRef, (uint16_t) (srcCols * CMPLX_DIM)) + + viwdupq_u16((uint32_t)0, (uint16_t) 2, 1); + + pDataRow = pDataSrc; + pDataDestRow = pDataDest; + i = srcCols; + do + { + pDataC = (uint16_t const *) pDataRow; + pDataDestR = pDataDestRow; + vecOffsCur = vecOffsRef; + + blkCnt = (srcRows * CMPLX_DIM) >> 3; + while (blkCnt > 0U) + { + vecIn = vldrhq_gather_shifted_offset(pDataC, vecOffsCur); + vstrhq(pDataDestR, vecIn); + pDataDestR+= 8; // VEC_LANES_U16 + vecOffsCur = vaddq(vecOffsCur, (srcCols << 3)); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (srcRows * CMPLX_DIM) & 0x7; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp16q(blkCnt); + vecIn = vldrhq_gather_shifted_offset(pDataC, vecOffsCur); + vstrhq_p(pDataDestR, vecIn, p0); + } + + pDataRow += CMPLX_DIM; + pDataDestRow += (srcRows * CMPLX_DIM); + } + while (--i); + + return (ARM_MATH_SUCCESS); +} +#endif /* MVEF and MVEI */ + +/*************************************** + +Definitions available for MVEI only + +***************************************/ +#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_common_tables.h" + +#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff) +#define MVE_ASRL_SAT32(acc, shift) ((sqrshrl(acc, -(32-shift)) >> 32) & 0xffffffff) + + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE) +__STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn) +{ + q63x2_t vecTmpLL; + q31x4_t vecTmp0, vecTmp1; + q31_t scale; + q63_t tmp64; + q31x4_t vecNrm, vecDst, vecIdx, vecSignBits; + + + vecSignBits = vclsq(vecIn); + vecSignBits = vbicq_n_s32(vecSignBits, 1); + /* + * in = in << no_of_sign_bits; + */ + vecNrm = vshlq(vecIn, vecSignBits); + /* + * index = in >> 24; + */ + vecIdx = vecNrm >> 24; + vecIdx = vecIdx << 1; + + vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, (uint32x4_t)vecIdx); + + vecIdx = vecIdx + 1; + + vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, (uint32x4_t)vecIdx); + + vecTmp1 = vqrdmulhq(vecTmp1, vecNrm); + vecTmp0 = vecTmp0 - vecTmp1; + vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0); + vecTmp1 = vqrdmulhq(vecNrm, vecTmp1); + vecTmp1 = vdupq_n_s32(0x18000000) - vecTmp1; + vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1); + vecTmpLL = vmullbq_int(vecNrm, vecTmp0); + + /* + * scale elements 0, 2 + */ + scale = 26 + (vecSignBits[0] >> 1); + tmp64 = asrl(vecTmpLL[0], scale); + vecDst[0] = (q31_t) tmp64; + + scale = 26 + (vecSignBits[2] >> 1); + tmp64 = asrl(vecTmpLL[1], scale); + vecDst[2] = (q31_t) tmp64; + + vecTmpLL = vmulltq_int(vecNrm, vecTmp0); + + /* + * scale elements 1, 3 + */ + scale = 26 + (vecSignBits[1] >> 1); + tmp64 = asrl(vecTmpLL[0], scale); + vecDst[1] = (q31_t) tmp64; + + scale = 26 + (vecSignBits[3] >> 1); + tmp64 = asrl(vecTmpLL[1], scale); + vecDst[3] = (q31_t) tmp64; + /* + * set negative values to 0 + */ + vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s32(vecIn, 0)); + + return vecDst; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE) +__STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn) +{ 
+ q31x4_t vecTmpLev, vecTmpLodd, vecSignL; + q15x8_t vecTmp0, vecTmp1; + q15x8_t vecNrm, vecDst, vecIdx, vecSignBits; + + vecDst = vuninitializedq_s16(); + + vecSignBits = vclsq(vecIn); + vecSignBits = vbicq_n_s16(vecSignBits, 1); + /* + * in = in << no_of_sign_bits; + */ + vecNrm = vshlq(vecIn, vecSignBits); + + vecIdx = vecNrm >> 8; + vecIdx = vecIdx << 1; + + vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, (uint16x8_t)vecIdx); + + vecIdx = vecIdx + 1; + + vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, (uint16x8_t)vecIdx); + + vecTmp1 = vqrdmulhq(vecTmp1, vecNrm); + vecTmp0 = vecTmp0 - vecTmp1; + vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0); + vecTmp1 = vqrdmulhq(vecNrm, vecTmp1); + vecTmp1 = vdupq_n_s16(0x1800) - vecTmp1; + vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1); + + vecSignBits = vecSignBits >> 1; + + vecTmpLev = vmullbq_int(vecNrm, vecTmp0); + vecTmpLodd = vmulltq_int(vecNrm, vecTmp0); + + vecTmp0 = vecSignBits + 10; + /* + * negate sign to apply register based vshl + */ + vecTmp0 = -vecTmp0; + + /* + * shift even elements + */ + vecSignL = vmovlbq(vecTmp0); + vecTmpLev = vshlq(vecTmpLev, vecSignL); + /* + * shift odd elements + */ + vecSignL = vmovltq(vecTmp0); + vecTmpLodd = vshlq(vecTmpLodd, vecSignL); + /* + * merge and narrow odd and even parts + */ + vecDst = vmovnbq_s32(vecDst, vecTmpLev); + vecDst = vmovntq_s32(vecDst, vecTmpLodd); + /* + * set negative values to 0 + */ + vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s16(vecIn, 0)); + + return vecDst; +} +#endif + +#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI) */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math.h new file mode 100644 index 000000000..79ce541ea --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math.h @@ -0,0 +1,236 @@ +/****************************************************************************** + * @file arm_math.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + \mainpage CMSIS DSP Software Library + * + * \section intro Introduction + * + * This user manual describes the CMSIS DSP software library, + * a suite of common signal processing functions for use on Cortex-M and Cortex-A processor + * based devices. 
+ * + * The library is divided into a number of functions, each covering a specific category: + * - Basic math functions + * - Fast math functions + * - Complex math functions + * - Filtering functions + * - Matrix functions + * - Transform functions + * - Motor control functions + * - Statistical functions + * - Support functions + * - Interpolation functions + * - Support Vector Machine functions (SVM) + * - Bayes classifier functions + * - Distance functions + * - Quaternion functions + * + * The library generally has separate functions for operating on 8-bit integers, 16-bit integers, + * 32-bit integers and 32-bit floating-point values. + * + * The library provides vectorized versions of most algorithms for Helium + * and of most f32 algorithms for Neon. + * + * When using a vectorized version, provide a little padding (3 words) after the end of + * a buffer, because the vectorized code may read slightly past the end + * of a buffer. You do not have to modify your buffers; just ensure that the + * end of the buffer plus the padding does not fall outside a valid memory region. + * + * \section using Using the Library + * + * The library is released in source form. It is strongly advised to compile the library with -Ofast + * for the best performance. + * + * The library functions are declared in the public file arm_math.h, which is placed in the Include folder. + * Simply include this file. If you don't want to include everything, you can also rely + * on the headers in the Include/dsp folder and use only what you need; a minimal sketch of this + * style appears after the pack table below. + * + * \section example Examples + * + * The library ships with a number of examples which demonstrate how to use the library functions. + * + * \section toolchain Toolchain Support + * + * The library is now tested on Fast Models building with cmake. + * Cores M0, M4, M7, M33, M55 and A32 are tested. + * + * + * \section preprocessor Preprocessor Macros + * + * Each library project has different preprocessor macros. + * + * - ARM_MATH_BIG_ENDIAN: + * + * Define macro ARM_MATH_BIG_ENDIAN to build the library for big-endian targets. By default the library builds for little-endian targets. + * + * - ARM_MATH_MATRIX_CHECK: + * + * Define macro ARM_MATH_MATRIX_CHECK to check the input and output sizes of matrices. + * + * - ARM_MATH_ROUNDING: + * + * Define macro ARM_MATH_ROUNDING to enable rounding in the support functions. + * + * - ARM_MATH_LOOPUNROLL: + * + * Define macro ARM_MATH_LOOPUNROLL to enable manual loop unrolling in DSP functions. + * + * - ARM_MATH_NEON: + * + * Define macro ARM_MATH_NEON to enable Neon versions of the DSP functions. + * It is not enabled by default when Neon is available because performance is + * dependent on the compiler and target architecture. + * + * - ARM_MATH_NEON_EXPERIMENTAL: + * + * Define macro ARM_MATH_NEON_EXPERIMENTAL to enable experimental Neon versions + * of some DSP functions. Experimental Neon versions currently do not perform better + * than the scalar versions. + * + * - ARM_MATH_HELIUM: + * + * Implies the flags ARM_MATH_MVEF, ARM_MATH_MVEI and ARM_MATH_MVE_FLOAT16. + * + * - ARM_MATH_HELIUM_EXPERIMENTAL: + * + * Only taken into account when ARM_MATH_MVEF, ARM_MATH_MVEI or ARM_MATH_MVE_FLOAT16 are defined. + * Enables some vector versions which may have worse performance than scalar, + * depending on the core / compiler configuration. + * + * - ARM_MATH_MVEF: + * + * Selects Helium versions of the f32 algorithms. + * It implies ARM_MATH_FLOAT16 and ARM_MATH_MVEI.
+ *
+ * - ARM_MATH_MVEI:
+ *
+ * Selects the Helium versions of the integer and fixed-point algorithms.
+ *
+ * - ARM_MATH_MVE_FLOAT16:
+ *
+ * Selects the MVE float16 implementations of some algorithms (requires the MVE extension).
+ *
+ * - DISABLEFLOAT16:
+ *
+ * Disables the float16 algorithms when __fp16 is not supported for a
+ * specific compiler / core configuration.
+ * This only applies to the scalar versions: when the vector architecture
+ * supports f16, float16 cannot be disabled.
+ *
+ * - ARM_MATH_AUTOVECTORIZE:
+ *
+ * With Helium or Neon, disables the use of hand-vectorized code with C intrinsics
+ * and uses pure C instead; the vectorization is then left to the compiler.
+ *
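+ * As a short illustration of how these macros combine (a sketch of the
+ * convention used throughout the library sources, shown here for reference),
+ * a kernel typically selects between the intrinsic and scalar paths like this:
+ * \code
+ * #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ *   // hand-optimized Helium path written with C intrinsics
+ * #else
+ *   // portable scalar path; vectorization is left to the compiler
+ * #endif
+ * \endcode
+ *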
+ * \section pack CMSIS-DSP in ARM::CMSIS Pack
+ *
+ * The following files relevant to CMSIS-DSP are present in the ARM::CMSIS Pack directories:
+ * |File/Folder                      |Content                                                             |
+ * |---------------------------------|--------------------------------------------------------------------|
+ * |\b CMSIS\\Documentation\\DSP     | This documentation                                                 |
+ * |\b CMSIS\\DSP\\Examples          | Example projects demonstrating the usage of the library functions  |
+ * |\b CMSIS\\DSP\\Include           | DSP_Lib include files for using and building the lib               |
+ * |\b CMSIS\\DSP\\PrivateInclude    | DSP_Lib private include files for building the lib                 |
+ * |\b CMSIS\\DSP\\Lib               | DSP_Lib binaries                                                   |
+ * |\b CMSIS\\DSP\\Source            | DSP_Lib source files                                               |
+ *
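+ * As a minimal usage sketch (illustrative only; arm_add_f32() is declared in
+ * dsp/basic_math_functions.h, which arm_math.h pulls in):
+ * \code
+ * #include "arm_math.h"
+ *
+ * float32_t srcA[32], srcB[32], dst[32];
+ * // srcA and srcB assumed to be filled with 32 samples each
+ * arm_add_f32(srcA, srcB, dst, 32);
+ * \endcode
+ *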
+ * \section rev Revision History of CMSIS-DSP + * Please refer to \ref ChangeLog_pg. + */ + + + + + + + + + + + +/** + * @defgroup groupExamples Examples + */ + + + + + +#ifndef _ARM_MATH_H +#define _ARM_MATH_H + + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#include "dsp/basic_math_functions.h" +#include "dsp/interpolation_functions.h" +#include "dsp/bayes_functions.h" +#include "dsp/matrix_functions.h" +#include "dsp/complex_math_functions.h" +#include "dsp/statistics_functions.h" +#include "dsp/controller_functions.h" +#include "dsp/support_functions.h" +#include "dsp/distance_functions.h" +#include "dsp/svm_functions.h" +#include "dsp/fast_math_functions.h" +#include "dsp/transform_functions.h" +#include "dsp/filtering_functions.h" +#include "dsp/quaternion_math_functions.h" + + + +#ifdef __cplusplus +extern "C" +{ +#endif + + + + +//#define TABLE_SPACING_Q31 0x400000 +//#define TABLE_SPACING_Q15 0x80 + + + + + +#ifdef __cplusplus +} +#endif + + +#endif /* _ARM_MATH_H */ + +/** + * + * End of file. + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_f16.h new file mode 100644 index 000000000..c046a127a --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_f16.h @@ -0,0 +1,59 @@ +/****************************************************************************** + * @file arm_math_f16.h + * @brief Public header file for f16 function of the CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _ARM_MATH_F16_H +#define _ARM_MATH_F16_H + +#include "arm_math.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "arm_math_types_f16.h" +#include "dsp/none.h" +#include "dsp/utils.h" +#include "dsp/basic_math_functions_f16.h" +#include "dsp/interpolation_functions_f16.h" +#include "dsp/bayes_functions_f16.h" +#include "dsp/matrix_functions_f16.h" +#include "dsp/complex_math_functions_f16.h" +#include "dsp/statistics_functions_f16.h" +#include "dsp/controller_functions_f16.h" +#include "dsp/support_functions_f16.h" +#include "dsp/distance_functions_f16.h" +#include "dsp/svm_functions_f16.h" +#include "dsp/fast_math_functions_f16.h" +#include "dsp/transform_functions_f16.h" +#include "dsp/filtering_functions_f16.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _ARM_MATH_F16_H */ + + diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_memory.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_memory.h new file mode 100644 index 000000000..771bb7cd9 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_memory.h @@ -0,0 +1,241 @@ +/****************************************************************************** + * @file arm_math_memory.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ARM_MATH_MEMORY_H_ + +#define _ARM_MATH_MEMORY_H_ + +#include "arm_math_types.h" + + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + @brief definition to read/write two 16 bit values. + @deprecated + */ +#if defined ( __CC_ARM ) + #define __SIMD32_TYPE int32_t __packed +#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 ) + #define __SIMD32_TYPE int32_t +#elif defined ( __GNUC__ ) + #define __SIMD32_TYPE int32_t +#elif defined ( __ICCARM__ ) + #define __SIMD32_TYPE int32_t __packed +#elif defined ( __TI_ARM__ ) + #define __SIMD32_TYPE int32_t +#elif defined ( __CSMC__ ) + #define __SIMD32_TYPE int32_t +#elif defined ( __TASKING__ ) + #define __SIMD32_TYPE __un(aligned) int32_t +#elif defined(_MSC_VER ) + #define __SIMD32_TYPE int32_t +#else + #error Unknown compiler +#endif + +#define __SIMD32(addr) (*(__SIMD32_TYPE **) & (addr)) +#define __SIMD32_CONST(addr) ( (__SIMD32_TYPE * ) (addr)) +#define _SIMD32_OFFSET(addr) (*(__SIMD32_TYPE * ) (addr)) +#define __SIMD64(addr) (*( int64_t **) & (addr)) + + +/* SIMD replacement */ + + +/** + @brief Read 2 Q15 from Q15 pointer. 
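+
+  For example (an illustrative sketch), packing two adjacent Q15 samples into
+  one 32-bit word:
+  \code
+  q15_t buf[2] = { 0x1234, 0x5678 };
+  q31_t packed = read_q15x2(buf);   // 0x56781234 on little endian targets
+  \endcode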
+ @param[in] pQ15 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q15x2 ( + q15_t * pQ15) +{ + q31_t val; + +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (&val, pQ15, 4); +#else + val = (pQ15[1] << 16) | (pQ15[0] & 0x0FFFF) ; +#endif + + return (val); +} + +/** + @brief Read 2 Q15 from Q15 pointer and increment pointer afterwards. + @param[in] pQ15 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q15x2_ia ( + q15_t ** pQ15) +{ + q31_t val; + +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (&val, *pQ15, 4); +#else + val = ((*pQ15)[1] << 16) | ((*pQ15)[0] & 0x0FFFF); +#endif + + *pQ15 += 2; + return (val); +} + +/** + @brief Read 2 Q15 from Q15 pointer and decrement pointer afterwards. + @param[in] pQ15 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q15x2_da ( + q15_t ** pQ15) +{ + q31_t val; + +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (&val, *pQ15, 4); +#else + val = ((*pQ15)[1] << 16) | ((*pQ15)[0] & 0x0FFFF); +#endif + + *pQ15 -= 2; + return (val); +} + +/** + @brief Write 2 Q15 to Q15 pointer and increment pointer afterwards. + @param[in] pQ15 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q15x2_ia ( + q15_t ** pQ15, + q31_t value) +{ + q31_t val = value; +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (*pQ15, &val, 4); +#else + (*pQ15)[0] = (val & 0x0FFFF); + (*pQ15)[1] = (val >> 16) & 0x0FFFF; +#endif + + *pQ15 += 2; +} + +/** + @brief Write 2 Q15 to Q15 pointer. + @param[in] pQ15 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q15x2 ( + q15_t * pQ15, + q31_t value) +{ + q31_t val = value; + +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (pQ15, &val, 4); +#else + pQ15[0] = val & 0x0FFFF; + pQ15[1] = val >> 16; +#endif +} + + +/** + @brief Read 4 Q7 from Q7 pointer and increment pointer afterwards. + @param[in] pQ7 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q7x4_ia ( + q7_t ** pQ7) +{ + q31_t val; + + +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (&val, *pQ7, 4); +#else + val =(((*pQ7)[3] & 0x0FF) << 24) | (((*pQ7)[2] & 0x0FF) << 16) | (((*pQ7)[1] & 0x0FF) << 8) | ((*pQ7)[0] & 0x0FF); +#endif + + *pQ7 += 4; + + return (val); +} + +/** + @brief Read 4 Q7 from Q7 pointer and decrement pointer afterwards. + @param[in] pQ7 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q7x4_da ( + q7_t ** pQ7) +{ + q31_t val; +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (&val, *pQ7, 4); +#else + val = ((((*pQ7)[3]) & 0x0FF) << 24) | ((((*pQ7)[2]) & 0x0FF) << 16) | ((((*pQ7)[1]) & 0x0FF) << 8) | ((*pQ7)[0] & 0x0FF); +#endif + *pQ7 -= 4; + + return (val); +} + +/** + @brief Write 4 Q7 to Q7 pointer and increment pointer afterwards. 
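+
+  For example (an illustrative sketch), storing four Q7 samples at once and
+  advancing the destination pointer:
+  \code
+  q7_t out[4];
+  q7_t *p = out;
+  write_q7x4_ia(&p, 0x44332211);   // out = {0x11, 0x22, 0x33, 0x44}, p advanced by 4
+  \endcode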
+ @param[in] pQ7 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q7x4_ia ( + q7_t ** pQ7, + q31_t value) +{ + q31_t val = value; +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (*pQ7, &val, 4); +#else + (*pQ7)[0] = val & 0x0FF; + (*pQ7)[1] = (val >> 8) & 0x0FF; + (*pQ7)[2] = (val >> 16) & 0x0FF; + (*pQ7)[3] = (val >> 24) & 0x0FF; + +#endif + *pQ7 += 4; +} + + +#ifdef __cplusplus +} +#endif + +#endif /*ifndef _ARM_MATH_MEMORY_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_types.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_types.h new file mode 100644 index 000000000..e9f6ed247 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_types.h @@ -0,0 +1,592 @@ +/****************************************************************************** + * @file arm_math_types.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef _ARM_MATH_TYPES_H_
+
+#define _ARM_MATH_TYPES_H_
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Compiler specific diagnostic adjustment */
+#if defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __GNUC__ )
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wsign-conversion"
+  #pragma GCC diagnostic ignored "-Wconversion"
+  #pragma GCC diagnostic ignored "-Wunused-parameter"
+
+#elif defined ( __ICCARM__ )
+
+#elif defined ( __TI_ARM__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+
+/* Included for intrinsics definitions */
+#if defined (_MSC_VER )
+#include <stdint.h>
+#define __STATIC_FORCEINLINE static __forceinline
+#define __STATIC_INLINE static __inline
+#define __ALIGNED(x) __declspec(align(x))
+
+#elif defined (__GNUC_PYTHON__)
+#include <stdint.h>
+#define __ALIGNED(x) __attribute__((aligned(x)))
+#define __STATIC_FORCEINLINE static inline __attribute__((always_inline))
+#define __STATIC_INLINE static inline
+
+#else
+#include "cmsis_compiler.h"
+#endif
+
+
+
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+
+/* evaluate ARM DSP feature */
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+  #define ARM_MATH_DSP 1
+#endif
+
+#if defined(ARM_MATH_NEON)
+#include <arm_neon.h>
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+  #if !defined(ARM_MATH_NEON_FLOAT16)
+  #define ARM_MATH_NEON_FLOAT16
+  #endif
+#endif
+#endif
+
+#if !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if __ARM_FEATURE_MVE
+  #if !defined(ARM_MATH_MVEI)
+    #define ARM_MATH_MVEI
+  #endif
+#endif
+
+#if (__ARM_FEATURE_MVE & 2)
+  #if !defined(ARM_MATH_MVEF)
+    #define ARM_MATH_MVEF
+  #endif
+  #if !defined(ARM_MATH_MVE_FLOAT16)
+    #define ARM_MATH_MVE_FLOAT16
+  #endif
+#endif
+
+#endif /*!defined(ARM_MATH_AUTOVECTORIZE)*/
+
+
+#if defined (ARM_MATH_HELIUM)
+  #if !defined(ARM_MATH_MVEF)
+    #define ARM_MATH_MVEF
+  #endif
+
+  #if !defined(ARM_MATH_MVEI)
+    #define ARM_MATH_MVEI
+  #endif
+
+  #if !defined(ARM_MATH_MVE_FLOAT16)
+    #define ARM_MATH_MVE_FLOAT16
+  #endif
+#endif
+
+
+
+#if defined ( __CC_ARM )
+  /* Enter low optimization region - place directly above function definition */
+  #if defined( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_ENTER \
+       _Pragma ("push") \
+       _Pragma ("O1")
+  #else
+    #define LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_EXIT \
+       _Pragma ("pop")
+  #else
+    #define LOW_OPTIMIZATION_EXIT
+  #endif
+
+  /* Enter low optimization region - place directly above function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined (__ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __GNUC__ )
+  #define LOW_OPTIMIZATION_ENTER \
+       __attribute__(( optimize("-O1") ))
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __ICCARM__ )
+  /* Enter low optimization region - place directly above function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_ENTER \
+       _Pragma ("optimize=low")
+  #else
+    #define LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define LOW_OPTIMIZATION_EXIT
+
+  /* Enter low optimization region - place directly above function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER \
+       _Pragma ("optimize=low")
+  #else
+    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __TI_ARM__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __CSMC__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __TASKING__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+#endif
+
+
+
+/* Compiler specific diagnostic adjustment */
+#if defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __GNUC__ )
+#pragma GCC diagnostic pop
+
+#elif defined ( __ICCARM__ )
+
+#elif defined ( __TI_ARM__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#if __ARM_FEATURE_MVE
+#include <arm_mve.h>
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * @brief 8-bit fractional data type in 1.7 format.
+   */
+  typedef int8_t q7_t;
+
+  /**
+   * @brief 16-bit fractional data type in 1.15 format.
+   */
+  typedef int16_t q15_t;
+
+  /**
+   * @brief 32-bit fractional data type in 1.31 format.
+   */
+  typedef int32_t q31_t;
+
+  /**
+   * @brief 64-bit fractional data type in 1.63 format.
+   */
+  typedef int64_t q63_t;
+
+  /**
+   * @brief 32-bit floating-point type definition.
+   */
+  typedef float float32_t;
+
+  /**
+   * @brief 64-bit floating-point type definition.
+   */
+  typedef double float64_t;
+
+  /**
+   * @brief vector types
+   */
+#if defined(ARM_MATH_NEON) || (defined (ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE))
+  /**
+   * @brief 64-bit fractional 128-bit vector data type in 1.63 format
+   */
+  typedef int64x2_t q63x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 1.31 format.
+   */
+  typedef int32x4_t q31x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector data type with 16-bit alignment in 1.15 format.
+   */
+  typedef __ALIGNED(2) int16x8_t q15x8_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector data type with 8-bit alignment in 1.7 format.
+   */
+  typedef __ALIGNED(1) int8x16_t q7x16_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector pair data type in 1.31 format.
+   */
+  typedef int32x4x2_t q31x4x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector quadruplet data type in 1.31 format.
+   */
+  typedef int32x4x4_t q31x4x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector pair data type in 1.15 format.
+   */
+  typedef int16x8x2_t q15x8x2_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector quadruplet data type in 1.15 format.
+   */
+  typedef int16x8x4_t q15x8x4_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector pair data type in 1.7 format.
+ */ + typedef int8x16x2_t q7x16x2_t; + + /** + * @brief 8-bit fractional 128-bit vector quadruplet data type in 1.7 format. + */ + typedef int8x16x4_t q7x16x4_t; + + /** + * @brief 32-bit fractional data type in 9.23 format. + */ + typedef int32_t q23_t; + + /** + * @brief 32-bit fractional 128-bit vector data type in 9.23 format. + */ + typedef int32x4_t q23x4_t; + + /** + * @brief 64-bit status 128-bit vector data type. + */ + typedef int64x2_t status64x2_t; + + /** + * @brief 32-bit status 128-bit vector data type. + */ + typedef int32x4_t status32x4_t; + + /** + * @brief 16-bit status 128-bit vector data type. + */ + typedef int16x8_t status16x8_t; + + /** + * @brief 8-bit status 128-bit vector data type. + */ + typedef int8x16_t status8x16_t; + + +#endif + +#if defined(ARM_MATH_NEON) || (defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)) /* floating point vector*/ + /** + * @brief 32-bit floating-point 128-bit vector type + */ + typedef float32x4_t f32x4_t; + + /** + * @brief 32-bit floating-point 128-bit vector pair data type + */ + typedef float32x4x2_t f32x4x2_t; + + /** + * @brief 32-bit floating-point 128-bit vector quadruplet data type + */ + typedef float32x4x4_t f32x4x4_t; + + /** + * @brief 32-bit ubiquitous 128-bit vector data type + */ + typedef union _any32x4_t + { + float32x4_t f; + int32x4_t i; + } any32x4_t; + +#endif + +#if defined(ARM_MATH_NEON) + /** + * @brief 32-bit fractional 64-bit vector data type in 1.31 format. + */ + typedef int32x2_t q31x2_t; + + /** + * @brief 16-bit fractional 64-bit vector data type in 1.15 format. + */ + typedef __ALIGNED(2) int16x4_t q15x4_t; + + /** + * @brief 8-bit fractional 64-bit vector data type in 1.7 format. + */ + typedef __ALIGNED(1) int8x8_t q7x8_t; + + /** + * @brief 32-bit float 64-bit vector data type. 
+   */
+  typedef float32x2_t f32x2_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector triplet data type
+   */
+  typedef float32x4x3_t f32x4x3_t;
+
+
+  /**
+   * @brief 32-bit fractional 128-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x4x3_t q31x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x8x3_t q15x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x16x3_t q7x16x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector pair data type
+   */
+  typedef float32x2x2_t f32x2x2_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector triplet data type
+   */
+  typedef float32x2x3_t f32x2x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector quadruplet data type
+   */
+  typedef float32x2x4_t f32x2x4_t;
+
+
+  /**
+   * @brief 32-bit fractional 64-bit vector pair data type in 1.31 format
+   */
+  typedef int32x2x2_t q31x2x2_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x2x3_t q31x2x3_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector quadruplet data type in 1.31 format
+   */
+  typedef int32x2x4_t q31x2x4_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector pair data type in 1.15 format
+   */
+  typedef int16x4x2_t q15x4x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x4x3_t q15x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector quadruplet data type in 1.15 format
+   */
+  typedef int16x4x4_t q15x4x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector pair data type in 1.7 format
+   */
+  typedef int8x8x2_t q7x8x2_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x8x3_t q7x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector quadruplet data type in 1.7 format
+   */
+  typedef int8x8x4_t q7x8x4_t;
+
+  /**
+   * @brief 32-bit ubiquitous 64-bit vector data type
+   */
+  typedef union _any32x2_t
+  {
+      float32x2_t f;
+      int32x2_t i;
+  } any32x2_t;
+
+
+  /**
+   * @brief 32-bit status 64-bit vector data type.
+   */
+  typedef int32x2_t status32x2_t;
+
+  /**
+   * @brief 16-bit status 64-bit vector data type.
+   */
+  typedef int16x4_t status16x4_t;
+
+  /**
+   * @brief 8-bit status 64-bit vector data type.
+   */
+  typedef int8x8_t status8x8_t;
+
+#endif
+
+
+
+
+
+#define F64_MAX ((float64_t)DBL_MAX)
+#define F32_MAX ((float32_t)FLT_MAX)
+
+
+
+#define F64_MIN (-DBL_MAX)
+#define F32_MIN (-FLT_MAX)
+
+
+
+#define F64_ABSMAX ((float64_t)DBL_MAX)
+#define F32_ABSMAX ((float32_t)FLT_MAX)
+
+
+
+#define F64_ABSMIN ((float64_t)0.0)
+#define F32_ABSMIN ((float32_t)0.0)
+
+
+#define Q31_MAX ((q31_t)(0x7FFFFFFFL))
+#define Q15_MAX ((q15_t)(0x7FFF))
+#define Q7_MAX ((q7_t)(0x7F))
+#define Q31_MIN ((q31_t)(0x80000000L))
+#define Q15_MIN ((q15_t)(0x8000))
+#define Q7_MIN ((q7_t)(0x80))
+
+#define Q31_ABSMAX ((q31_t)(0x7FFFFFFFL))
+#define Q15_ABSMAX ((q15_t)(0x7FFF))
+#define Q7_ABSMAX ((q7_t)(0x7F))
+#define Q31_ABSMIN ((q31_t)0)
+#define Q15_ABSMIN ((q15_t)0)
+#define Q7_ABSMIN ((q7_t)0)
+
+  /* Dimension C vector space */
+  #define CMPLX_DIM 2
+
+  /**
+   * @brief Error status returned by some functions in the library.
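+   *
+   * A typical check (an illustrative sketch; arm_mat_inverse_f32() is declared
+   * in dsp/matrix_functions.h):
+   * \code
+   * arm_status status = arm_mat_inverse_f32(&src, &dst);
+   * if (status == ARM_MATH_SINGULAR)
+   * {
+   *     // the input matrix could not be inverted
+   * }
+   * \endcode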
+ */ + + typedef enum + { + ARM_MATH_SUCCESS = 0, /**< No error */ + ARM_MATH_ARGUMENT_ERROR = -1, /**< One or more arguments are incorrect */ + ARM_MATH_LENGTH_ERROR = -2, /**< Length of data buffer is incorrect */ + ARM_MATH_SIZE_MISMATCH = -3, /**< Size of matrices is not compatible with the operation */ + ARM_MATH_NANINF = -4, /**< Not-a-number (NaN) or infinity is generated */ + ARM_MATH_SINGULAR = -5, /**< Input matrix is singular and cannot be inverted */ + ARM_MATH_TEST_FAILURE = -6, /**< Test Failed */ + ARM_MATH_DECOMPOSITION_FAILURE = -7 /**< Decomposition Failed */ + } arm_status; + + +#ifdef __cplusplus +} +#endif + +#endif /*ifndef _ARM_MATH_TYPES_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_types_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_types_f16.h new file mode 100644 index 000000000..baf8750f3 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_math_types_f16.h @@ -0,0 +1,156 @@ +/****************************************************************************** + * @file arm_math_types_f16.h + * @brief Public header file for f16 function of the CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ARM_MATH_TYPES_F16_H +#define _ARM_MATH_TYPES_F16_H + +#include "arm_math_types.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if !defined( __CC_ARM ) + +/** + * @brief 16-bit floating-point type definition. + * This is already defined in arm_mve.h + * + * This is not fully supported on ARM AC5. + */ + +/* + +Check if the type __fp16 is available. +If it is not available, f16 version of the kernels +won't be built. 
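+
+Both __ARM_FP16_FORMAT_IEEE and __ARM_FP16_FORMAT_ALTERNATIVE are ACLE feature
+macros predefined by the compiler according to the __fp16 representation in
+use; the check below accepts either of them.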
+ +*/ +#if !(__ARM_FEATURE_MVE & 2) + #if !defined(DISABLEFLOAT16) + #if defined(__ARM_FP16_FORMAT_IEEE) || defined(__ARM_FP16_FORMAT_ALTERNATIVE) + typedef __fp16 float16_t; + #define ARM_FLOAT16_SUPPORTED + #endif + #endif +#else + /* When Vector float16, this flag is always defined and can't be disabled */ + #define ARM_FLOAT16_SUPPORTED +#endif + +#if defined(ARM_MATH_NEON) || (defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)) /* floating point vector*/ + +#if defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_NEON_FLOAT16) + + /** + * @brief 16-bit floating-point 128-bit vector data type + */ + typedef __ALIGNED(2) float16x8_t f16x8_t; + + /** + * @brief 16-bit floating-point 128-bit vector pair data type + */ + typedef float16x8x2_t f16x8x2_t; + + /** + * @brief 16-bit floating-point 128-bit vector quadruplet data type + */ + typedef float16x8x4_t f16x8x4_t; + + /** + * @brief 16-bit ubiquitous 128-bit vector data type + */ + typedef union _any16x8_t + { + float16x8_t f; + int16x8_t i; + } any16x8_t; +#endif + +#endif + +#if defined(ARM_MATH_NEON) + + +#if defined(ARM_MATH_NEON_FLOAT16) + /** + * @brief 16-bit float 64-bit vector data type. + */ + typedef __ALIGNED(2) float16x4_t f16x4_t; + + /** + * @brief 16-bit floating-point 128-bit vector triplet data type + */ + typedef float16x8x3_t f16x8x3_t; + + /** + * @brief 16-bit floating-point 64-bit vector pair data type + */ + typedef float16x4x2_t f16x4x2_t; + + /** + * @brief 16-bit floating-point 64-bit vector triplet data type + */ + typedef float16x4x3_t f16x4x3_t; + + /** + * @brief 16-bit floating-point 64-bit vector quadruplet data type + */ + typedef float16x4x4_t f16x4x4_t; + + /** + * @brief 16-bit ubiquitous 64-bit vector data type + */ + typedef union _any16x4_t + { + float16x4_t f; + int16x4_t i; + } any16x4_t; +#endif + +#endif + + + +#if defined(ARM_FLOAT16_SUPPORTED) +#define F16_MAX ((float16_t)__FLT16_MAX__) +#define F16_MIN (-(float16_t)__FLT16_MAX__) + +#define F16_ABSMAX ((float16_t)__FLT16_MAX__) +#define F16_ABSMIN ((float16_t)0.0f16) + +#define F16INFINITY ((float16_t)__builtin_inf()) + +#endif /* ARM_FLOAT16_SUPPORTED*/ +#endif /* !defined( __CC_ARM ) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ARM_MATH_F16_H */ + + diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_mve_tables.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_mve_tables.h new file mode 100644 index 000000000..fe41a443c --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_mve_tables.h @@ -0,0 +1,231 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mve_tables.h + * Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc + * used for MVE implementation only + * + * @version V1.9.0 + * @date 23 April 2021 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #ifndef _ARM_MVE_TABLES_H + #define _ARM_MVE_TABLES_H + +#include "arm_math_types.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + + + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_16) || defined(ARM_TABLE_TWIDDLECOEF_F32_32) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_16_f32[2]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_16_f32[2]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_16_f32[2]; +extern float32_t rearranged_twiddle_stride1_16_f32[8]; +extern float32_t rearranged_twiddle_stride2_16_f32[8]; +extern float32_t rearranged_twiddle_stride3_16_f32[8]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_64) || defined(ARM_TABLE_TWIDDLECOEF_F32_128) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_64_f32[3]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_64_f32[3]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_64_f32[3]; +extern float32_t rearranged_twiddle_stride1_64_f32[40]; +extern float32_t rearranged_twiddle_stride2_64_f32[40]; +extern float32_t rearranged_twiddle_stride3_64_f32[40]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_256) || defined(ARM_TABLE_TWIDDLECOEF_F32_512) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_256_f32[4]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_256_f32[4]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_256_f32[4]; +extern float32_t rearranged_twiddle_stride1_256_f32[168]; +extern float32_t rearranged_twiddle_stride2_256_f32[168]; +extern float32_t rearranged_twiddle_stride3_256_f32[168]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_1024) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_f32[5]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_f32[5]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_f32[5]; +extern float32_t rearranged_twiddle_stride1_1024_f32[680]; +extern float32_t rearranged_twiddle_stride2_1024_f32[680]; +extern float32_t rearranged_twiddle_stride3_1024_f32[680]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) || defined(ARM_TABLE_TWIDDLECOEF_F32_8192) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_f32[6]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_f32[6]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_f32[6]; +extern float32_t rearranged_twiddle_stride1_4096_f32[2728]; +extern float32_t rearranged_twiddle_stride2_4096_f32[2728]; +extern float32_t rearranged_twiddle_stride3_4096_f32[2728]; +#endif + + +#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */ + +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + + + +#if defined(ARM_MATH_MVEI) && 
!defined(ARM_MATH_AUTOVECTORIZE) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16) || defined(ARM_TABLE_TWIDDLECOEF_Q31_32) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_16_q31[2]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_16_q31[2]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_16_q31[2]; +extern q31_t rearranged_twiddle_stride1_16_q31[8]; +extern q31_t rearranged_twiddle_stride2_16_q31[8]; +extern q31_t rearranged_twiddle_stride3_16_q31[8]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_64) || defined(ARM_TABLE_TWIDDLECOEF_Q31_128) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_64_q31[3]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_64_q31[3]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_64_q31[3]; +extern q31_t rearranged_twiddle_stride1_64_q31[40]; +extern q31_t rearranged_twiddle_stride2_64_q31[40]; +extern q31_t rearranged_twiddle_stride3_64_q31[40]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_256) || defined(ARM_TABLE_TWIDDLECOEF_Q31_512) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_256_q31[4]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_256_q31[4]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_256_q31[4]; +extern q31_t rearranged_twiddle_stride1_256_q31[168]; +extern q31_t rearranged_twiddle_stride2_256_q31[168]; +extern q31_t rearranged_twiddle_stride3_256_q31[168]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) || defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_q31[5]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_q31[5]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_q31[5]; +extern q31_t rearranged_twiddle_stride1_1024_q31[680]; +extern q31_t rearranged_twiddle_stride2_1024_q31[680]; +extern q31_t rearranged_twiddle_stride3_1024_q31[680]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) || defined(ARM_TABLE_TWIDDLECOEF_Q31_8192) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_q31[6]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_q31[6]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_q31[6]; +extern q31_t rearranged_twiddle_stride1_4096_q31[2728]; +extern q31_t rearranged_twiddle_stride2_4096_q31[2728]; +extern q31_t rearranged_twiddle_stride3_4096_q31[2728]; +#endif + + +#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */ + +#endif /* defined(ARM_MATH_MVEI) */ + + + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_16) || defined(ARM_TABLE_TWIDDLECOEF_Q15_32) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_16_q15[2]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_16_q15[2]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_16_q15[2]; +extern q15_t rearranged_twiddle_stride1_16_q15[8]; +extern q15_t rearranged_twiddle_stride2_16_q15[8]; +extern q15_t rearranged_twiddle_stride3_16_q15[8]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || 
defined(ARM_TABLE_TWIDDLECOEF_Q15_64) || defined(ARM_TABLE_TWIDDLECOEF_Q15_128) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_64_q15[3]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_64_q15[3]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_64_q15[3]; +extern q15_t rearranged_twiddle_stride1_64_q15[40]; +extern q15_t rearranged_twiddle_stride2_64_q15[40]; +extern q15_t rearranged_twiddle_stride3_64_q15[40]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_256) || defined(ARM_TABLE_TWIDDLECOEF_Q15_512) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_256_q15[4]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_256_q15[4]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_256_q15[4]; +extern q15_t rearranged_twiddle_stride1_256_q15[168]; +extern q15_t rearranged_twiddle_stride2_256_q15[168]; +extern q15_t rearranged_twiddle_stride3_256_q15[168]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) || defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_q15[5]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_q15[5]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_q15[5]; +extern q15_t rearranged_twiddle_stride1_1024_q15[680]; +extern q15_t rearranged_twiddle_stride2_1024_q15[680]; +extern q15_t rearranged_twiddle_stride3_1024_q15[680]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) || defined(ARM_TABLE_TWIDDLECOEF_Q15_8192) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_q15[6]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_q15[6]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_q15[6]; +extern q15_t rearranged_twiddle_stride1_4096_q15[2728]; +extern q15_t rearranged_twiddle_stride2_4096_q15[2728]; +extern q15_t rearranged_twiddle_stride3_4096_q15[2728]; +#endif + + +#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */ + +#endif /* defined(ARM_MATH_MVEI) */ + + + +#ifdef __cplusplus +} +#endif + +#endif /*_ARM_MVE_TABLES_H*/ + diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_mve_tables_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_mve_tables_f16.h new file mode 100644 index 000000000..c93aed181 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_mve_tables_f16.h @@ -0,0 +1,109 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mve_tables_f16.h + * Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc + * used for MVE implementation only + * + * @version V1.9.0 + * @date 23 April 2021 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #ifndef _ARM_MVE_TABLES_F16_H + #define _ARM_MVE_TABLES_F16_H + +#include "arm_math_types_f16.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + + + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_16) || defined(ARM_TABLE_TWIDDLECOEF_F16_32) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_16_f16[2]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_16_f16[2]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_16_f16[2]; +extern float16_t rearranged_twiddle_stride1_16_f16[8]; +extern float16_t rearranged_twiddle_stride2_16_f16[8]; +extern float16_t rearranged_twiddle_stride3_16_f16[8]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_64) || defined(ARM_TABLE_TWIDDLECOEF_F16_128) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_64_f16[3]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_64_f16[3]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_64_f16[3]; +extern float16_t rearranged_twiddle_stride1_64_f16[40]; +extern float16_t rearranged_twiddle_stride2_64_f16[40]; +extern float16_t rearranged_twiddle_stride3_64_f16[40]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_256) || defined(ARM_TABLE_TWIDDLECOEF_F16_512) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_256_f16[4]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_256_f16[4]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_256_f16[4]; +extern float16_t rearranged_twiddle_stride1_256_f16[168]; +extern float16_t rearranged_twiddle_stride2_256_f16[168]; +extern float16_t rearranged_twiddle_stride3_256_f16[168]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_f16[5]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_f16[5]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_f16[5]; +extern float16_t rearranged_twiddle_stride1_1024_f16[680]; +extern float16_t rearranged_twiddle_stride2_1024_f16[680]; +extern float16_t rearranged_twiddle_stride3_1024_f16[680]; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) || defined(ARM_TABLE_TWIDDLECOEF_F16_8192) + +extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_f16[6]; +extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_f16[6]; +extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_f16[6]; +extern float16_t rearranged_twiddle_stride1_4096_f16[2728]; +extern float16_t rearranged_twiddle_stride2_4096_f16[2728]; +extern float16_t rearranged_twiddle_stride3_4096_f16[2728]; +#endif + + +#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */ + +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + + + +#ifdef __cplusplus +} +#endif + +#endif /*_ARM_MVE_TABLES_F16_H*/ + diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_vec_math.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_vec_math.h new file mode 100644 index 000000000..029088f1f --- /dev/null +++ 
b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_vec_math.h @@ -0,0 +1,373 @@ +/****************************************************************************** + * @file arm_vec_math.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ARM_VEC_MATH_H +#define _ARM_VEC_MATH_H + +#include "arm_math_types.h" +#include "arm_common_tables.h" +#include "arm_helium_utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) + +#define INV_NEWTON_INIT_F32 0x7EF127EA + +static const float32_t __logf_rng_f32=0.693147180f; + + +/* fast inverse approximation (3x newton) */ +__STATIC_INLINE f32x4_t vrecip_medprec_f32( + f32x4_t x) +{ + q31x4_t m; + f32x4_t b; + any32x4_t xinv; + f32x4_t ax = vabsq(x); + + xinv.f = ax; + m = 0x3F800000 - (xinv.i & 0x7F800000); + xinv.i = xinv.i + m; + xinv.f = 1.41176471f - 0.47058824f * xinv.f; + xinv.i = xinv.i + m; + + b = 2.0f - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f - xinv.f * ax; + xinv.f = xinv.f * b; + + xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f)); + /* + * restore sign + */ + xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f)); + + return xinv.f; +} + +/* fast inverse approximation (4x newton) */ +__STATIC_INLINE f32x4_t vrecip_hiprec_f32( + f32x4_t x) +{ + q31x4_t m; + f32x4_t b; + any32x4_t xinv; + f32x4_t ax = vabsq(x); + + xinv.f = ax; + + m = 0x3F800000 - (xinv.i & 0x7F800000); + xinv.i = xinv.i + m; + xinv.f = 1.41176471f - 0.47058824f * xinv.f; + xinv.i = xinv.i + m; + + b = 2.0f - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f - xinv.f * ax; + xinv.f = xinv.f * b; + + xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f)); + /* + * restore sign + */ + xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f)); + + return xinv.f; +} + +__STATIC_INLINE f32x4_t vdiv_f32( + f32x4_t num, f32x4_t den) +{ + return vmulq(num, vrecip_hiprec_f32(den)); +} + +/** + @brief Single-precision taylor dev. 
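+
+  (Given the vfmasq/vfmaq pairing below, where vfmasq computes m1 * m2 + scalar,
+  the result is the degree-7 polynomial coeffs[0] + coeffs[4]*x + coeffs[2]*x^2
+  + coeffs[6]*x^3 + coeffs[1]*x^4 + coeffs[5]*x^5 + coeffs[3]*x^6 + coeffs[7]*x^7;
+  the coefficient table is consumed in an interleaved, Estrin-style order.)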
+ @param[in] x f32 quad vector input + @param[in] coeffs f32 quad vector coeffs + @return destination f32 quad vector + */ + +__STATIC_INLINE f32x4_t vtaylor_polyq_f32( + f32x4_t x, + const float32_t * coeffs) +{ + f32x4_t A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]); + f32x4_t B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]); + f32x4_t C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]); + f32x4_t D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]); + f32x4_t x2 = vmulq(x, x); + f32x4_t x4 = vmulq(x2, x2); + f32x4_t res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4); + + return res; +} + +__STATIC_INLINE f32x4_t vmant_exp_f32( + f32x4_t x, + int32x4_t * e) +{ + any32x4_t r; + int32x4_t n; + + r.f = x; + n = r.i >> 23; + n = n - 127; + r.i = r.i - (n << 23); + + *e = n; + return r.f; +} + + +__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn) +{ + q31x4_t vecExpUnBiased; + f32x4_t vecTmpFlt0, vecTmpFlt1; + f32x4_t vecAcc0, vecAcc1, vecAcc2, vecAcc3; + f32x4_t vecExpUnBiasedFlt; + + /* + * extract exponent + */ + vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased); + + vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1; + /* + * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]); + */ + vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]); + vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]); + /* + * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]); + */ + vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]); + vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]); + /* + * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]); + */ + vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]); + vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]); + /* + * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]); + */ + vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]); + vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]); + /* + * a = a + b * xx; + */ + vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0); + /* + * c = c + d * xx; + */ + vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0); + /* + * xx = xx * xx; + */ + vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0; + vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased); + /* + * r.f = a + c * xx; + */ + vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0); + /* + * add exponent + * r.f = r.f + ((float32_t) m) * __logf_rng_f32; + */ + vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32); + // set log0 down to -inf + vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f)); + return vecAcc0; +} + +__STATIC_INLINE f32x4_t vexpq_f32( + f32x4_t x) +{ + // Perform range reduction [-log(2),log(2)] + int32x4_t m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f)); + f32x4_t val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f)); + + // Polynomial Approximation + f32x4_t poly = vtaylor_polyq_f32(val, exp_tab); + + // Reconstruct + poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23))); + + poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126)); + return poly; +} + +__STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb) +{ + f32x4_t r = x; + nb--; + while (nb > 0) { + r = vmulq(r, x); + nb--; + } + return (r); +} + +__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn) +{ + f32x4_t vecSx, vecW, vecTmp; + any32x4_t v; + + vecSx = vabsq(vecIn); + + v.f = vecIn; + v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i); + + vecW = vmulq(vecSx, v.f); + + // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w))))))); + vecTmp = vsubq(vdupq_n_f32(8.0f), vecW); + vecTmp = vfmasq(vecW, vecTmp, -28.0f); + vecTmp = vfmasq(vecW, vecTmp, 56.0f); + vecTmp = vfmasq(vecW, vecTmp, -70.0f); 
+ vecTmp = vfmasq(vecW, vecTmp, 56.0f); + vecTmp = vfmasq(vecW, vecTmp, -28.0f); + vecTmp = vfmasq(vecW, vecTmp, 8.0f); + v.f = vmulq(v.f, vecTmp); + + v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f)); + /* + * restore sign + */ + v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f)); + return v.f; +} + +__STATIC_INLINE f32x4_t vtanhq_f32( + f32x4_t val) +{ + f32x4_t x = + vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f)); + f32x4_t exp2x = vexpq_f32(vmulq_n_f32(x, 2.f)); + f32x4_t num = vsubq_n_f32(exp2x, 1.f); + f32x4_t den = vaddq_n_f32(exp2x, 1.f); + f32x4_t tanh = vmulq_f32(num, vrecip_f32(den)); + return tanh; +} + +__STATIC_INLINE f32x4_t vpowq_f32( + f32x4_t val, + f32x4_t n) +{ + return vexpq_f32(vmulq_f32(n, vlogq_f32(val))); +} + +#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/ + +#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) +#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */ + +#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "NEMath.h" +/** + * @brief Vectorized integer exponentiation + * @param[in] x value + * @param[in] nb integer exponent >= 1 + * @return x^nb + * + */ +__STATIC_INLINE float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb) +{ + float32x4_t r = x; + nb --; + while(nb > 0) + { + r = vmulq_f32(r , x); + nb--; + } + return(r); +} + + +__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t x) +{ + float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN)); + float32x4_t e = vrsqrteq_f32(x1); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); + return vmulq_f32(x, e); +} + +__STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec) +{ + float32x4_t tempF; + int32x4_t tempHI,tempLO; + + tempLO = vmovl_s16(vget_low_s16(vec)); + tempF = vcvtq_n_f32_s32(tempLO,15); + tempF = __arm_vec_sqrt_f32_neon(tempF); + tempLO = vcvtq_n_s32_f32(tempF,15); + + tempHI = vmovl_s16(vget_high_s16(vec)); + tempF = vcvtq_n_f32_s32(tempHI,15); + tempF = __arm_vec_sqrt_f32_neon(tempF); + tempHI = vcvtq_n_s32_f32(tempF,15); + + return(vcombine_s16(vqmovn_s32(tempLO),vqmovn_s32(tempHI))); +} + +__STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec) +{ + float32x4_t temp; + + temp = vcvtq_n_f32_s32(vec,31); + temp = __arm_vec_sqrt_f32_neon(temp); + return(vcvtq_n_s32_f32(temp,31)); +} + +#endif /* (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +#ifdef __cplusplus +} +#endif + + +#endif /* _ARM_VEC_MATH_H */ + +/** + * + * End of file. + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_vec_math_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_vec_math_f16.h new file mode 100644 index 000000000..ef6e20001 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/arm_vec_math_f16.h @@ -0,0 +1,317 @@ +/****************************************************************************** + * @file arm_vec_math_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ARM_VEC_MATH_F16_H +#define _ARM_VEC_MATH_F16_H + +#include "arm_math_types_f16.h" +#include "arm_common_tables_f16.h" +#include "arm_helium_utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + + +static const float16_t __logf_rng_f16=0.693147180f16; + +/* fast inverse approximation (3x newton) */ +__STATIC_INLINE f16x8_t vrecip_medprec_f16( + f16x8_t x) +{ + q15x8_t m; + f16x8_t b; + any16x8_t xinv; + f16x8_t ax = vabsq(x); + + xinv.f = ax; + + m = 0x03c00 - (xinv.i & 0x07c00); + xinv.i = xinv.i + m; + xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f; + xinv.i = xinv.i + m; + + b = 2.0f16 - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f16 - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f16 - xinv.f * ax; + xinv.f = xinv.f * b; + + xinv.f = vdupq_m(xinv.f, F16INFINITY, vcmpeqq(x, 0.0f)); + /* + * restore sign + */ + xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f)); + + return xinv.f; +} + +/* fast inverse approximation (4x newton) */ +__STATIC_INLINE f16x8_t vrecip_hiprec_f16( + f16x8_t x) +{ + q15x8_t m; + f16x8_t b; + any16x8_t xinv; + f16x8_t ax = vabsq(x); + + xinv.f = ax; + + m = 0x03c00 - (xinv.i & 0x07c00); + xinv.i = xinv.i + m; + xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f; + xinv.i = xinv.i + m; + + b = 2.0f16 - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f16 - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f16 - xinv.f * ax; + xinv.f = xinv.f * b; + + b = 2.0f16 - xinv.f * ax; + xinv.f = xinv.f * b; + + xinv.f = vdupq_m(xinv.f, F16INFINITY, vcmpeqq(x, 0.0f)); + /* + * restore sign + */ + xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f)); + + return xinv.f; +} + +__STATIC_INLINE f16x8_t vdiv_f16( + f16x8_t num, f16x8_t den) +{ + return vmulq(num, vrecip_hiprec_f16(den)); +} + + +/** + @brief Single-precision taylor dev. 
+ @param[in] x f16 vector input + @param[in] coeffs f16 vector coeffs + @return destination f16 vector + */ + +__STATIC_INLINE float16x8_t vtaylor_polyq_f16( + float16x8_t x, + const float16_t * coeffs) +{ + float16x8_t A = vfmasq(vdupq_n_f16(coeffs[4]), x, coeffs[0]); + float16x8_t B = vfmasq(vdupq_n_f16(coeffs[6]), x, coeffs[2]); + float16x8_t C = vfmasq(vdupq_n_f16(coeffs[5]), x, coeffs[1]); + float16x8_t D = vfmasq(vdupq_n_f16(coeffs[7]), x, coeffs[3]); + float16x8_t x2 = vmulq(x, x); + float16x8_t x4 = vmulq(x2, x2); + float16x8_t res = vfmaq(vfmaq_f16(A, B, x2), vfmaq_f16(C, D, x2), x4); + + return res; +} + +__STATIC_INLINE float16x8_t vmant_exp_f16( + float16x8_t x, + int16x8_t * e) +{ + any16x8_t r; + int16x8_t n; + + r.f = x; + n = r.i >> 10; + n = n - 15; + r.i = r.i - (n << 10); + + *e = n; + return r.f; +} + + +__STATIC_INLINE float16x8_t vlogq_f16(float16x8_t vecIn) +{ + q15x8_t vecExpUnBiased; + float16x8_t vecTmpFlt0, vecTmpFlt1; + float16x8_t vecAcc0, vecAcc1, vecAcc2, vecAcc3; + float16x8_t vecExpUnBiasedFlt; + + /* + * extract exponent + */ + vecTmpFlt1 = vmant_exp_f16(vecIn, &vecExpUnBiased); + + vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1; + /* + * a = (__logf_lut_f16[4] * r.f) + (__logf_lut_f16[0]); + */ + vecAcc0 = vdupq_n_f16(__logf_lut_f16[0]); + vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f16[4]); + /* + * b = (__logf_lut_f16[6] * r.f) + (__logf_lut_f16[2]); + */ + vecAcc1 = vdupq_n_f16(__logf_lut_f16[2]); + vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f16[6]); + /* + * c = (__logf_lut_f16[5] * r.f) + (__logf_lut_f16[1]); + */ + vecAcc2 = vdupq_n_f16(__logf_lut_f16[1]); + vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f16[5]); + /* + * d = (__logf_lut_f16[7] * r.f) + (__logf_lut_f16[3]); + */ + vecAcc3 = vdupq_n_f16(__logf_lut_f16[3]); + vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f16[7]); + /* + * a = a + b * xx; + */ + vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0); + /* + * c = c + d * xx; + */ + vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0); + /* + * xx = xx * xx; + */ + vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0; + vecExpUnBiasedFlt = vcvtq_f16_s16(vecExpUnBiased); + /* + * r.f = a + c * xx; + */ + vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0); + /* + * add exponent + * r.f = r.f + ((float32_t) m) * __logf_rng_f16; + */ + vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f16); + // set log0 down to -inf + vecAcc0 = vdupq_m(vecAcc0, -F16INFINITY, vcmpeqq(vecIn, 0.0f)); + return vecAcc0; +} + +__STATIC_INLINE float16x8_t vexpq_f16( + float16x8_t x) +{ + // Perform range reduction [-log(2),log(2)] + int16x8_t m = vcvtq_s16_f16(vmulq_n_f16(x, 1.4426950408f16)); + float16x8_t val = vfmsq_f16(x, vcvtq_f16_s16(m), vdupq_n_f16(0.6931471805f16)); + + // Polynomial Approximation + float16x8_t poly = vtaylor_polyq_f16(val, exp_tab_f16); + + // Reconstruct + poly = (float16x8_t) (vqaddq_s16((int16x8_t) (poly), vqshlq_n_s16(m, 10))); + + poly = vdupq_m(poly, 0.0f, vcmpltq_n_s16(m, -14)); + return poly; +} + +__STATIC_INLINE float16x8_t arm_vec_exponent_f16(float16x8_t x, int16_t nb) +{ + float16x8_t r = x; + nb--; + while (nb > 0) { + r = vmulq(r, x); + nb--; + } + return (r); +} + +__STATIC_INLINE f16x8_t vpowq_f16( + f16x8_t val, + f16x8_t n) +{ + return vexpq_f16(vmulq_f16(n, vlogq_f16(val))); +} + +#define INV_NEWTON_INIT_F16 0x7773 + +__STATIC_INLINE f16x8_t vrecip_f16(f16x8_t vecIn) +{ + f16x8_t vecSx, vecW, vecTmp; + any16x8_t v; + + vecSx = vabsq(vecIn); + + v.f = vecIn; + v.i = vsubq(vdupq_n_s16(INV_NEWTON_INIT_F16), v.i); + + vecW = vmulq(vecSx, v.f); + + // 
v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w))))))); + vecTmp = vsubq(vdupq_n_f16(8.0f), vecW); + vecTmp = vfmasq(vecW, vecTmp, -28.0f); + vecTmp = vfmasq(vecW, vecTmp, 56.0f); + vecTmp = vfmasq(vecW, vecTmp, -70.0f); + vecTmp = vfmasq(vecW, vecTmp, 56.0f); + vecTmp = vfmasq(vecW, vecTmp, -28.0f); + vecTmp = vfmasq(vecW, vecTmp, 8.0f); + v.f = vmulq(v.f, vecTmp); + + v.f = vdupq_m(v.f, F16INFINITY, vcmpeqq(vecIn, 0.0f)); + /* + * restore sign + */ + v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f)); + return v.f; +} + +__STATIC_INLINE f16x8_t vtanhq_f16( + f16x8_t val) +{ + f16x8_t x = + vminnmq_f16(vmaxnmq_f16(val, vdupq_n_f16(-10.f)), vdupq_n_f16(10.0f)); + f16x8_t exp2x = vexpq_f16(vmulq_n_f16(x, 2.f)); + f16x8_t num = vsubq_n_f16(exp2x, 1.f); + f16x8_t den = vaddq_n_f16(exp2x, 1.f); + f16x8_t tanh = vmulq_f16(num, vrecip_f16(den)); + return tanh; +} + +#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/ + + + +#ifdef __cplusplus +} +#endif + +#endif /* ARM FLOAT16 SUPPORTED */ + +#endif /* _ARM_VEC_MATH_F16_H */ + +/** + * + * End of file. + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/basic_math_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/basic_math_functions.h new file mode 100644 index 000000000..b3481524e --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/basic_math_functions.h @@ -0,0 +1,764 @@ +/****************************************************************************** + * @file basic_math_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _BASIC_MATH_FUNCTIONS_H_ +#define _BASIC_MATH_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @defgroup groupMath Basic Math Functions + */ + + /** + * @brief Q7 vector multiplication. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_mult_q7( + const q7_t * pSrcA, + const q7_t * pSrcB, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q15 vector multiplication. 
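+   *
+   * Per-sample operation, shown for clarity: pDst[n] = (q15_t) __SSAT(((q31_t) pSrcA[n] * pSrcB[n]) >> 15, 16).
+   * Results outside the representable Q15 range are saturated.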
+ * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_mult_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q31 vector multiplication. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_mult_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Floating-point vector multiplication. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_mult_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + float32_t * pDst, + uint32_t blockSize); + + + + /** + * @brief Floating-point vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_add_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + float32_t * pDst, + uint32_t blockSize); + + + + /** + * @brief Q7 vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_add_q7( + const q7_t * pSrcA, + const q7_t * pSrcB, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q15 vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_add_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q31 vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_add_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Floating-point vector subtraction. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_sub_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + float32_t * pDst, + uint32_t blockSize); + + + + /** + * @brief Q7 vector subtraction. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_sub_q7( + const q7_t * pSrcA, + const q7_t * pSrcB, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q15 vector subtraction. 
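+   *
+   * Per-sample operation, shown for clarity: pDst[n] = (q15_t) __SSAT((q31_t) pSrcA[n] - pSrcB[n], 16);
+   * the subtraction saturates instead of wrapping.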
+ * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_sub_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q31 vector subtraction. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_sub_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Multiplies a floating-point vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scale scale factor to be applied + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_scale_f32( + const float32_t * pSrc, + float32_t scale, + float32_t * pDst, + uint32_t blockSize); + + + + /** + * @brief Multiplies a Q7 vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scaleFract fractional portion of the scale value + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_scale_q7( + const q7_t * pSrc, + q7_t scaleFract, + int8_t shift, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Multiplies a Q15 vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scaleFract fractional portion of the scale value + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_scale_q15( + const q15_t * pSrc, + q15_t scaleFract, + int8_t shift, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Multiplies a Q31 vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scaleFract fractional portion of the scale value + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_scale_q31( + const q31_t * pSrc, + q31_t scaleFract, + int8_t shift, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q7 vector absolute value. + * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void arm_abs_q7( + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Floating-point vector absolute value. + * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void arm_abs_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + + + /** + * @brief Q15 vector absolute value. + * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void arm_abs_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q31 vector absolute value. 
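+   *
+   * The Q31 value -1.0 (0x80000000) has no positive counterpart and is
+   * saturated to the largest positive value 0x7FFFFFFF.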
+ * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void arm_abs_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Dot product of floating-point vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void arm_dot_prod_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + uint32_t blockSize, + float32_t * result); + + + + /** + * @brief Dot product of Q7 vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void arm_dot_prod_q7( + const q7_t * pSrcA, + const q7_t * pSrcB, + uint32_t blockSize, + q31_t * result); + + + /** + * @brief Dot product of Q15 vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void arm_dot_prod_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + uint32_t blockSize, + q63_t * result); + + + /** + * @brief Dot product of Q31 vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void arm_dot_prod_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + uint32_t blockSize, + q63_t * result); + + + /** + * @brief Shifts the elements of a Q7 vector a specified number of bits. + * @param[in] pSrc points to the input vector + * @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right. + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_shift_q7( + const q7_t * pSrc, + int8_t shiftBits, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Shifts the elements of a Q15 vector a specified number of bits. + * @param[in] pSrc points to the input vector + * @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right. + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_shift_q15( + const q15_t * pSrc, + int8_t shiftBits, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Shifts the elements of a Q31 vector a specified number of bits. + * @param[in] pSrc points to the input vector + * @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right. + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_shift_q31( + const q31_t * pSrc, + int8_t shiftBits, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Adds a constant offset to a floating-point vector. 
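+   *
+   * Per-sample operation: pDst[n] = pSrc[n] + offset, for n = 0 .. blockSize - 1.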
+ * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_offset_f32( + const float32_t * pSrc, + float32_t offset, + float32_t * pDst, + uint32_t blockSize); + + + + /** + * @brief Adds a constant offset to a Q7 vector. + * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_offset_q7( + const q7_t * pSrc, + q7_t offset, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Adds a constant offset to a Q15 vector. + * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_offset_q15( + const q15_t * pSrc, + q15_t offset, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Adds a constant offset to a Q31 vector. + * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_offset_q31( + const q31_t * pSrc, + q31_t offset, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Negates the elements of a floating-point vector. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_negate_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Negates the elements of a Q7 vector. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_negate_q7( + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Negates the elements of a Q15 vector. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_negate_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Negates the elements of a Q31 vector. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_negate_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + +/** + * @brief Compute the logical bitwise AND of two fixed-point vectors. + * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_and_u16( + const uint16_t * pSrcA, + const uint16_t * pSrcB, + uint16_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise AND of two fixed-point vectors. + * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_and_u32( + const uint32_t * pSrcA, + const uint32_t * pSrcB, + uint32_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise AND of two fixed-point vectors. 
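+   *
+   * The samples are combined as raw unsigned integers; "fixed-point" here refers
+   * only to how the data is stored, not to any scaling of the result.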
+ * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_and_u8( + const uint8_t * pSrcA, + const uint8_t * pSrcB, + uint8_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise OR of two fixed-point vectors. + * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_or_u16( + const uint16_t * pSrcA, + const uint16_t * pSrcB, + uint16_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise OR of two fixed-point vectors. + * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_or_u32( + const uint32_t * pSrcA, + const uint32_t * pSrcB, + uint32_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise OR of two fixed-point vectors. + * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_or_u8( + const uint8_t * pSrcA, + const uint8_t * pSrcB, + uint8_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise NOT of a fixed-point vector. + * @param[in] pSrc points to input vector + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_not_u16( + const uint16_t * pSrc, + uint16_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise NOT of a fixed-point vector. + * @param[in] pSrc points to input vector + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_not_u32( + const uint32_t * pSrc, + uint32_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise NOT of a fixed-point vector. + * @param[in] pSrc points to input vector + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_not_u8( + const uint8_t * pSrc, + uint8_t * pDst, + uint32_t blockSize); + +/** + * @brief Compute the logical bitwise XOR of two fixed-point vectors. + * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_xor_u16( + const uint16_t * pSrcA, + const uint16_t * pSrcB, + uint16_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise XOR of two fixed-point vectors. + * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_xor_u32( + const uint32_t * pSrcA, + const uint32_t * pSrcB, + uint32_t * pDst, + uint32_t blockSize); + + /** + * @brief Compute the logical bitwise XOR of two fixed-point vectors. 
+ * @param[in] pSrcA points to input vector A + * @param[in] pSrcB points to input vector B + * @param[out] pDst points to output vector + * @param[in] blockSize number of samples in each vector + * @return none + */ + void arm_xor_u8( + const uint8_t * pSrcA, + const uint8_t * pSrcB, + uint8_t * pDst, + uint32_t blockSize); + + /** + @brief Elementwise floating-point clipping + @param[in] pSrc points to input values + @param[out] pDst points to output clipped values + @param[in] low lower bound + @param[in] high higher bound + @param[in] numSamples number of samples to clip + @return none + */ + +void arm_clip_f32(const float32_t * pSrc, + float32_t * pDst, + float32_t low, + float32_t high, + uint32_t numSamples); + + /** + @brief Elementwise fixed-point clipping + @param[in] pSrc points to input values + @param[out] pDst points to output clipped values + @param[in] low lower bound + @param[in] high higher bound + @param[in] numSamples number of samples to clip + @return none + */ + +void arm_clip_q31(const q31_t * pSrc, + q31_t * pDst, + q31_t low, + q31_t high, + uint32_t numSamples); + + /** + @brief Elementwise fixed-point clipping + @param[in] pSrc points to input values + @param[out] pDst points to output clipped values + @param[in] low lower bound + @param[in] high higher bound + @param[in] numSamples number of samples to clip + @return none + */ + +void arm_clip_q15(const q15_t * pSrc, + q15_t * pDst, + q15_t low, + q15_t high, + uint32_t numSamples); + + /** + @brief Elementwise fixed-point clipping + @param[in] pSrc points to input values + @param[out] pDst points to output clipped values + @param[in] low lower bound + @param[in] high higher bound + @param[in] numSamples number of samples to clip + @return none + */ + +void arm_clip_q7(const q7_t * pSrc, + q7_t * pDst, + q7_t low, + q7_t high, + uint32_t numSamples); + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _BASIC_MATH_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/basic_math_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/basic_math_functions_f16.h new file mode 100644 index 000000000..1e4acb275 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/basic_math_functions_f16.h @@ -0,0 +1,168 @@ +/****************************************************************************** + * @file basic_math_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#ifndef _BASIC_MATH_FUNCTIONS_F16_H_ +#define _BASIC_MATH_FUNCTIONS_F16_H_ + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + + +#if defined(ARM_FLOAT16_SUPPORTED) + + + /** + * @brief Floating-point vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_add_f16( + const float16_t * pSrcA, + const float16_t * pSrcB, + float16_t * pDst, + uint32_t blockSize); + + /** + * @brief Floating-point vector subtraction. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_sub_f16( + const float16_t * pSrcA, + const float16_t * pSrcB, + float16_t * pDst, + uint32_t blockSize); + + /** + * @brief Multiplies a floating-point vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scale scale factor to be applied + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_scale_f16( + const float16_t * pSrc, + float16_t scale, + float16_t * pDst, + uint32_t blockSize); + + /** + * @brief Floating-point vector absolute value. + * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void arm_abs_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + + + /** + * @brief Adds a constant offset to a floating-point vector. + * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_offset_f16( + const float16_t * pSrc, + float16_t offset, + float16_t * pDst, + uint32_t blockSize); + + /** + * @brief Dot product of floating-point vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void arm_dot_prod_f16( + const float16_t * pSrcA, + const float16_t * pSrcB, + uint32_t blockSize, + float16_t * result); + + /** + * @brief Floating-point vector multiplication. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void arm_mult_f16( + const float16_t * pSrcA, + const float16_t * pSrcB, + float16_t * pDst, + uint32_t blockSize); + + /** + * @brief Negates the elements of a floating-point vector. 
+ * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void arm_negate_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + + /** + @brief Elementwise floating-point clipping + @param[in] pSrc points to input values + @param[out] pDst points to output clipped values + @param[in] low lower bound + @param[in] high higher bound + @param[in] numSamples number of samples to clip + @return none + */ + +void arm_clip_f16(const float16_t * pSrc, + float16_t * pDst, + float16_t low, + float16_t high, + uint32_t numSamples); + +#endif /* defined(ARM_FLOAT16_SUPPORTED)*/ + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _BASIC_MATH_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/bayes_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/bayes_functions.h new file mode 100644 index 000000000..beca38ec6 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/bayes_functions.h @@ -0,0 +1,89 @@ +/****************************************************************************** + * @file bayes_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _BAYES_FUNCTIONS_H_ +#define _BAYES_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#include "dsp/statistics_functions.h" + +/** + * @defgroup groupBayes Bayesian estimators + * + * Implement the naive gaussian Bayes estimator. + * The training must be done from scikit-learn. + * + * The parameters can be easily + * generated from the scikit-learn object. Some examples are given in + * DSP/Testing/PatternGeneration/Bayes.py + */ + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @brief Instance structure for Naive Gaussian Bayesian estimator. + */ +typedef struct +{ + uint32_t vectorDimension; /**< Dimension of vector space */ + uint32_t numberOfClasses; /**< Number of different classes */ + const float32_t *theta; /**< Mean values for the Gaussians */ + const float32_t *sigma; /**< Variances for the Gaussians */ + const float32_t *classPriors; /**< Class prior probabilities */ + float32_t epsilon; /**< Additive value to variances */ +} arm_gaussian_naive_bayes_instance_f32; + +/** + * @brief Naive Gaussian Bayesian Estimator + * + * @param[in] S points to a naive bayes instance structure + * @param[in] in points to the elements of the input vector. 
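+   *                 (the buffer is expected to hold vectorDimension samples)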
+ * @param[out] *pOutputProbabilities points to a buffer of length numberOfClasses containing estimated probabilities + * @param[out] *pBufferB points to a temporary buffer of length numberOfClasses + * @return The predicted class + * + */ + + +uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S, + const float32_t * in, + float32_t *pOutputProbabilities, + float32_t *pBufferB); + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _BAYES_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/bayes_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/bayes_functions_f16.h new file mode 100644 index 000000000..f2c9ad82e --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/bayes_functions_f16.h @@ -0,0 +1,80 @@ +/****************************************************************************** + * @file bayes_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _BAYES_FUNCTIONS_F16_H_ +#define _BAYES_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#include "dsp/statistics_functions_f16.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + +/** + * @brief Instance structure for Naive Gaussian Bayesian estimator. + */ +typedef struct +{ + uint32_t vectorDimension; /**< Dimension of vector space */ + uint32_t numberOfClasses; /**< Number of different classes */ + const float16_t *theta; /**< Mean values for the Gaussians */ + const float16_t *sigma; /**< Variances for the Gaussians */ + const float16_t *classPriors; /**< Class prior probabilities */ + float16_t epsilon; /**< Additive value to variances */ +} arm_gaussian_naive_bayes_instance_f16; + +/** + * @brief Naive Gaussian Bayesian Estimator + * + * @param[in] S points to a naive bayes instance structure + * @param[in] in points to the elements of the input vector. 
+ * @param[out] *pOutputProbabilities points to a buffer of length numberOfClasses containing estimated probabilities + * @param[out] *pBufferB points to a temporary buffer of length numberOfClasses + * @return The predicted class + * + */ + + +uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_instance_f16 *S, + const float16_t * in, + float16_t *pOutputProbabilities, + float16_t *pBufferB); + +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _BAYES_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/complex_math_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/complex_math_functions.h new file mode 100644 index 000000000..5bf3e1703 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/complex_math_functions.h @@ -0,0 +1,295 @@ +/****************************************************************************** + * @file complex_math_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _COMPLEX_MATH_FUNCTIONS_H_ +#define _COMPLEX_MATH_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" +#include "dsp/fast_math_functions.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @defgroup groupCmplxMath Complex Math Functions + * This set of functions operates on complex data vectors. + * The data in the complex arrays is stored in an interleaved fashion + * (real, imag, real, imag, ...). + * In the API functions, the number of samples in a complex array refers + * to the number of complex values; the array contains twice this number of + * real values. + */ + + /** + * @brief Floating-point complex conjugate. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void arm_cmplx_conj_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t numSamples); + + /** + * @brief Q31 complex conjugate. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void arm_cmplx_conj_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q15 complex conjugate. 
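+   *
+   * The imaginary parts are negated; a Q15 imaginary part of -1.0 (0x8000)
+   * saturates to 0x7FFF when negated.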
+ * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void arm_cmplx_conj_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t numSamples); + + + /** + * @brief Floating-point complex magnitude squared + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void arm_cmplx_mag_squared_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q31 complex magnitude squared + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void arm_cmplx_mag_squared_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q15 complex magnitude squared + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void arm_cmplx_mag_squared_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t numSamples); + + +/** + * @brief Floating-point complex magnitude + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void arm_cmplx_mag_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q31 complex magnitude + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void arm_cmplx_mag_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q15 complex magnitude + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void arm_cmplx_mag_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q15 complex dot product + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] numSamples number of complex samples in each vector + * @param[out] realResult real part of the result returned here + * @param[out] imagResult imaginary part of the result returned here + */ + void arm_cmplx_dot_prod_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + uint32_t numSamples, + q31_t * realResult, + q31_t * imagResult); + + + /** + * @brief Q31 complex dot product + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] numSamples number of complex samples in each vector + * @param[out] realResult real part of the result returned here + * @param[out] imagResult imaginary part of the result returned here + */ + void arm_cmplx_dot_prod_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + uint32_t numSamples, + q63_t * realResult, + q63_t * imagResult); + + + /** + * @brief Floating-point complex dot product + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] numSamples number of complex samples in each vector + * @param[out] realResult real part of the result returned here + 
* @param[out] imagResult imaginary part of the result returned here + */ + void arm_cmplx_dot_prod_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + uint32_t numSamples, + float32_t * realResult, + float32_t * imagResult); + + + /** + * @brief Q15 complex-by-real multiplication + * @param[in] pSrcCmplx points to the complex input vector + * @param[in] pSrcReal points to the real input vector + * @param[out] pCmplxDst points to the complex output vector + * @param[in] numSamples number of samples in each vector + */ + void arm_cmplx_mult_real_q15( + const q15_t * pSrcCmplx, + const q15_t * pSrcReal, + q15_t * pCmplxDst, + uint32_t numSamples); + + + /** + * @brief Q31 complex-by-real multiplication + * @param[in] pSrcCmplx points to the complex input vector + * @param[in] pSrcReal points to the real input vector + * @param[out] pCmplxDst points to the complex output vector + * @param[in] numSamples number of samples in each vector + */ + void arm_cmplx_mult_real_q31( + const q31_t * pSrcCmplx, + const q31_t * pSrcReal, + q31_t * pCmplxDst, + uint32_t numSamples); + + + /** + * @brief Floating-point complex-by-real multiplication + * @param[in] pSrcCmplx points to the complex input vector + * @param[in] pSrcReal points to the real input vector + * @param[out] pCmplxDst points to the complex output vector + * @param[in] numSamples number of samples in each vector + */ + void arm_cmplx_mult_real_f32( + const float32_t * pSrcCmplx, + const float32_t * pSrcReal, + float32_t * pCmplxDst, + uint32_t numSamples); + + /** + * @brief Q15 complex-by-complex multiplication + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void arm_cmplx_mult_cmplx_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + q15_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q31 complex-by-complex multiplication + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void arm_cmplx_mult_cmplx_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + q31_t * pDst, + uint32_t numSamples); + + + /** + * @brief Floating-point complex-by-complex multiplication + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void arm_cmplx_mult_cmplx_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + float32_t * pDst, + uint32_t numSamples); + + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _COMPLEX_MATH_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/complex_math_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/complex_math_functions_f16.h new file mode 100644 index 000000000..da78559bf --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/complex_math_functions_f16.h @@ -0,0 +1,123 @@ +/****************************************************************************** + * @file complex_math_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + 
******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _COMPLEX_MATH_FUNCTIONS_F16_H_ +#define _COMPLEX_MATH_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" +#include "dsp/fast_math_functions_f16.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + + /** + * @brief Floating-point complex conjugate. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void arm_cmplx_conj_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t numSamples); + + /** + * @brief Floating-point complex magnitude squared + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void arm_cmplx_mag_squared_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t numSamples); + + /** + * @brief Floating-point complex magnitude + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void arm_cmplx_mag_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t numSamples); + + /** + * @brief Floating-point complex dot product + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] numSamples number of complex samples in each vector + * @param[out] realResult real part of the result returned here + * @param[out] imagResult imaginary part of the result returned here + */ + void arm_cmplx_dot_prod_f16( + const float16_t * pSrcA, + const float16_t * pSrcB, + uint32_t numSamples, + float16_t * realResult, + float16_t * imagResult); + + /** + * @brief Floating-point complex-by-real multiplication + * @param[in] pSrcCmplx points to the complex input vector + * @param[in] pSrcReal points to the real input vector + * @param[out] pCmplxDst points to the complex output vector + * @param[in] numSamples number of samples in each vector + */ + void arm_cmplx_mult_real_f16( + const float16_t * pSrcCmplx, + const float16_t * pSrcReal, + float16_t * pCmplxDst, + uint32_t numSamples); + + /** + * @brief Floating-point complex-by-complex multiplication + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void arm_cmplx_mult_cmplx_f16( + const float16_t * pSrcA, + const float16_t * pSrcB, + float16_t * pDst, + uint32_t numSamples); + +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef 
__cplusplus +} +#endif + +#endif /* ifndef _COMPLEX_MATH_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/controller_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/controller_functions.h new file mode 100644 index 000000000..53823dbe9 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/controller_functions.h @@ -0,0 +1,791 @@ +/****************************************************************************** + * @file controller_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _CONTROLLER_FUNCTIONS_H_ +#define _CONTROLLER_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * @brief Macros required for SINE and COSINE Controller functions + */ + +#define CONTROLLER_Q31_SHIFT (32 - 9) + /* 1.31(q31) Fixed value of 2/360 */ + /* -1 to +1 is divided into 360 values so total spacing is (2/360) */ +#define INPUT_SPACING 0xB60B61 + +/** + * @defgroup groupController Controller Functions + */ + + + /** + * @ingroup groupController + */ + + /** + * @addtogroup SinCos + * @{ + */ + +/** + * @brief Floating-point sin_cos function. + * @param[in] theta input value in degrees + * @param[out] pSinVal points to the processed sine output. + * @param[out] pCosVal points to the processed cos output. + */ + void arm_sin_cos_f32( + float32_t theta, + float32_t * pSinVal, + float32_t * pCosVal); + + + /** + * @brief Q31 sin_cos function. + * @param[in] theta scaled input value in degrees + * @param[out] pSinVal points to the processed sine output. + * @param[out] pCosVal points to the processed cosine output. + */ + void arm_sin_cos_q31( + q31_t theta, + q31_t * pSinVal, + q31_t * pCosVal); + + /** + * @} end of SinCos group + */ + + /** + * @ingroup groupController + */ + +/** + * @defgroup PID PID Motor Control + * + * A Proportional Integral Derivative (PID) controller is a generic feedback control + * loop mechanism widely used in industrial control systems. + * A PID controller is the most commonly used type of feedback controller. + * + * This set of functions implements (PID) controllers + * for Q15, Q31, and floating-point data types. The functions operate on a single sample + * of data and each call to the function returns a single processed value. + * S points to an instance of the PID control data structure. in + * is the input sample value. The functions return the output value. + * + * \par Algorithm: + *
+   * <pre>
+   *    y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]
+   *    A0 = Kp + Ki + Kd
+   *    A1 = (-Kp ) - (2 * Kd )
+   *    A2 = Kd
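+   *    (difference form of  y[n] = Kp * x[n] + Ki * Sum(x[0..n]) + Kd * (x[n] - x[n-1]),
+   *     so A0, A1 and A2 fold the three gains into one recurrence)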
+   * </pre>
+ * + * \par + * where \c Kp is proportional constant, \c Ki is Integral constant and \c Kd is Derivative constant + * + * \par + * \image html PID.gif "Proportional Integral Derivative Controller" + * + * \par + * The PID controller calculates an "error" value as the difference between + * the measured output and the reference input. + * The controller attempts to minimize the error by adjusting the process control inputs. + * The proportional value determines the reaction to the current error, + * the integral value determines the reaction based on the sum of recent errors, + * and the derivative value determines the reaction based on the rate at which the error has been changing. + * + * \par Instance Structure + * The Gains A0, A1, A2 and state variables for a PID controller are stored together in an instance data structure. + * A separate instance structure must be defined for each PID Controller. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Reset Functions + * There is also an associated reset function for each data type which clears the state array. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Initializes the Gains A0, A1, A2 from Kp,Ki, Kd gains. + * - Zeros out the values in the state buffer. + * + * \par + * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function. + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the PID Controller functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + + + /** + * @brief Instance structure for the Q15 PID Control. + */ + typedef struct + { + q15_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */ +#if !defined (ARM_MATH_DSP) + q15_t A1; /**< The derived gain A1 = -Kp - 2Kd */ + q15_t A2; /**< The derived gain A1 = Kd. */ +#else + q31_t A1; /**< The derived gain A1 = -Kp - 2Kd | Kd.*/ +#endif + q15_t state[3]; /**< The state array of length 3. */ + q15_t Kp; /**< The proportional gain. */ + q15_t Ki; /**< The integral gain. */ + q15_t Kd; /**< The derivative gain. */ + } arm_pid_instance_q15; + + /** + * @brief Instance structure for the Q31 PID Control. + */ + typedef struct + { + q31_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */ + q31_t A1; /**< The derived gain, A1 = -Kp - 2Kd. */ + q31_t A2; /**< The derived gain, A2 = Kd . */ + q31_t state[3]; /**< The state array of length 3. */ + q31_t Kp; /**< The proportional gain. */ + q31_t Ki; /**< The integral gain. */ + q31_t Kd; /**< The derivative gain. */ + } arm_pid_instance_q31; + + /** + * @brief Instance structure for the floating-point PID Control. + */ + typedef struct + { + float32_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */ + float32_t A1; /**< The derived gain, A1 = -Kp - 2Kd. */ + float32_t A2; /**< The derived gain, A2 = Kd . */ + float32_t state[3]; /**< The state array of length 3. */ + float32_t Kp; /**< The proportional gain. */ + float32_t Ki; /**< The integral gain. */ + float32_t Kd; /**< The derivative gain. */ + } arm_pid_instance_f32; + + + + /** + * @brief Initialization function for the floating-point PID Control. + * @param[in,out] S points to an instance of the PID structure. 
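+   *                    Kp, Ki and Kd are expected to be set in the instance
+   *                    before this call; the derived gains A0, A1 and A2 are
+   *                    computed from them.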
+ * @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state. + */ + void arm_pid_init_f32( + arm_pid_instance_f32 * S, + int32_t resetStateFlag); + + + /** + * @brief Reset function for the floating-point PID Control. + * @param[in,out] S is an instance of the floating-point PID Control structure + */ + void arm_pid_reset_f32( + arm_pid_instance_f32 * S); + + + /** + * @brief Initialization function for the Q31 PID Control. + * @param[in,out] S points to an instance of the Q15 PID structure. + * @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state. + */ + void arm_pid_init_q31( + arm_pid_instance_q31 * S, + int32_t resetStateFlag); + + + /** + * @brief Reset function for the Q31 PID Control. + * @param[in,out] S points to an instance of the Q31 PID Control structure + */ + + void arm_pid_reset_q31( + arm_pid_instance_q31 * S); + + + /** + * @brief Initialization function for the Q15 PID Control. + * @param[in,out] S points to an instance of the Q15 PID structure. + * @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state. + */ + void arm_pid_init_q15( + arm_pid_instance_q15 * S, + int32_t resetStateFlag); + + + /** + * @brief Reset function for the Q15 PID Control. + * @param[in,out] S points to an instance of the q15 PID Control structure + */ + void arm_pid_reset_q15( + arm_pid_instance_q15 * S); + + + + /** + * @addtogroup PID + * @{ + */ + + /** + * @brief Process function for the floating-point PID Control. + * @param[in,out] S is an instance of the floating-point PID Control structure + * @param[in] in input sample to process + * @return processed output sample. + */ + __STATIC_FORCEINLINE float32_t arm_pid_f32( + arm_pid_instance_f32 * S, + float32_t in) + { + float32_t out; + + /* y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2] */ + out = (S->A0 * in) + + (S->A1 * S->state[0]) + (S->A2 * S->state[1]) + (S->state[2]); + + /* Update state */ + S->state[1] = S->state[0]; + S->state[0] = in; + S->state[2] = out; + + /* return to application */ + return (out); + + } + +/** + @brief Process function for the Q31 PID Control. + @param[in,out] S points to an instance of the Q31 PID Control structure + @param[in] in input sample to process + @return processed output sample. + + \par Scaling and Overflow Behavior + The function is implemented using an internal 64-bit accumulator. + The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + Thus, if the accumulator result overflows it wraps around rather than clip. + In order to avoid overflows completely the input signal must be scaled down by 2 bits as there are four additions. + After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format. + */ +__STATIC_FORCEINLINE q31_t arm_pid_q31( + arm_pid_instance_q31 * S, + q31_t in) + { + q63_t acc; + q31_t out; + + /* acc = A0 * x[n] */ + acc = (q63_t) S->A0 * in; + + /* acc += A1 * x[n-1] */ + acc += (q63_t) S->A1 * S->state[0]; + + /* acc += A2 * x[n-2] */ + acc += (q63_t) S->A2 * S->state[1]; + + /* convert output to 1.31 format to add y[n-1] */ + out = (q31_t) (acc >> 31U); + + /* out += y[n-1] */ + out += S->state[2]; + + /* Update state */ + S->state[1] = S->state[0]; + S->state[0] = in; + S->state[2] = out; + + /* return to application */ + return (out); + } + + +/** + @brief Process function for the Q15 PID Control. 
+
+/**
+  @brief         Process function for the Q15 PID Control.
+  @param[in,out] S   points to an instance of the Q15 PID Control structure
+  @param[in]     in  input sample to process
+  @return        processed output sample.
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using a 64-bit internal accumulator.
+         Both gains and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+         The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+         There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+         After all additions have been performed, the accumulator is truncated to 34.15 format by discarding the low 15 bits.
+         Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+__STATIC_FORCEINLINE q15_t arm_pid_q15(
+  arm_pid_instance_q15 * S,
+  q15_t in)
+  {
+    q63_t acc;
+    q15_t out;
+
+#if defined (ARM_MATH_DSP)
+    /* Implementation of PID controller */
+
+    /* acc = A0 * x[n] */
+    acc = (q31_t) __SMUAD((uint32_t)S->A0, (uint32_t)in);
+
+    /* acc += A1 * x[n-1] + A2 * x[n-2] */
+    acc = (q63_t)__SMLALD((uint32_t)S->A1, (uint32_t)read_q15x2 (S->state), (uint64_t)acc);
+#else
+    /* acc = A0 * x[n] */
+    acc = ((q31_t) S->A0) * in;
+
+    /* acc += A1 * x[n-1] + A2 * x[n-2] */
+    acc += (q31_t) S->A1 * S->state[0];
+    acc += (q31_t) S->A2 * S->state[1];
+#endif
+
+    /* acc += y[n-1] */
+    acc += (q31_t) S->state[2] << 15;
+
+    /* saturate the output */
+    out = (q15_t) (__SSAT((q31_t)(acc >> 15), 16));
+
+    /* Update state */
+    S->state[1] = S->state[0];
+    S->state[0] = in;
+    S->state[2] = out;
+
+    /* return to application */
+    return (out);
+  }
+
+  /**
+   * @} end of PID group
+   */
+
+  /**
+   * @ingroup groupController
+   */
+
+ /**
+ * @defgroup park Vector Park Transform
+ *
+ * Forward Park transform converts the input two-coordinate vector to flux and torque components.
+ * The Park transform can be used to realize the transformation of the Ialpha and the Ibeta currents
+ * from the stationary to the moving reference frame and control the spatial relationship between
+ * the stator vector current and the rotor flux vector.
+ * If we consider the d axis aligned with the rotor flux, the diagram below shows the
+ * current vector and the relationship between the two reference frames:
+ * \image html park.gif "Stator current space vector and its components in (a,b) and in the d,q rotating reference frame"
+ *
+ * The function operates on a single sample of data and each call to the function returns the processed output.
+ * The library provides separate functions for Q31 and floating-point data types.
+ * \par Algorithm
+ * \image html parkFormula.gif
+ * where Ialpha and Ibeta are the stator vector components,
+ * pId and pIq are the rotor vector components and cosVal and sinVal are the
+ * cosine and sine values of theta (rotor flux position).
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the Q31 version of the Park transform.
+ * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
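+ *
+ * \par Example
+ * A minimal floating-point sketch (illustrative only; iAlpha, iBeta and the
+ * sine and cosine of the rotor angle are assumed to be available, e.g. from
+ * current measurements and arm_sin_cos_f32):
+ * <pre>
+ *     float32_t Id, Iq;
+ *
+ *     arm_park_f32(iAlpha, iBeta, &Id, &Iq, sinVal, cosVal);
+ * </pre>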
+ */ + + /** + * @addtogroup park + * @{ + */ + + /** + * @brief Floating-point Park transform + * @param[in] Ialpha input two-phase vector coordinate alpha + * @param[in] Ibeta input two-phase vector coordinate beta + * @param[out] pId points to output rotor reference frame d + * @param[out] pIq points to output rotor reference frame q + * @param[in] sinVal sine value of rotation angle theta + * @param[in] cosVal cosine value of rotation angle theta + * @return none + * + * The function implements the forward Park transform. + * + */ + __STATIC_FORCEINLINE void arm_park_f32( + float32_t Ialpha, + float32_t Ibeta, + float32_t * pId, + float32_t * pIq, + float32_t sinVal, + float32_t cosVal) + { + /* Calculate pId using the equation, pId = Ialpha * cosVal + Ibeta * sinVal */ + *pId = Ialpha * cosVal + Ibeta * sinVal; + + /* Calculate pIq using the equation, pIq = - Ialpha * sinVal + Ibeta * cosVal */ + *pIq = -Ialpha * sinVal + Ibeta * cosVal; + } + + +/** + @brief Park transform for Q31 version + @param[in] Ialpha input two-phase vector coordinate alpha + @param[in] Ibeta input two-phase vector coordinate beta + @param[out] pId points to output rotor reference frame d + @param[out] pIq points to output rotor reference frame q + @param[in] sinVal sine value of rotation angle theta + @param[in] cosVal cosine value of rotation angle theta + @return none + + \par Scaling and Overflow Behavior + The function is implemented using an internal 32-bit accumulator. + The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format. + There is saturation on the addition and subtraction, hence there is no risk of overflow. + */ +__STATIC_FORCEINLINE void arm_park_q31( + q31_t Ialpha, + q31_t Ibeta, + q31_t * pId, + q31_t * pIq, + q31_t sinVal, + q31_t cosVal) + { + q31_t product1, product2; /* Temporary variables used to store intermediate results */ + q31_t product3, product4; /* Temporary variables used to store intermediate results */ + + /* Intermediate product is calculated by (Ialpha * cosVal) */ + product1 = (q31_t) (((q63_t) (Ialpha) * (cosVal)) >> 31); + + /* Intermediate product is calculated by (Ibeta * sinVal) */ + product2 = (q31_t) (((q63_t) (Ibeta) * (sinVal)) >> 31); + + + /* Intermediate product is calculated by (Ialpha * sinVal) */ + product3 = (q31_t) (((q63_t) (Ialpha) * (sinVal)) >> 31); + + /* Intermediate product is calculated by (Ibeta * cosVal) */ + product4 = (q31_t) (((q63_t) (Ibeta) * (cosVal)) >> 31); + + /* Calculate pId by adding the two intermediate products 1 and 2 */ + *pId = __QADD(product1, product2); + + /* Calculate pIq by subtracting the two intermediate products 3 from 4 */ + *pIq = __QSUB(product4, product3); + } + + /** + * @} end of park group + */ + + + /** + * @ingroup groupController + */ + + /** + * @defgroup inv_park Vector Inverse Park transform + * Inverse Park transform converts the input flux and torque components to two-coordinate vector. + * + * The function operates on a single sample of data and each call to the function returns the processed output. + * The library provides separate functions for Q31 and floating-point data types. + * \par Algorithm + * \image html parkInvFormula.gif + * where pIalpha and pIbeta are the stator vector components, + * Id and Iq are rotor vector components and cosVal and sinVal are the + * cosine and sine values of theta (rotor flux position). + * \par Fixed-Point Behavior + * Care must be taken when using the Q31 version of the Park transform. 
+ * In particular, the overflow and saturation behavior of the accumulator used must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + + /** + * @addtogroup inv_park + * @{ + */ + + /** + * @brief Floating-point Inverse Park transform + * @param[in] Id input coordinate of rotor reference frame d + * @param[in] Iq input coordinate of rotor reference frame q + * @param[out] pIalpha points to output two-phase orthogonal vector axis alpha + * @param[out] pIbeta points to output two-phase orthogonal vector axis beta + * @param[in] sinVal sine value of rotation angle theta + * @param[in] cosVal cosine value of rotation angle theta + * @return none + */ + __STATIC_FORCEINLINE void arm_inv_park_f32( + float32_t Id, + float32_t Iq, + float32_t * pIalpha, + float32_t * pIbeta, + float32_t sinVal, + float32_t cosVal) + { + /* Calculate pIalpha using the equation, pIalpha = Id * cosVal - Iq * sinVal */ + *pIalpha = Id * cosVal - Iq * sinVal; + + /* Calculate pIbeta using the equation, pIbeta = Id * sinVal + Iq * cosVal */ + *pIbeta = Id * sinVal + Iq * cosVal; + } + + +/** + @brief Inverse Park transform for Q31 version + @param[in] Id input coordinate of rotor reference frame d + @param[in] Iq input coordinate of rotor reference frame q + @param[out] pIalpha points to output two-phase orthogonal vector axis alpha + @param[out] pIbeta points to output two-phase orthogonal vector axis beta + @param[in] sinVal sine value of rotation angle theta + @param[in] cosVal cosine value of rotation angle theta + @return none + + @par Scaling and Overflow Behavior + The function is implemented using an internal 32-bit accumulator. + The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format. + There is saturation on the addition, hence there is no risk of overflow. + */ +__STATIC_FORCEINLINE void arm_inv_park_q31( + q31_t Id, + q31_t Iq, + q31_t * pIalpha, + q31_t * pIbeta, + q31_t sinVal, + q31_t cosVal) + { + q31_t product1, product2; /* Temporary variables used to store intermediate results */ + q31_t product3, product4; /* Temporary variables used to store intermediate results */ + + /* Intermediate product is calculated by (Id * cosVal) */ + product1 = (q31_t) (((q63_t) (Id) * (cosVal)) >> 31); + + /* Intermediate product is calculated by (Iq * sinVal) */ + product2 = (q31_t) (((q63_t) (Iq) * (sinVal)) >> 31); + + + /* Intermediate product is calculated by (Id * sinVal) */ + product3 = (q31_t) (((q63_t) (Id) * (sinVal)) >> 31); + + /* Intermediate product is calculated by (Iq * cosVal) */ + product4 = (q31_t) (((q63_t) (Iq) * (cosVal)) >> 31); + + /* Calculate pIalpha by using the two intermediate products 1 and 2 */ + *pIalpha = __QSUB(product1, product2); + + /* Calculate pIbeta by using the two intermediate products 3 and 4 */ + *pIbeta = __QADD(product4, product3); + } + + /** + * @} end of Inverse park group + */ + +/** + * @ingroup groupController + */ + + /** + * @defgroup clarke Vector Clarke Transform + * Forward Clarke transform converts the instantaneous stator phases into a two-coordinate time invariant vector. + * Generally the Clarke transform uses three-phase currents Ia, Ib and Ic to calculate currents + * in the two-phase orthogonal stator axis Ialpha and Ibeta. + * When Ialpha is superposed with Ia as shown in the figure below + * \image html clarke.gif Stator current space vector and its components in (a,b). 
+ * and Ia + Ib + Ic = 0, in this condition Ialpha and Ibeta + * can be calculated using only Ia and Ib. + * + * The function operates on a single sample of data and each call to the function returns the processed output. + * The library provides separate functions for Q31 and floating-point data types. + * \par Algorithm + * \image html clarkeFormula.gif + * where Ia and Ib are the instantaneous stator phases and + * pIalpha and pIbeta are the two coordinates of time invariant vector. + * \par Fixed-Point Behavior + * Care must be taken when using the Q31 version of the Clarke transform. + * In particular, the overflow and saturation behavior of the accumulator used must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + + /** + * @addtogroup clarke + * @{ + */ + + /** + * + * @brief Floating-point Clarke transform + * @param[in] Ia input three-phase coordinate a + * @param[in] Ib input three-phase coordinate b + * @param[out] pIalpha points to output two-phase orthogonal vector axis alpha + * @param[out] pIbeta points to output two-phase orthogonal vector axis beta + * @return none + */ + __STATIC_FORCEINLINE void arm_clarke_f32( + float32_t Ia, + float32_t Ib, + float32_t * pIalpha, + float32_t * pIbeta) + { + /* Calculate pIalpha using the equation, pIalpha = Ia */ + *pIalpha = Ia; + + /* Calculate pIbeta using the equation, pIbeta = (1/sqrt(3)) * Ia + (2/sqrt(3)) * Ib */ + *pIbeta = (0.57735026919f * Ia + 1.15470053838f * Ib); + } + + +/** + @brief Clarke transform for Q31 version + @param[in] Ia input three-phase coordinate a + @param[in] Ib input three-phase coordinate b + @param[out] pIalpha points to output two-phase orthogonal vector axis alpha + @param[out] pIbeta points to output two-phase orthogonal vector axis beta + @return none + + \par Scaling and Overflow Behavior + The function is implemented using an internal 32-bit accumulator. + The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format. + There is saturation on the addition, hence there is no risk of overflow. + */ +__STATIC_FORCEINLINE void arm_clarke_q31( + q31_t Ia, + q31_t Ib, + q31_t * pIalpha, + q31_t * pIbeta) + { + q31_t product1, product2; /* Temporary variables used to store intermediate results */ + + /* Calculating pIalpha from Ia by equation pIalpha = Ia */ + *pIalpha = Ia; + + /* Intermediate product is calculated by (1/(sqrt(3)) * Ia) */ + product1 = (q31_t) (((q63_t) Ia * 0x24F34E8B) >> 30); + + /* Intermediate product is calculated by (2/sqrt(3) * Ib) */ + product2 = (q31_t) (((q63_t) Ib * 0x49E69D16) >> 30); + + /* pIbeta is calculated by adding the intermediate products */ + *pIbeta = __QADD(product1, product2); + } + + /** + * @} end of clarke group + */ + + + /** + * @ingroup groupController + */ + + /** + * @defgroup inv_clarke Vector Inverse Clarke Transform + * Inverse Clarke transform converts the two-coordinate time invariant vector into instantaneous stator phases. + * + * The function operates on a single sample of data and each call to the function returns the processed output. + * The library provides separate functions for Q31 and floating-point data types. + * \par Algorithm + * \image html clarkeInvFormula.gif + * where pIa and pIb are the instantaneous stator phases and + * Ialpha and Ibeta are the two coordinates of time invariant vector. + * \par Fixed-Point Behavior + * Care must be taken when using the Q31 version of the Clarke transform. 
+ * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+ /**
+ * @addtogroup inv_clarke
+ * @{
+ */
+
+ /**
+ * @brief  Floating-point Inverse Clarke transform
+ * @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
+ * @param[in]  Ibeta   input two-phase orthogonal vector axis beta
+ * @param[out] pIa     points to output three-phase coordinate a
+ * @param[out] pIb     points to output three-phase coordinate b
+ * @return     none
+ */
+ __STATIC_FORCEINLINE void arm_inv_clarke_f32(
+ float32_t Ialpha,
+ float32_t Ibeta,
+ float32_t * pIa,
+ float32_t * pIb)
+ {
+ /* Calculating pIa from Ialpha by equation pIa = Ialpha */
+ *pIa = Ialpha;
+
+ /* Calculating pIb from Ialpha and Ibeta by equation pIb = -(1/2) * Ialpha + (sqrt(3)/2) * Ibeta */
+ *pIb = -0.5f * Ialpha + 0.8660254039f * Ibeta;
+ }
+
+
+/**
+  @brief  Inverse Clarke transform for Q31 version
+  @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
+  @param[in]  Ibeta   input two-phase orthogonal vector axis beta
+  @param[out] pIa     points to output three-phase coordinate a
+  @param[out] pIb     points to output three-phase coordinate b
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
+         There is saturation on the subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_inv_clarke_q31(
+  q31_t Ialpha,
+  q31_t Ibeta,
+  q31_t * pIa,
+  q31_t * pIb)
+  {
+    q31_t product1, product2;    /* Temporary variables used to store intermediate results */
+
+    /* Calculating pIa from Ialpha by equation pIa = Ialpha */
+    *pIa = Ialpha;
+
+    /* Intermediate product is calculated by (1/2 * Ialpha) */
+    product1 = (q31_t) (((q63_t) (Ialpha) * (0x40000000)) >> 31);
+
+    /* Intermediate product is calculated by (sqrt(3)/2 * Ibeta) */
+    product2 = (q31_t) (((q63_t) (Ibeta) * (0x6ED9EBA1)) >> 31);
+
+    /* pIb is calculated by subtracting product1 from product2 */
+    *pIb = __QSUB(product2, product1);
+  }
+
+ /**
+ * @} end of inv_clarke group
+ */
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _CONTROLLER_FUNCTIONS_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/controller_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/controller_functions_f16.h
new file mode 100644
index 000000000..b0bdd7897
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/controller_functions_f16.h
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * @file     controller_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _CONTROLLER_FUNCTIONS_F16_H_ +#define _CONTROLLER_FUNCTIONS_F16_H_ + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _CONTROLLER_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/distance_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/distance_functions.h new file mode 100644 index 000000000..0af3c6f8b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/distance_functions.h @@ -0,0 +1,297 @@ +/****************************************************************************** + * @file distance_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _DISTANCE_FUNCTIONS_H_ +#define _DISTANCE_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#include "dsp/statistics_functions.h" +#include "dsp/basic_math_functions.h" +#include "dsp/fast_math_functions.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + +/** + * @defgroup groupDistance Distance functions + * + * Distance functions for use with clustering algorithms. + * There are distance functions for float vectors and boolean vectors. + * + */ + +/* 6.14 bug */ +#if defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) && (__ARMCC_VERSION < 6150001) + +__attribute__((weak)) float __powisf2(float a, int b); + +#endif + +/** + * @brief Euclidean distance between two vectors + * @param[in] pA First vector + * @param[in] pB Second vector + * @param[in] blockSize vector length + * @return distance + * + */ + +float32_t arm_euclidean_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize); + +/** + * @brief Bray-Curtis distance between two vectors + * @param[in] pA First vector + * @param[in] pB Second vector + * @param[in] blockSize vector length + * @return distance + * + */ +float32_t arm_braycurtis_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize); + +/** + * @brief Canberra distance between two vectors + * + * This function may divide by zero when samples pA[i] and pB[i] are both zero. 
+ * The result of the computation will still be correct, so the division by
+ * zero may be ignored.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Chebyshev distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_chebyshev_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Cityblock (Manhattan) distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_cityblock_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Correlation distance between two vectors
+ *
+ * The input vectors are modified in place!
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float32_t arm_correlation_distance_f32(float32_t *pA,float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Cosine distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float32_t arm_cosine_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Jensen-Shannon distance between two vectors
+ *
+ * This function assumes that the elements of the second vector are > 0,
+ * and 0 only when the corresponding element of the first vector is 0.
+ * Otherwise the result of the computation does not make sense
+ * and, for speed reasons, the cases returning NaN or Infinity are not
+ * managed.
+ *
+ * When the function is computing x log (x / y) with x == 0 and y == 0,
+ * it will compute the right value (0) but a division by zero will occur
+ * and should be ignored in client code.
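+ *
+ * A possible way for client code to ignore that benign division by zero
+ * (a sketch only, assuming an fenv.h based error-handling policy on a
+ * target where floating-point exception flags are available):
+ * <pre>
+ *     float32_t d;
+ *
+ *     feclearexcept(FE_DIVBYZERO);
+ *     d = arm_jensenshannon_distance_f32(pA, pB, blockSize);
+ *     feclearexcept(FE_DIVBYZERO);   // drop the expected, harmless flag
+ * </pre>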
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float32_t arm_jensenshannon_distance_f32(const float32_t *pA,const float32_t *pB,uint32_t blockSize);
+
+/**
+ * @brief        Minkowski distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    order      Norm order (>= 2)
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, int32_t order, uint32_t blockSize);
+
+/**
+ * @brief        Dice distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_dice_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Hamming distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_hamming_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Jaccard distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_jaccard_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Kulsinski distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_kulsinski_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Rogers-Tanimoto distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_rogerstanimoto_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Russell-Rao distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_russellrao_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Sokal-Michener distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_sokalmichener_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Sokal-Sneath distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_sokalsneath_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Yule distance between two vectors
+ *
+ * @param[in]    pA             First vector of packed booleans
+ * @param[in]    pB             Second vector of packed booleans
+ * @param[in]    numberOfBools  Number of booleans
+ * @return distance
+ *
+ */
+
+float32_t arm_yule_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _DISTANCE_FUNCTIONS_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/distance_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/distance_functions_f16.h
new file mode 100644
index 000000000..ab01fc6ff
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/distance_functions_f16.h
@@ -0,0 +1,180 @@
+/******************************************************************************
+ * @file     distance_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef _DISTANCE_FUNCTIONS_F16_H_
+#define _DISTANCE_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+/* 6.14 bug */
+#if defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) && (__ARMCC_VERSION < 6150001)
+/* Defined in minkowski_f32 */
+__attribute__((weak)) float __powisf2(float a, int b);
+#endif
+
+#include "dsp/statistics_functions_f16.h"
+#include "dsp/basic_math_functions_f16.h"
+
+#include "dsp/fast_math_functions_f16.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ * @brief        Euclidean distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Bray-Curtis distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Canberra distance between two vectors
+ *
+ * This function may divide by zero when samples pA[i] and pB[i] are both zero.
+ * The result of the computation will still be correct, so the division by
+ * zero may be ignored.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Chebyshev distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Cityblock (Manhattan) distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Correlation distance between two vectors
+ *
+ * The input vectors are modified in place!
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_correlation_distance_f16(float16_t *pA,float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Cosine distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_cosine_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Jensen-Shannon distance between two vectors
+ *
+ * This function assumes that the elements of the second vector are > 0,
+ * and 0 only when the corresponding element of the first vector is 0.
+ * Otherwise the result of the computation does not make sense
+ * and, for speed reasons, the cases returning NaN or Infinity are not
+ * managed.
+ *
+ * When the function is computing x log (x / y) with x == 0 and y == 0,
+ * it will compute the right value (0) but a division by zero will occur
+ * and should be ignored in client code.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB,uint32_t blockSize);
+
+/**
+ * @brief        Minkowski distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    order      Norm order (>= 2)
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize);
+
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _DISTANCE_FUNCTIONS_F16_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/fast_math_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/fast_math_functions.h
new file mode 100644
index 000000000..e9e72b436
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/fast_math_functions.h
@@ -0,0 +1,305 @@
+/******************************************************************************
+ * @file     fast_math_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _FAST_MATH_FUNCTIONS_H_ +#define _FAST_MATH_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * @brief Macros required for SINE and COSINE Fast math approximations + */ + +#define FAST_MATH_TABLE_SIZE 512 +#define FAST_MATH_Q31_SHIFT (32 - 10) +#define FAST_MATH_Q15_SHIFT (16 - 10) + +#ifndef PI + #define PI 3.14159265358979f +#endif + + +/** + * @defgroup groupFastMath Fast Math Functions + * This set of functions provides a fast approximation to sine, cosine, and square root. + * As compared to most of the other functions in the CMSIS math library, the fast math functions + * operate on individual values and not arrays. + * There are separate functions for Q15, Q31, and floating-point data. + * + */ + + /** + * @ingroup groupFastMath + */ + + +/** + @addtogroup sin + @{ + */ + +/** + * @brief Fast approximation to the trigonometric sine function for floating-point data. + * @param[in] x input value in radians. + * @return sin(x). + */ + float32_t arm_sin_f32( + float32_t x); + + + /** + * @brief Fast approximation to the trigonometric sine function for Q31 data. + * @param[in] x Scaled input value in radians. + * @return sin(x). + */ + q31_t arm_sin_q31( + q31_t x); + + + /** + * @brief Fast approximation to the trigonometric sine function for Q15 data. + * @param[in] x Scaled input value in radians. + * @return sin(x). + */ + q15_t arm_sin_q15( + q15_t x); + +/** + @} end of sin group + */ + +/** + @addtogroup cos + @{ + */ + + /** + * @brief Fast approximation to the trigonometric cosine function for floating-point data. + * @param[in] x input value in radians. + * @return cos(x). + */ + float32_t arm_cos_f32( + float32_t x); + + + /** + * @brief Fast approximation to the trigonometric cosine function for Q31 data. + * @param[in] x Scaled input value in radians. + * @return cos(x). + */ + q31_t arm_cos_q31( + q31_t x); + + + /** + * @brief Fast approximation to the trigonometric cosine function for Q15 data. + * @param[in] x Scaled input value in radians. + * @return cos(x). + */ + q15_t arm_cos_q15( + q15_t x); + +/** + @} end of cos group + */ + + +/** + @brief Floating-point vector of log values. + @param[in] pSrc points to the input vector + @param[out] pDst points to the output vector + @param[in] blockSize number of samples in each vector + @return none + */ + void arm_vlog_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + +/** + @brief Floating-point vector of exp values. + @param[in] pSrc points to the input vector + @param[out] pDst points to the output vector + @param[in] blockSize number of samples in each vector + @return none + */ + void arm_vexp_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + /** + * @defgroup SQRT Square Root + * + * Computes the square root of a number. 
+ * There are separate functions for Q15, Q31, and floating-point data types.
+ * The square root function is computed using the Newton-Raphson algorithm.
+ * This is an iterative algorithm of the form:
+ * <pre>
+   *      x1 = x0 - f(x0)/f'(x0)
+ * </pre>
+ * where x1 is the current estimate,
+ * x0 is the previous estimate, and
+ * f'(x0) is the derivative of f() evaluated at x0.
+ * For the square root function, the algorithm reduces to:
+ * <pre>
+   *     x0 = in/2                         [initial guess]
+   *     x1 = 1/2 * ( x0 + in / x0)        [each iteration]
+   * </pre>
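+   * \par
+   * For example, computing sqrt(0.25), the iteration converges as follows
+   * (a worked illustration, values rounded):
+   * <pre>
+   *     x0 = 0.25/2                        = 0.125
+   *     x1 = 1/2 * (0.125  + 0.25/0.125 ) = 1.0625
+   *     x2 = 1/2 * (1.0625 + 0.25/1.0625) ~ 0.6489
+   *     x3 = 1/2 * (0.6489 + 0.25/0.6489) ~ 0.5171
+   *     ... converging towards sqrt(0.25) = 0.5
+   * </pre>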
+ */ + + + /** + * @addtogroup SQRT + * @{ + */ + +/** + @brief Floating-point square root function. + @param[in] in input value + @param[out] pOut square root of input value + @return execution status + - \ref ARM_MATH_SUCCESS : input value is positive + - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0 + */ +__STATIC_FORCEINLINE arm_status arm_sqrt_f32( + float32_t in, + float32_t * pOut) + { + if (in >= 0.0f) + { +#if defined ( __CC_ARM ) + #if defined __TARGET_FPU_VFP + *pOut = __sqrtf(in); + #else + *pOut = sqrtf(in); + #endif + +#elif defined ( __ICCARM__ ) + #if defined __ARMVFP__ + __ASM("VSQRT.F32 %0,%1" : "=t"(*pOut) : "t"(in)); + #else + *pOut = sqrtf(in); + #endif + +#else + *pOut = sqrtf(in); +#endif + + return (ARM_MATH_SUCCESS); + } + else + { + *pOut = 0.0f; + return (ARM_MATH_ARGUMENT_ERROR); + } + } + + +/** + @brief Q31 square root function. + @param[in] in input value. The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF + @param[out] pOut points to square root of input value + @return execution status + - \ref ARM_MATH_SUCCESS : input value is positive + - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0 + */ +arm_status arm_sqrt_q31( + q31_t in, + q31_t * pOut); + + +/** + @brief Q15 square root function. + @param[in] in input value. The range of the input value is [0 +1) or 0x0000 to 0x7FFF + @param[out] pOut points to square root of input value + @return execution status + - \ref ARM_MATH_SUCCESS : input value is positive + - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0 + */ +arm_status arm_sqrt_q15( + q15_t in, + q15_t * pOut); + + /** + * @brief Vector Floating-point square root function. + * @param[in] pIn input vector. + * @param[out] pOut vector of square roots of input elements. + * @param[in] len length of input vector. + * @return The function returns ARM_MATH_SUCCESS if input value is positive value or ARM_MATH_ARGUMENT_ERROR if + * in is negative value and returns zero output for negative values. + */ + void arm_vsqrt_f32( + float32_t * pIn, + float32_t * pOut, + uint16_t len); + + void arm_vsqrt_q31( + q31_t * pIn, + q31_t * pOut, + uint16_t len); + + void arm_vsqrt_q15( + q15_t * pIn, + q15_t * pOut, + uint16_t len); + + /** + * @} end of SQRT group + */ + + /** + @brief Fixed point division + @param[in] numerator Numerator + @param[in] denominator Denominator + @param[out] quotient Quotient value normalized between -1.0 and 1.0 + @param[out] shift Shift left value to get the unnormalized quotient + @return error status + + When dividing by 0, an error ARM_MATH_NANINF is returned. And the quotient is forced + to the saturated negative or positive value. 
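+
+  \par Example
+         A usage sketch (values illustrative): 0.75 / 0.25 = 3.0 is not
+         representable in Q15, so the function returns a normalized quotient
+         of about 0.75 together with shift = 2, since 0.75 * 2^2 = 3.0.
+  <pre>
+      q15_t   quotient;
+      int16_t shift;
+
+      arm_status status = arm_divide_q15((q15_t)0x6000,   // 0.75 in Q15
+                                         (q15_t)0x2000,   // 0.25 in Q15
+                                         &quotient, &shift);
+      // value ~= ((float32_t)quotient / 32768.0f) * (1 << shift)
+  </pre>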
+ */ + +arm_status arm_divide_q15(q15_t numerator, + q15_t denominator, + q15_t *quotient, + int16_t *shift); + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _FAST_MATH_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/fast_math_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/fast_math_functions_f16.h new file mode 100644 index 000000000..98a13cb3b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/fast_math_functions_f16.h @@ -0,0 +1,116 @@ +/****************************************************************************** + * @file fast_math_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _FAST_MATH_FUNCTIONS_F16_H_ +#define _FAST_MATH_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +/* For sqrt_f32 */ +#include "dsp/fast_math_functions.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + + /** + * @addtogroup SQRT + * @{ + */ + +/** + @brief Floating-point square root function. + @param[in] in input value + @param[out] pOut square root of input value + @return execution status + - \ref ARM_MATH_SUCCESS : input value is positive + - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0 + */ +__STATIC_FORCEINLINE arm_status arm_sqrt_f16( + float16_t in, + float16_t * pOut) + { + float32_t r; + arm_status status; + status=arm_sqrt_f32((float32_t)in,&r); + *pOut=(float16_t)r; + return(status); + } + + +/** + @} end of SQRT group + */ + +/** + @brief Floating-point vector of log values. + @param[in] pSrc points to the input vector + @param[out] pDst points to the output vector + @param[in] blockSize number of samples in each vector + @return none + */ + void arm_vlog_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + +/** + @brief Floating-point vector of exp values. + @param[in] pSrc points to the input vector + @param[out] pDst points to the output vector + @param[in] blockSize number of samples in each vector + @return none + */ + void arm_vexp_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + + /** + @brief Floating-point vector of inverse values. 
+ @param[in] pSrc points to the input vector + @param[out] pDst points to the output vector + @param[in] blockSize number of samples in each vector + @return none + */ + void arm_vinverse_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _FAST_MATH_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/filtering_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/filtering_functions.h new file mode 100644 index 000000000..634edbfb3 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/filtering_functions.h @@ -0,0 +1,2468 @@ +/****************************************************************************** + * @file filtering_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _FILTERING_FUNCTIONS_H_ +#define _FILTERING_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#include "dsp/support_functions.h" +#include "dsp/fast_math_functions.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + + +#define DELTA_Q31 ((q31_t)(0x100)) +#define DELTA_Q15 ((q15_t)0x5) + +/** + * @defgroup groupFilters Filtering Functions + */ + + /** + * @brief Instance structure for the Q7 FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. */ + q7_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const q7_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + } arm_fir_instance_q7; + + /** + * @brief Instance structure for the Q15 FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. */ + q15_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + } arm_fir_instance_q15; + + /** + * @brief Instance structure for the Q31 FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. */ + q31_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + } arm_fir_instance_q31; + + /** + * @brief Instance structure for the floating-point FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. 
*/ + float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + } arm_fir_instance_f32; + + /** + * @brief Processing function for the Q7 FIR filter. + * @param[in] S points to an instance of the Q7 FIR filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_fir_q7( + const arm_fir_instance_q7 * S, + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q7 FIR filter. + * @param[in,out] S points to an instance of the Q7 FIR structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed. + * + * For the MVE version, the coefficient length must be a multiple of 16. + * You can pad with zeros if you have less coefficients. + */ + void arm_fir_init_q7( + arm_fir_instance_q7 * S, + uint16_t numTaps, + const q7_t * pCoeffs, + q7_t * pState, + uint32_t blockSize); + + /** + * @brief Processing function for the Q15 FIR filter. + * @param[in] S points to an instance of the Q15 FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_fir_q15( + const arm_fir_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the fast Q15 FIR filter (fast version). + * @param[in] S points to an instance of the Q15 FIR filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_fir_fast_q15( + const arm_fir_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q15 FIR filter. + * @param[in,out] S points to an instance of the Q15 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. Must be even and greater than or equal to 4. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed at a time. + * @return The function returns either + * ARM_MATH_SUCCESS if initialization was successful or + * ARM_MATH_ARGUMENT_ERROR if numTaps is not a supported value. + * + * For the MVE version, the coefficient length must be a multiple of 8. + * You can pad with zeros if you have less coefficients. + * + */ + arm_status arm_fir_init_q15( + arm_fir_instance_q15 * S, + uint16_t numTaps, + const q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize); + + /** + * @brief Processing function for the Q31 FIR filter. + * @param[in] S points to an instance of the Q31 FIR filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. 
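   * 
   * @par Example
   * A minimal setup sketch (sizes and buffer names are illustrative; per the
   * instance structure documentation, the state buffer must hold
   * numTaps + blockSize - 1 samples):
   * <pre>
   *     #define NUM_TAPS   32
   *     #define BLOCK_SIZE 64
   *     static q31_t firState[NUM_TAPS + BLOCK_SIZE - 1];
   *     arm_fir_instance_q31 S;
   *
   *     arm_fir_init_q31(&S, NUM_TAPS, firCoeffs, firState, BLOCK_SIZE);
   *     arm_fir_q31(&S, inputBlock, outputBlock, BLOCK_SIZE);
   * </pre>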
+ */ + void arm_fir_q31( + const arm_fir_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the fast Q31 FIR filter (fast version). + * @param[in] S points to an instance of the Q31 FIR filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_fir_fast_q31( + const arm_fir_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q31 FIR filter. + * @param[in,out] S points to an instance of the Q31 FIR structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed at a time. + * + * For the MVE version, the coefficient length must be a multiple of 4. + * You can pad with zeros if you have less coefficients. + */ + void arm_fir_init_q31( + arm_fir_instance_q31 * S, + uint16_t numTaps, + const q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize); + + /** + * @brief Processing function for the floating-point FIR filter. + * @param[in] S points to an instance of the floating-point FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_fir_f32( + const arm_fir_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the floating-point FIR filter. + * @param[in,out] S points to an instance of the floating-point FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed at a time. + */ + void arm_fir_init_f32( + arm_fir_instance_f32 * S, + uint16_t numTaps, + const float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize); + + /** + * @brief Instance structure for the Q15 Biquad cascade filter. + */ + typedef struct + { + int8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + q15_t *pState; /**< Points to the array of state coefficients. The array is of length 4*numStages. */ + const q15_t *pCoeffs; /**< Points to the array of coefficients. The array is of length 5*numStages. */ + int8_t postShift; /**< Additional shift, in bits, applied to each output sample. */ + } arm_biquad_casd_df1_inst_q15; + + /** + * @brief Instance structure for the Q31 Biquad cascade filter. + */ + typedef struct + { + uint32_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + q31_t *pState; /**< Points to the array of state coefficients. The array is of length 4*numStages. */ + const q31_t *pCoeffs; /**< Points to the array of coefficients. The array is of length 5*numStages. */ + uint8_t postShift; /**< Additional shift, in bits, applied to each output sample. */ + } arm_biquad_casd_df1_inst_q31; + + /** + * @brief Instance structure for the floating-point Biquad cascade filter. + */ + typedef struct + { + uint32_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. 
*/ + float32_t *pState; /**< Points to the array of state coefficients. The array is of length 4*numStages. */ + const float32_t *pCoeffs; /**< Points to the array of coefficients. The array is of length 5*numStages. */ + } arm_biquad_casd_df1_inst_f32; + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + /** + * @brief Instance structure for the modified Biquad coefs required by vectorized code. + */ + typedef struct + { + float32_t coeffs[8][4]; /**< Points to the array of modified coefficients. The array is of length 32. There is one per stage */ + } arm_biquad_mod_coef_f32; +#endif + + /** + * @brief Processing function for the Q15 Biquad cascade filter. + * @param[in] S points to an instance of the Q15 Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_df1_q15( + const arm_biquad_casd_df1_inst_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q15 Biquad cascade filter. + * @param[in,out] S points to an instance of the Q15 Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] postShift Shift to be applied to the output. Varies according to the coefficients format + */ + void arm_biquad_cascade_df1_init_q15( + arm_biquad_casd_df1_inst_q15 * S, + uint8_t numStages, + const q15_t * pCoeffs, + q15_t * pState, + int8_t postShift); + + /** + * @brief Fast but less precise processing function for the Q15 Biquad cascade filter for Cortex-M3 and Cortex-M4. + * @param[in] S points to an instance of the Q15 Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_df1_fast_q15( + const arm_biquad_casd_df1_inst_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the Q31 Biquad cascade filter + * @param[in] S points to an instance of the Q31 Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_df1_q31( + const arm_biquad_casd_df1_inst_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Fast but less precise processing function for the Q31 Biquad cascade filter for Cortex-M3 and Cortex-M4. + * @param[in] S points to an instance of the Q31 Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_df1_fast_q31( + const arm_biquad_casd_df1_inst_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q31 Biquad cascade filter. + * @param[in,out] S points to an instance of the Q31 Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] postShift Shift to be applied to the output. 
Varies according to the coefficients format + */ + void arm_biquad_cascade_df1_init_q31( + arm_biquad_casd_df1_inst_q31 * S, + uint8_t numStages, + const q31_t * pCoeffs, + q31_t * pState, + int8_t postShift); + + /** + * @brief Processing function for the floating-point Biquad cascade filter. + * @param[in] S points to an instance of the floating-point Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_df1_f32( + const arm_biquad_casd_df1_inst_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the floating-point Biquad cascade filter. + * @param[in,out] S points to an instance of the floating-point Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pCoeffsMod points to the modified filter coefficients (only MVE version). + * @param[in] pState points to the state buffer. + */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + void arm_biquad_cascade_df1_mve_init_f32( + arm_biquad_casd_df1_inst_f32 * S, + uint8_t numStages, + const float32_t * pCoeffs, + arm_biquad_mod_coef_f32 * pCoeffsMod, + float32_t * pState); +#endif + + void arm_biquad_cascade_df1_init_f32( + arm_biquad_casd_df1_inst_f32 * S, + uint8_t numStages, + const float32_t * pCoeffs, + float32_t * pState); + + +/** + * @brief Convolution of floating-point sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + */ + void arm_conv_f32( + const float32_t * pSrcA, + uint32_t srcALen, + const float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst); + + + /** + * @brief Convolution of Q15 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + * @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + */ + void arm_conv_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2); + + +/** + * @brief Convolution of Q15 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + */ + void arm_conv_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst); + + + /** + * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4 + * @param[in] pSrcA points to the first input sequence. 
+ * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + */ + void arm_conv_fast_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst); + + + /** + * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4 + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + * @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + */ + void arm_conv_fast_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Convolution of Q31 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + */ + void arm_conv_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst); + + + /** + * @brief Convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4 + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + */ + void arm_conv_fast_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst); + + + /** + * @brief Convolution of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + * @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + */ + void arm_conv_opt_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Convolution of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + */ + void arm_conv_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst); + + + /** + * @brief Partial convolution of floating-point sequences. 
+ * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + arm_status arm_conv_partial_f32( + const float32_t * pSrcA, + uint32_t srcALen, + const float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q15 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + arm_status arm_conv_partial_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Partial convolution of Q15 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + arm_status arm_conv_partial_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4 + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 
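+ *
+ * @par Example
+ * Editor's illustrative sketch (not part of the original header): one partial-convolution
+ * call; the buffer names, lengths and output range below are assumptions.
+ * @code
+ * #define SRCA_LEN 64
+ * #define SRCB_LEN 29
+ * static q15_t srcA[SRCA_LEN], srcB[SRCB_LEN];
+ * static q15_t dst[SRCA_LEN + SRCB_LEN - 1];
+ * // Compute 32 output points starting at output index 16.
+ * arm_status st = arm_conv_partial_fast_q15(srcA, SRCA_LEN, srcB, SRCB_LEN, dst, 16, 32);
+ * // st is ARM_MATH_ARGUMENT_ERROR if [16, 16+32-1] falls outside [0, SRCA_LEN+SRCB_LEN-2].
+ * @endcode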
+ */ + arm_status arm_conv_partial_fast_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4 + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + arm_status arm_conv_partial_fast_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Partial convolution of Q31 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + arm_status arm_conv_partial_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4 + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + arm_status arm_conv_partial_fast_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q7 sequences + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. 
+ * @param[in] numPoints is the number of output points to be computed. + * @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + arm_status arm_conv_partial_opt_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2); + + +/** + * @brief Partial convolution of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + arm_status arm_conv_partial_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Instance structure for the Q15 FIR decimator. + */ + typedef struct + { + uint8_t M; /**< decimation factor. */ + uint16_t numTaps; /**< number of coefficients in the filter. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + q15_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + } arm_fir_decimate_instance_q15; + + /** + * @brief Instance structure for the Q31 FIR decimator. + */ + typedef struct + { + uint8_t M; /**< decimation factor. */ + uint16_t numTaps; /**< number of coefficients in the filter. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + q31_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + } arm_fir_decimate_instance_q31; + +/** + @brief Instance structure for floating-point FIR decimator. + */ +typedef struct + { + uint8_t M; /**< decimation factor. */ + uint16_t numTaps; /**< number of coefficients in the filter. */ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + } arm_fir_decimate_instance_f32; + + +/** + @brief Processing function for floating-point FIR decimator. + @param[in] S points to an instance of the floating-point FIR decimator structure + @param[in] pSrc points to the block of input data + @param[out] pDst points to the block of output data + @param[in] blockSize number of samples to process + */ +void arm_fir_decimate_f32( + const arm_fir_decimate_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + +/** + @brief Initialization function for the floating-point FIR decimator. 
+ @param[in,out] S points to an instance of the floating-point FIR decimator structure + @param[in] numTaps number of coefficients in the filter + @param[in] M decimation factor + @param[in] pCoeffs points to the filter coefficients + @param[in] pState points to the state buffer + @param[in] blockSize number of input samples to process per call + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_LENGTH_ERROR : blockSize is not a multiple of M + */ +arm_status arm_fir_decimate_init_f32( + arm_fir_decimate_instance_f32 * S, + uint16_t numTaps, + uint8_t M, + const float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q15 FIR decimator. + * @param[in] S points to an instance of the Q15 FIR decimator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_decimate_q15( + const arm_fir_decimate_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4. + * @param[in] S points to an instance of the Q15 FIR decimator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_decimate_fast_q15( + const arm_fir_decimate_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q15 FIR decimator. + * @param[in,out] S points to an instance of the Q15 FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + */ + arm_status arm_fir_decimate_init_q15( + arm_fir_decimate_instance_q15 * S, + uint16_t numTaps, + uint8_t M, + const q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q31 FIR decimator. + * @param[in] S points to an instance of the Q31 FIR decimator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_decimate_q31( + const arm_fir_decimate_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4. + * @param[in] S points to an instance of the Q31 FIR decimator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_decimate_fast_q31( + const arm_fir_decimate_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 FIR decimator. + * @param[in,out] S points to an instance of the Q31 FIR decimator structure. 
+ * @param[in] numTaps number of coefficients in the filter.
+ * @param[in] M decimation factor.
+ * @param[in] pCoeffs points to the filter coefficients.
+ * @param[in] pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
+ * blockSize is not a multiple of M.
+ */
+ arm_status arm_fir_decimate_init_q31(
+ arm_fir_decimate_instance_q31 * S,
+ uint16_t numTaps,
+ uint8_t M,
+ const q31_t * pCoeffs,
+ q31_t * pState,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Instance structure for the Q15 FIR interpolator.
+ */
+ typedef struct
+ {
+ uint8_t L; /**< upsample factor. */
+ uint16_t phaseLength; /**< length of each polyphase filter component. */
+ const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length L*phaseLength. */
+ q15_t *pState; /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+ } arm_fir_interpolate_instance_q15;
+
+ /**
+ * @brief Instance structure for the Q31 FIR interpolator.
+ */
+ typedef struct
+ {
+ uint8_t L; /**< upsample factor. */
+ uint16_t phaseLength; /**< length of each polyphase filter component. */
+ const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length L*phaseLength. */
+ q31_t *pState; /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+ } arm_fir_interpolate_instance_q31;
+
+ /**
+ * @brief Instance structure for the floating-point FIR interpolator.
+ */
+ typedef struct
+ {
+ uint8_t L; /**< upsample factor. */
+ uint16_t phaseLength; /**< length of each polyphase filter component. */
+ const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length L*phaseLength. */
+ float32_t *pState; /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+ } arm_fir_interpolate_instance_f32;
+
+
+ /**
+ * @brief Processing function for the Q15 FIR interpolator.
+ * @param[in] S points to an instance of the Q15 FIR interpolator structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call.
+ */
+ void arm_fir_interpolate_q15(
+ const arm_fir_interpolate_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Initialization function for the Q15 FIR interpolator.
+ * @param[in,out] S points to an instance of the Q15 FIR interpolator structure.
+ * @param[in] L upsample factor.
+ * @param[in] numTaps number of filter coefficients in the filter.
+ * @param[in] pCoeffs points to the filter coefficient buffer.
+ * @param[in] pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
+ * the filter length numTaps is not a multiple of the interpolation factor L.
+ */
+ arm_status arm_fir_interpolate_init_q15(
+ arm_fir_interpolate_instance_q15 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Processing function for the Q31 FIR interpolator.
+ * @param[in] S points to an instance of the Q31 FIR interpolator structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_interpolate_q31( + const arm_fir_interpolate_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 FIR interpolator. + * @param[in,out] S points to an instance of the Q31 FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficient buffer. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + */ + arm_status arm_fir_interpolate_init_q31( + arm_fir_interpolate_instance_q31 * S, + uint8_t L, + uint16_t numTaps, + const q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the floating-point FIR interpolator. + * @param[in] S points to an instance of the floating-point FIR interpolator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_interpolate_f32( + const arm_fir_interpolate_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the floating-point FIR interpolator. + * @param[in,out] S points to an instance of the floating-point FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficient buffer. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + */ + arm_status arm_fir_interpolate_init_f32( + arm_fir_interpolate_instance_f32 * S, + uint8_t L, + uint16_t numTaps, + const float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize); + + + /** + * @brief Instance structure for the high precision Q31 Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + q63_t *pState; /**< points to the array of state coefficients. The array is of length 4*numStages. */ + const q31_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + uint8_t postShift; /**< additional shift, in bits, applied to each output sample. */ + } arm_biquad_cas_df1_32x64_ins_q31; + + + /** + * @param[in] S points to an instance of the high precision Q31 Biquad cascade filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cas_df1_32x64_q31( + const arm_biquad_cas_df1_32x64_ins_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @param[in,out] S points to an instance of the high precision Q31 Biquad cascade filter structure. + * @param[in] numStages number of 2nd order stages in the filter. 
+ * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] postShift shift to be applied to the output. Varies according to the coefficients format + */ + void arm_biquad_cas_df1_32x64_init_q31( + arm_biquad_cas_df1_32x64_ins_q31 * S, + uint8_t numStages, + const q31_t * pCoeffs, + q63_t * pState, + uint8_t postShift); + + + /** + * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float32_t *pState; /**< points to the array of state coefficients. The array is of length 2*numStages. */ + const float32_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + } arm_biquad_cascade_df2T_instance_f32; + + /** + * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float32_t *pState; /**< points to the array of state coefficients. The array is of length 4*numStages. */ + const float32_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + } arm_biquad_cascade_stereo_df2T_instance_f32; + + /** + * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float64_t *pState; /**< points to the array of state coefficients. The array is of length 2*numStages. */ + const float64_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + } arm_biquad_cascade_df2T_instance_f64; + + + /** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in] S points to an instance of the filter data structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_df2T_f32( + const arm_biquad_cascade_df2T_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. 2 channels + * @param[in] S points to an instance of the filter data structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_stereo_df2T_f32( + const arm_biquad_cascade_stereo_df2T_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in] S points to an instance of the filter data structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. 
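+ *
+ * @par Example
+ * Editor's illustrative sketch (not part of the original header): a single 2nd-order
+ * stage; the coefficient values and block size below are placeholders.
+ * @code
+ * #define BLOCK_SIZE 32
+ * // One stage: 5 coefficients ordered {b0, b1, b2, a1, a2}, 2 state values (DF2T).
+ * static float64_t coeffs[5] = { 0.2, 0.4, 0.2, 0.1, -0.05 };
+ * static float64_t state[2];
+ * static float64_t in[BLOCK_SIZE], out[BLOCK_SIZE];
+ * static arm_biquad_cascade_df2T_instance_f64 S;
+ * arm_biquad_cascade_df2T_init_f64(&S, 1, coeffs, state);
+ * arm_biquad_cascade_df2T_f64(&S, in, out, BLOCK_SIZE);
+ * @endcode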
+ */ + void arm_biquad_cascade_df2T_f64( + const arm_biquad_cascade_df2T_instance_f64 * S, + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize); + + +#if defined(ARM_MATH_NEON) +void arm_biquad_cascade_df2T_compute_coefs_f32( + arm_biquad_cascade_df2T_instance_f32 * S, + uint8_t numStages, + float32_t * pCoeffs); +#endif + /** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void arm_biquad_cascade_df2T_init_f32( + arm_biquad_cascade_df2T_instance_f32 * S, + uint8_t numStages, + const float32_t * pCoeffs, + float32_t * pState); + + + /** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void arm_biquad_cascade_stereo_df2T_init_f32( + arm_biquad_cascade_stereo_df2T_instance_f32 * S, + uint8_t numStages, + const float32_t * pCoeffs, + float32_t * pState); + + + /** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void arm_biquad_cascade_df2T_init_f64( + arm_biquad_cascade_df2T_instance_f64 * S, + uint8_t numStages, + const float64_t * pCoeffs, + float64_t * pState); + + + /** + * @brief Instance structure for the Q15 FIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of filter stages. */ + q15_t *pState; /**< points to the state variable array. The array is of length numStages. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numStages. */ + } arm_fir_lattice_instance_q15; + + /** + * @brief Instance structure for the Q31 FIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of filter stages. */ + q31_t *pState; /**< points to the state variable array. The array is of length numStages. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numStages. */ + } arm_fir_lattice_instance_q31; + + /** + * @brief Instance structure for the floating-point FIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of filter stages. */ + float32_t *pState; /**< points to the state variable array. The array is of length numStages. */ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numStages. */ + } arm_fir_lattice_instance_f32; + + + /** + * @brief Initialization function for the Q15 FIR lattice filter. + * @param[in] S points to an instance of the Q15 FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] pState points to the state buffer. The array is of length numStages. 
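+ *
+ * @par Example
+ * Editor's illustrative sketch (not part of the original header): initialization followed
+ * by one processing call; the stage count and block size are assumptions.
+ * @code
+ * #define NUM_STAGES 6
+ * #define BLOCK_SIZE 64
+ * static q15_t kCoeffs[NUM_STAGES];              // reflection coefficients, filled elsewhere
+ * static q15_t state[NUM_STAGES];
+ * static q15_t in[BLOCK_SIZE], out[BLOCK_SIZE];
+ * static arm_fir_lattice_instance_q15 S;
+ * arm_fir_lattice_init_q15(&S, NUM_STAGES, kCoeffs, state);
+ * arm_fir_lattice_q15(&S, in, out, BLOCK_SIZE);
+ * @endcode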
+ */ + void arm_fir_lattice_init_q15( + arm_fir_lattice_instance_q15 * S, + uint16_t numStages, + const q15_t * pCoeffs, + q15_t * pState); + + + /** + * @brief Processing function for the Q15 FIR lattice filter. + * @param[in] S points to an instance of the Q15 FIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_fir_lattice_q15( + const arm_fir_lattice_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 FIR lattice filter. + * @param[in] S points to an instance of the Q31 FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] pState points to the state buffer. The array is of length numStages. + */ + void arm_fir_lattice_init_q31( + arm_fir_lattice_instance_q31 * S, + uint16_t numStages, + const q31_t * pCoeffs, + q31_t * pState); + + + /** + * @brief Processing function for the Q31 FIR lattice filter. + * @param[in] S points to an instance of the Q31 FIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void arm_fir_lattice_q31( + const arm_fir_lattice_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + +/** + * @brief Initialization function for the floating-point FIR lattice filter. + * @param[in] S points to an instance of the floating-point FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] pState points to the state buffer. The array is of length numStages. + */ + void arm_fir_lattice_init_f32( + arm_fir_lattice_instance_f32 * S, + uint16_t numStages, + const float32_t * pCoeffs, + float32_t * pState); + + + /** + * @brief Processing function for the floating-point FIR lattice filter. + * @param[in] S points to an instance of the floating-point FIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void arm_fir_lattice_f32( + const arm_fir_lattice_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q15 IIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of stages in the filter. */ + q15_t *pState; /**< points to the state variable array. The array is of length numStages+blockSize. */ + q15_t *pkCoeffs; /**< points to the reflection coefficient array. The array is of length numStages. */ + q15_t *pvCoeffs; /**< points to the ladder coefficient array. The array is of length numStages+1. */ + } arm_iir_lattice_instance_q15; + + /** + * @brief Instance structure for the Q31 IIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of stages in the filter. */ + q31_t *pState; /**< points to the state variable array. The array is of length numStages+blockSize. */ + q31_t *pkCoeffs; /**< points to the reflection coefficient array. The array is of length numStages. */ + q31_t *pvCoeffs; /**< points to the ladder coefficient array. The array is of length numStages+1. 
*/
+ } arm_iir_lattice_instance_q31;
+
+ /**
+ * @brief Instance structure for the floating-point IIR lattice filter.
+ */
+ typedef struct
+ {
+ uint16_t numStages; /**< number of stages in the filter. */
+ float32_t *pState; /**< points to the state variable array. The array is of length numStages+blockSize. */
+ float32_t *pkCoeffs; /**< points to the reflection coefficient array. The array is of length numStages. */
+ float32_t *pvCoeffs; /**< points to the ladder coefficient array. The array is of length numStages+1. */
+ } arm_iir_lattice_instance_f32;
+
+
+ /**
+ * @brief Processing function for the floating-point IIR lattice filter.
+ * @param[in] S points to an instance of the floating-point IIR lattice structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ */
+ void arm_iir_lattice_f32(
+ const arm_iir_lattice_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Initialization function for the floating-point IIR lattice filter.
+ * @param[in] S points to an instance of the floating-point IIR lattice structure.
+ * @param[in] numStages number of stages in the filter.
+ * @param[in] pkCoeffs points to the reflection coefficient buffer. The array is of length numStages.
+ * @param[in] pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1.
+ * @param[in] pState points to the state buffer. The array is of length numStages+blockSize.
+ * @param[in] blockSize number of samples to process.
+ */
+ void arm_iir_lattice_init_f32(
+ arm_iir_lattice_instance_f32 * S,
+ uint16_t numStages,
+ float32_t * pkCoeffs,
+ float32_t * pvCoeffs,
+ float32_t * pState,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Processing function for the Q31 IIR lattice filter.
+ * @param[in] S points to an instance of the Q31 IIR lattice structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ */
+ void arm_iir_lattice_q31(
+ const arm_iir_lattice_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Initialization function for the Q31 IIR lattice filter.
+ * @param[in] S points to an instance of the Q31 IIR lattice structure.
+ * @param[in] numStages number of stages in the filter.
+ * @param[in] pkCoeffs points to the reflection coefficient buffer. The array is of length numStages.
+ * @param[in] pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1.
+ * @param[in] pState points to the state buffer. The array is of length numStages+blockSize.
+ * @param[in] blockSize number of samples to process.
+ */
+ void arm_iir_lattice_init_q31(
+ arm_iir_lattice_instance_q31 * S,
+ uint16_t numStages,
+ q31_t * pkCoeffs,
+ q31_t * pvCoeffs,
+ q31_t * pState,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Processing function for the Q15 IIR lattice filter.
+ * @param[in] S points to an instance of the Q15 IIR lattice structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ */
+ void arm_iir_lattice_q15(
+ const arm_iir_lattice_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize);
+
+
+/**
+ * @brief Initialization function for the Q15 IIR lattice filter.
+ * @param[in] S points to an instance of the fixed-point Q15 IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] pkCoeffs points to reflection coefficient buffer. The array is of length numStages. + * @param[in] pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1. + * @param[in] pState points to state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process per call. + */ + void arm_iir_lattice_init_q15( + arm_iir_lattice_instance_q15 * S, + uint16_t numStages, + q15_t * pkCoeffs, + q15_t * pvCoeffs, + q15_t * pState, + uint32_t blockSize); + + + /** + * @brief Instance structure for the floating-point LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + float32_t mu; /**< step size that controls filter coefficient updates. */ + } arm_lms_instance_f32; + + + /** + * @brief Processing function for floating-point LMS filter. + * @param[in] S points to an instance of the floating-point LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void arm_lms_f32( + const arm_lms_instance_f32 * S, + const float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for floating-point LMS filter. + * @param[in] S points to an instance of the floating-point LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to the coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + */ + void arm_lms_init_f32( + arm_lms_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + float32_t mu, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q15 LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + q15_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + q15_t mu; /**< step size that controls filter coefficient updates. */ + uint32_t postShift; /**< bit shift applied to coefficients. */ + } arm_lms_instance_q15; + + + /** + * @brief Initialization function for the Q15 LMS filter. + * @param[in] S points to an instance of the Q15 LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to the coefficient buffer. + * @param[in] pState points to the state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. 
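+ *
+ * @par Example
+ * Editor's illustrative sketch (not part of the original header): the tap count, block
+ * size and the Q15 step size below are assumptions.
+ * @code
+ * #define NUM_TAPS   32
+ * #define BLOCK_SIZE 64
+ * static q15_t coeffs[NUM_TAPS];                 // adaptive weights, typically zeroed
+ * static q15_t state[NUM_TAPS + BLOCK_SIZE - 1];
+ * static q15_t src[BLOCK_SIZE], ref[BLOCK_SIZE], out[BLOCK_SIZE], err[BLOCK_SIZE];
+ * static arm_lms_instance_q15 S;
+ * arm_lms_init_q15(&S, NUM_TAPS, coeffs, state, 0x0200, BLOCK_SIZE, 0);  // mu = 0x0200 ~ 0.0156 in Q15
+ * arm_lms_q15(&S, src, ref, out, err, BLOCK_SIZE);
+ * @endcode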
+ */
+ void arm_lms_init_q15(
+ arm_lms_instance_q15 * S,
+ uint16_t numTaps,
+ q15_t * pCoeffs,
+ q15_t * pState,
+ q15_t mu,
+ uint32_t blockSize,
+ uint32_t postShift);
+
+
+ /**
+ * @brief Processing function for Q15 LMS filter.
+ * @param[in] S points to an instance of the Q15 LMS filter structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[in] pRef points to the block of reference data.
+ * @param[out] pOut points to the block of output data.
+ * @param[out] pErr points to the block of error data.
+ * @param[in] blockSize number of samples to process.
+ */
+ void arm_lms_q15(
+ const arm_lms_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pRef,
+ q15_t * pOut,
+ q15_t * pErr,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Instance structure for the Q31 LMS filter.
+ */
+ typedef struct
+ {
+ uint16_t numTaps; /**< number of coefficients in the filter. */
+ q31_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+ q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */
+ q31_t mu; /**< step size that controls filter coefficient updates. */
+ uint32_t postShift; /**< bit shift applied to coefficients. */
+ } arm_lms_instance_q31;
+
+
+ /**
+ * @brief Processing function for Q31 LMS filter.
+ * @param[in] S points to an instance of the Q31 LMS filter structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[in] pRef points to the block of reference data.
+ * @param[out] pOut points to the block of output data.
+ * @param[out] pErr points to the block of error data.
+ * @param[in] blockSize number of samples to process.
+ */
+ void arm_lms_q31(
+ const arm_lms_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pRef,
+ q31_t * pOut,
+ q31_t * pErr,
+ uint32_t blockSize);
+
+
+ /**
+ * @brief Initialization function for Q31 LMS filter.
+ * @param[in] S points to an instance of the Q31 LMS filter structure.
+ * @param[in] numTaps number of filter coefficients.
+ * @param[in] pCoeffs points to coefficient buffer.
+ * @param[in] pState points to state buffer.
+ * @param[in] mu step size that controls filter coefficient updates.
+ * @param[in] blockSize number of samples to process.
+ * @param[in] postShift bit shift applied to coefficients.
+ */
+ void arm_lms_init_q31(
+ arm_lms_instance_q31 * S,
+ uint16_t numTaps,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ q31_t mu,
+ uint32_t blockSize,
+ uint32_t postShift);
+
+
+ /**
+ * @brief Instance structure for the floating-point normalized LMS filter.
+ */
+ typedef struct
+ {
+ uint16_t numTaps; /**< number of coefficients in the filter. */
+ float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+ float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */
+ float32_t mu; /**< step size that controls filter coefficient updates. */
+ float32_t energy; /**< saves previous frame energy. */
+ float32_t x0; /**< saves previous input sample. */
+ } arm_lms_norm_instance_f32;
+
+
+ /**
+ * @brief Processing function for floating-point normalized LMS filter.
+ * @param[in] S points to an instance of the floating-point normalized LMS filter structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[in] pRef points to the block of reference data.
+ * @param[out] pOut points to the block of output data.
+ * @param[out] pErr points to the block of error data.
+ * @param[in] blockSize number of samples to process.
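+ *
+ * @par Example
+ * Editor's illustrative sketch (not part of the original header): the tap count, block
+ * size and step size mu below are assumptions.
+ * @code
+ * #define NUM_TAPS   32
+ * #define BLOCK_SIZE 64
+ * static float32_t coeffs[NUM_TAPS];             // zero-initialised adaptive weights
+ * static float32_t state[NUM_TAPS + BLOCK_SIZE - 1];
+ * static float32_t src[BLOCK_SIZE], ref[BLOCK_SIZE], out[BLOCK_SIZE], err[BLOCK_SIZE];
+ * static arm_lms_norm_instance_f32 S;
+ * arm_lms_norm_init_f32(&S, NUM_TAPS, coeffs, state, 0.1f, BLOCK_SIZE);
+ * // out converges toward ref; err is the residual that drives the adaptation.
+ * arm_lms_norm_f32(&S, src, ref, out, err, BLOCK_SIZE);
+ * @endcode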
+ */ + void arm_lms_norm_f32( + arm_lms_norm_instance_f32 * S, + const float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for floating-point normalized LMS filter. + * @param[in] S points to an instance of the floating-point LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + */ + void arm_lms_norm_init_f32( + arm_lms_norm_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + float32_t mu, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q31 normalized LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + q31_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + q31_t mu; /**< step size that controls filter coefficient updates. */ + uint8_t postShift; /**< bit shift applied to coefficients. */ + const q31_t *recipTable; /**< points to the reciprocal initial value table. */ + q31_t energy; /**< saves previous frame energy. */ + q31_t x0; /**< saves previous input sample. */ + } arm_lms_norm_instance_q31; + + + /** + * @brief Processing function for Q31 normalized LMS filter. + * @param[in] S points to an instance of the Q31 normalized LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void arm_lms_norm_q31( + arm_lms_norm_instance_q31 * S, + const q31_t * pSrc, + q31_t * pRef, + q31_t * pOut, + q31_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for Q31 normalized LMS filter. + * @param[in] S points to an instance of the Q31 normalized LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + */ + void arm_lms_norm_init_q31( + arm_lms_norm_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + q31_t mu, + uint32_t blockSize, + uint8_t postShift); + + + /** + * @brief Instance structure for the Q15 normalized LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< Number of coefficients in the filter. */ + q15_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + q15_t mu; /**< step size that controls filter coefficient updates. */ + uint8_t postShift; /**< bit shift applied to coefficients. */ + const q15_t *recipTable; /**< Points to the reciprocal initial value table. */ + q15_t energy; /**< saves previous frame energy. */ + q15_t x0; /**< saves previous input sample. 
*/ + } arm_lms_norm_instance_q15; + + + /** + * @brief Processing function for Q15 normalized LMS filter. + * @param[in] S points to an instance of the Q15 normalized LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void arm_lms_norm_q15( + arm_lms_norm_instance_q15 * S, + const q15_t * pSrc, + q15_t * pRef, + q15_t * pOut, + q15_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for Q15 normalized LMS filter. + * @param[in] S points to an instance of the Q15 normalized LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + */ + void arm_lms_norm_init_q15( + arm_lms_norm_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + q15_t mu, + uint32_t blockSize, + uint8_t postShift); + + + /** + * @brief Correlation of floating-point sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ + void arm_correlate_f32( + const float32_t * pSrcA, + uint32_t srcALen, + const float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst); + + +/** + @brief Correlation of Q15 sequences + @param[in] pSrcA points to the first input sequence + @param[in] srcALen length of the first input sequence + @param[in] pSrcB points to the second input sequence + @param[in] srcBLen length of the second input sequence + @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + @param[in] pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. +*/ +void arm_correlate_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch); + + +/** + @brief Correlation of Q15 sequences. + @param[in] pSrcA points to the first input sequence + @param[in] srcALen length of the first input sequence + @param[in] pSrcB points to the second input sequence + @param[in] srcBLen length of the second input sequence + @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ + void arm_correlate_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst); + + +/** + @brief Correlation of Q15 sequences (fast version). + @param[in] pSrcA points to the first input sequence + @param[in] srcALen length of the first input sequence + @param[in] pSrcB points to the second input sequence + @param[in] srcBLen length of the second input sequence + @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 
+ @return none + */ +void arm_correlate_fast_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst); + + +/** + @brief Correlation of Q15 sequences (fast version). + @param[in] pSrcA points to the first input sequence. + @param[in] srcALen length of the first input sequence. + @param[in] pSrcB points to the second input sequence. + @param[in] srcBLen length of the second input sequence. + @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + @param[in] pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + */ +void arm_correlate_fast_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch); + + + /** + * @brief Correlation of Q31 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ + void arm_correlate_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst); + + +/** + @brief Correlation of Q31 sequences (fast version). + @param[in] pSrcA points to the first input sequence + @param[in] srcALen length of the first input sequence + @param[in] pSrcB points to the second input sequence + @param[in] srcBLen length of the second input sequence + @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ +void arm_correlate_fast_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst); + + + /** + * @brief Correlation of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + * @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + */ + void arm_correlate_opt_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Correlation of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ + void arm_correlate_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst); + + + /** + * @brief Instance structure for the floating-point sparse FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + uint16_t stateIndex; /**< state buffer index. Points to the oldest sample in the state buffer. */ + float32_t *pState; /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. 
*/ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + uint16_t maxDelay; /**< maximum offset specified by the pTapDelay array. */ + int32_t *pTapDelay; /**< points to the array of delay values. The array is of length numTaps. */ + } arm_fir_sparse_instance_f32; + + /** + * @brief Instance structure for the Q31 sparse FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + uint16_t stateIndex; /**< state buffer index. Points to the oldest sample in the state buffer. */ + q31_t *pState; /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + uint16_t maxDelay; /**< maximum offset specified by the pTapDelay array. */ + int32_t *pTapDelay; /**< points to the array of delay values. The array is of length numTaps. */ + } arm_fir_sparse_instance_q31; + + /** + * @brief Instance structure for the Q15 sparse FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + uint16_t stateIndex; /**< state buffer index. Points to the oldest sample in the state buffer. */ + q15_t *pState; /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + uint16_t maxDelay; /**< maximum offset specified by the pTapDelay array. */ + int32_t *pTapDelay; /**< points to the array of delay values. The array is of length numTaps. */ + } arm_fir_sparse_instance_q15; + + /** + * @brief Instance structure for the Q7 sparse FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + uint16_t stateIndex; /**< state buffer index. Points to the oldest sample in the state buffer. */ + q7_t *pState; /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */ + const q7_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + uint16_t maxDelay; /**< maximum offset specified by the pTapDelay array. */ + int32_t *pTapDelay; /**< points to the array of delay values. The array is of length numTaps. */ + } arm_fir_sparse_instance_q7; + + + /** + * @brief Processing function for the floating-point sparse FIR filter. + * @param[in] S points to an instance of the floating-point sparse FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_sparse_f32( + arm_fir_sparse_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + float32_t * pScratchIn, + uint32_t blockSize); + + + /** + * @brief Initialization function for the floating-point sparse FIR filter. + * @param[in,out] S points to an instance of the floating-point sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] pCoeffs points to the array of filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. 
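+ *
+ * @par Example
+ * Editor's illustrative sketch (not part of the original header): a sparse filter with
+ * three nonzero taps; the offsets, coefficient values and block size are placeholders.
+ * @code
+ * #define BLOCK_SIZE 32
+ * #define MAX_DELAY  100
+ * static float32_t coeffs[3]   = { 0.5f, 0.3f, 0.2f };
+ * static int32_t   tapDelay[3] = { 0, 40, MAX_DELAY };  // nonzero taps at these offsets
+ * static float32_t state[MAX_DELAY + BLOCK_SIZE];       // covers maxDelay + blockSize - 1
+ * static float32_t scratch[BLOCK_SIZE];
+ * static float32_t in[BLOCK_SIZE], out[BLOCK_SIZE];
+ * static arm_fir_sparse_instance_f32 S;
+ * arm_fir_sparse_init_f32(&S, 3, coeffs, state, tapDelay, MAX_DELAY, BLOCK_SIZE);
+ * arm_fir_sparse_f32(&S, in, out, scratch, BLOCK_SIZE);
+ * @endcode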
+ */ + void arm_fir_sparse_init_f32( + arm_fir_sparse_instance_f32 * S, + uint16_t numTaps, + const float32_t * pCoeffs, + float32_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q31 sparse FIR filter. + * @param[in] S points to an instance of the Q31 sparse FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_sparse_q31( + arm_fir_sparse_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + q31_t * pScratchIn, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 sparse FIR filter. + * @param[in,out] S points to an instance of the Q31 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] pCoeffs points to the array of filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + */ + void arm_fir_sparse_init_q31( + arm_fir_sparse_instance_q31 * S, + uint16_t numTaps, + const q31_t * pCoeffs, + q31_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q15 sparse FIR filter. + * @param[in] S points to an instance of the Q15 sparse FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] pScratchIn points to a temporary buffer of size blockSize. + * @param[in] pScratchOut points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + */ + void arm_fir_sparse_q15( + arm_fir_sparse_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + q15_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q15 sparse FIR filter. + * @param[in,out] S points to an instance of the Q15 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] pCoeffs points to the array of filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + */ + void arm_fir_sparse_init_q15( + arm_fir_sparse_instance_q15 * S, + uint16_t numTaps, + const q15_t * pCoeffs, + q15_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q7 sparse FIR filter. + * @param[in] S points to an instance of the Q7 sparse FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] pScratchIn points to a temporary buffer of size blockSize. + * @param[in] pScratchOut points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. 
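+ * \par
+ * Note the two scratch buffers: pScratchIn matches the data type while
+ * pScratchOut is a wider q31_t buffer for intermediate results. A minimal
+ * allocation sketch (sizes per the descriptions above; names are
+ * placeholders):
+ * <pre>
+ *     static q7_t  scratchIn[BLOCK_SIZE];
+ *     static q31_t scratchOut[BLOCK_SIZE];
+ *     arm_fir_sparse_q7(&S, in, out, scratchIn, scratchOut, BLOCK_SIZE);
+ * </pre>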
+ */ + void arm_fir_sparse_q7( + arm_fir_sparse_instance_q7 * S, + const q7_t * pSrc, + q7_t * pDst, + q7_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q7 sparse FIR filter. + * @param[in,out] S points to an instance of the Q7 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] pCoeffs points to the array of filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + */ + void arm_fir_sparse_init_q7( + arm_fir_sparse_instance_q7 * S, + uint16_t numTaps, + const q7_t * pCoeffs, + q7_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize); + + + + + + + /** + * @brief floating-point Circular write function. + */ + __STATIC_FORCEINLINE void arm_circularWrite_f32( + int32_t * circBuffer, + int32_t L, + uint16_t * writeOffset, + int32_t bufferInc, + const int32_t * src, + int32_t srcInc, + uint32_t blockSize) + { + uint32_t i = 0U; + int32_t wOffset; + + /* Copy the value of Index pointer that points + * to the current location where the input samples to be copied */ + wOffset = *writeOffset; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the input sample to the circular buffer */ + circBuffer[wOffset] = *src; + + /* Update the input pointer */ + src += srcInc; + + /* Circularly update wOffset. Watch out for positive and negative value */ + wOffset += bufferInc; + if (wOffset >= L) + wOffset -= L; + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *writeOffset = (uint16_t)wOffset; + } + + + + /** + * @brief floating-point Circular Read function. + */ + __STATIC_FORCEINLINE void arm_circularRead_f32( + int32_t * circBuffer, + int32_t L, + int32_t * readOffset, + int32_t bufferInc, + int32_t * dst, + int32_t * dst_base, + int32_t dst_length, + int32_t dstInc, + uint32_t blockSize) + { + uint32_t i = 0U; + int32_t rOffset; + int32_t* dst_end; + + /* Copy the value of Index pointer that points + * to the current location from where the input samples to be read */ + rOffset = *readOffset; + dst_end = dst_base + dst_length; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the sample from the circular buffer to the destination buffer */ + *dst = circBuffer[rOffset]; + + /* Update the input pointer */ + dst += dstInc; + + if (dst == dst_end) + { + dst = dst_base; + } + + /* Circularly update rOffset. Watch out for positive and negative value */ + rOffset += bufferInc; + + if (rOffset >= L) + { + rOffset -= L; + } + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *readOffset = rOffset; + } + + + /** + * @brief Q15 Circular write function. 
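+ * \par
+ * Behaves like the floating-point version above: each stored sample
+ * advances the write offset by bufferInc, wrapping modulo the buffer
+ * length L. Illustrative sketch (src is a placeholder input pointer):
+ * <pre>
+ *     q15_t    circ[8];
+ *     uint16_t wOff = 0;
+ *     // store 3 samples contiguously (bufferInc = 1, srcInc = 1)
+ *     arm_circularWrite_q15(circ, 8, &wOff, 1, src, 1, 3);  // wOff ends at 3
+ * </pre>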
+ */
+ __STATIC_FORCEINLINE void arm_circularWrite_q15(
+   q15_t * circBuffer,
+   int32_t L,
+   uint16_t * writeOffset,
+   int32_t bufferInc,
+   const q15_t * src,
+   int32_t srcInc,
+   uint32_t blockSize)
+ {
+   uint32_t i = 0U;
+   int32_t wOffset;
+
+   /* Copy the value of Index pointer that points
+    * to the current location where the input samples to be copied */
+   wOffset = *writeOffset;
+
+   /* Loop over the blockSize */
+   i = blockSize;
+
+   while (i > 0U)
+   {
+     /* copy the input sample to the circular buffer */
+     circBuffer[wOffset] = *src;
+
+     /* Update the input pointer */
+     src += srcInc;
+
+     /* Circularly update wOffset. Watch out for positive and negative value */
+     wOffset += bufferInc;
+     if (wOffset >= L)
+       wOffset -= L;
+
+     /* Decrement the loop counter */
+     i--;
+   }
+
+   /* Update the index pointer */
+   *writeOffset = (uint16_t)wOffset;
+ }
+
+
+ /**
+ * @brief Q15 Circular Read function.
+ */
+ __STATIC_FORCEINLINE void arm_circularRead_q15(
+   q15_t * circBuffer,
+   int32_t L,
+   int32_t * readOffset,
+   int32_t bufferInc,
+   q15_t * dst,
+   q15_t * dst_base,
+   int32_t dst_length,
+   int32_t dstInc,
+   uint32_t blockSize)
+ {
+   uint32_t i = 0;
+   int32_t rOffset;
+   q15_t* dst_end;
+
+   /* Copy the value of Index pointer that points
+    * to the current location from where the input samples to be read */
+   rOffset = *readOffset;
+
+   dst_end = dst_base + dst_length;
+
+   /* Loop over the blockSize */
+   i = blockSize;
+
+   while (i > 0U)
+   {
+     /* copy the sample from the circular buffer to the destination buffer */
+     *dst = circBuffer[rOffset];
+
+     /* Update the input pointer */
+     dst += dstInc;
+
+     if (dst == dst_end)
+     {
+       dst = dst_base;
+     }
+
+     /* Circularly update rOffset. Watch out for positive and negative value */
+     rOffset += bufferInc;
+
+     if (rOffset >= L)
+     {
+       rOffset -= L;
+     }
+
+     /* Decrement the loop counter */
+     i--;
+   }
+
+   /* Update the index pointer */
+   *readOffset = rOffset;
+ }
+
+
+ /**
+ * @brief Q7 Circular write function.
+ */
+ __STATIC_FORCEINLINE void arm_circularWrite_q7(
+   q7_t * circBuffer,
+   int32_t L,
+   uint16_t * writeOffset,
+   int32_t bufferInc,
+   const q7_t * src,
+   int32_t srcInc,
+   uint32_t blockSize)
+ {
+   uint32_t i = 0U;
+   int32_t wOffset;
+
+   /* Copy the value of Index pointer that points
+    * to the current location where the input samples to be copied */
+   wOffset = *writeOffset;
+
+   /* Loop over the blockSize */
+   i = blockSize;
+
+   while (i > 0U)
+   {
+     /* copy the input sample to the circular buffer */
+     circBuffer[wOffset] = *src;
+
+     /* Update the input pointer */
+     src += srcInc;
+
+     /* Circularly update wOffset. Watch out for positive and negative value */
+     wOffset += bufferInc;
+     if (wOffset >= L)
+       wOffset -= L;
+
+     /* Decrement the loop counter */
+     i--;
+   }
+
+   /* Update the index pointer */
+   *writeOffset = (uint16_t)wOffset;
+ }
+
+
+ /**
+ * @brief Q7 Circular Read function.
+ */ + __STATIC_FORCEINLINE void arm_circularRead_q7( + q7_t * circBuffer, + int32_t L, + int32_t * readOffset, + int32_t bufferInc, + q7_t * dst, + q7_t * dst_base, + int32_t dst_length, + int32_t dstInc, + uint32_t blockSize) + { + uint32_t i = 0; + int32_t rOffset; + q7_t* dst_end; + + /* Copy the value of Index pointer that points + * to the current location from where the input samples to be read */ + rOffset = *readOffset; + + dst_end = dst_base + dst_length; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the sample from the circular buffer to the destination buffer */ + *dst = circBuffer[rOffset]; + + /* Update the input pointer */ + dst += dstInc; + + if (dst == dst_end) + { + dst = dst_base; + } + + /* Circularly update rOffset. Watch out for positive and negative value */ + rOffset += bufferInc; + + if (rOffset >= L) + { + rOffset -= L; + } + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *readOffset = rOffset; + } + + +/** + @brief Levinson Durbin + @param[in] phi autocovariance vector starting with lag 0 (length is nbCoefs + 1) + @param[out] a autoregressive coefficients + @param[out] err prediction error (variance) + @param[in] nbCoefs number of autoregressive coefficients + @return none + */ +void arm_levinson_durbin_f32(const float32_t *phi, + float32_t *a, + float32_t *err, + int nbCoefs); + + +/** + @brief Levinson Durbin + @param[in] phi autocovariance vector starting with lag 0 (length is nbCoefs + 1) + @param[out] a autoregressive coefficients + @param[out] err prediction error (variance) + @param[in] nbCoefs number of autoregressive coefficients + @return none + */ +void arm_levinson_durbin_q31(const q31_t *phi, + q31_t *a, + q31_t *err, + int nbCoefs); + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _FILTERING_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/filtering_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/filtering_functions_f16.h new file mode 100644 index 000000000..6ccb8a2d0 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/filtering_functions_f16.h @@ -0,0 +1,237 @@ +/****************************************************************************** + * @file filtering_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#ifndef _FILTERING_FUNCTIONS_F16_H_ +#define _FILTERING_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + + /** + * @brief Instance structure for the floating-point FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. */ + float16_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const float16_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + } arm_fir_instance_f16; + + /** + * @brief Initialization function for the floating-point FIR filter. + * @param[in,out] S points to an instance of the floating-point FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed at a time. + */ + void arm_fir_init_f16( + arm_fir_instance_f16 * S, + uint16_t numTaps, + const float16_t * pCoeffs, + float16_t * pState, + uint32_t blockSize); + + /** + * @brief Processing function for the floating-point FIR filter. + * @param[in] S points to an instance of the floating-point FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_fir_f16( + const arm_fir_instance_f16 * S, + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + + + /** + * @brief Instance structure for the floating-point Biquad cascade filter. + */ + typedef struct + { + uint32_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float16_t *pState; /**< Points to the array of state coefficients. The array is of length 4*numStages. */ + const float16_t *pCoeffs; /**< Points to the array of coefficients. The array is of length 5*numStages. */ + } arm_biquad_casd_df1_inst_f16; + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + /** + * @brief Instance structure for the modified Biquad coefs required by vectorized code. + */ + typedef struct + { + float16_t coeffs[12][8]; /**< Points to the array of modified coefficients. The array is of length 32. There is one per stage */ + } arm_biquad_mod_coef_f16; +#endif + + /** + * @brief Processing function for the floating-point Biquad cascade filter. + * @param[in] S points to an instance of the floating-point Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_df1_f16( + const arm_biquad_casd_df1_inst_f16 * S, + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + void arm_biquad_cascade_df1_mve_init_f16( + arm_biquad_casd_df1_inst_f16 * S, + uint8_t numStages, + const float16_t * pCoeffs, + arm_biquad_mod_coef_f16 * pCoeffsMod, + float16_t * pState); +#endif + + void arm_biquad_cascade_df1_init_f16( + arm_biquad_casd_df1_inst_f16 * S, + uint8_t numStages, + const float16_t * pCoeffs, + float16_t * pState); + + /** + * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter. 
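+ * \par
+ * Usage sketch (illustrative; requires ARM_FLOAT16_SUPPORTED, and the
+ * coefficient values and I/O buffers are placeholders). Buffer lengths
+ * follow the field descriptions below:
+ * <pre>
+ *     #define STAGES 2
+ *     static float16_t coeffs[5 * STAGES];  // {b0, b1, b2, a1, a2} per stage
+ *     static float16_t state[2 * STAGES];
+ *     arm_biquad_cascade_df2T_instance_f16 S;
+ *
+ *     arm_biquad_cascade_df2T_init_f16(&S, STAGES, coeffs, state);
+ *     arm_biquad_cascade_df2T_f16(&S, in, out, blockSize);
+ * </pre>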
+ */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float16_t *pState; /**< points to the array of state coefficients. The array is of length 2*numStages. */ + const float16_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + } arm_biquad_cascade_df2T_instance_f16; + + /** + * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float16_t *pState; /**< points to the array of state coefficients. The array is of length 4*numStages. */ + const float16_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + } arm_biquad_cascade_stereo_df2T_instance_f16; + + /** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in] S points to an instance of the filter data structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_df2T_f16( + const arm_biquad_cascade_df2T_instance_f16 * S, + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. 2 channels + * @param[in] S points to an instance of the filter data structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void arm_biquad_cascade_stereo_df2T_f16( + const arm_biquad_cascade_stereo_df2T_instance_f16 * S, + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void arm_biquad_cascade_df2T_init_f16( + arm_biquad_cascade_df2T_instance_f16 * S, + uint8_t numStages, + const float16_t * pCoeffs, + float16_t * pState); + + /** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void arm_biquad_cascade_stereo_df2T_init_f16( + arm_biquad_cascade_stereo_df2T_instance_f16 * S, + uint8_t numStages, + const float16_t * pCoeffs, + float16_t * pState); + + /** + * @brief Correlation of floating-point sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. 
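+ * \par
+ * For example, with srcALen = 24 and srcBLen = 8 the destination buffer
+ * must hold 2 * max(24, 8) - 1 = 47 samples.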
+ */ + void arm_correlate_f16( + const float16_t * pSrcA, + uint32_t srcALen, + const float16_t * pSrcB, + uint32_t srcBLen, + float16_t * pDst); + + +/** + @brief Levinson Durbin + @param[in] phi autocovariance vector starting with lag 0 (length is nbCoefs + 1) + @param[out] a autoregressive coefficients + @param[out] err prediction error (variance) + @param[in] nbCoefs number of autoregressive coefficients + @return none + */ +void arm_levinson_durbin_f16(const float16_t *phi, + float16_t *a, + float16_t *err, + int nbCoefs); + +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _FILTERING_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/interpolation_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/interpolation_functions.h new file mode 100644 index 000000000..42bf746c4 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/interpolation_functions.h @@ -0,0 +1,319 @@ +/****************************************************************************** + * @file interpolation_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _INTERPOLATION_FUNCTIONS_H_ +#define _INTERPOLATION_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + +/** + * @defgroup groupInterpolation Interpolation Functions + * These functions perform 1- and 2-dimensional interpolation of data. + * Linear interpolation is used for 1-dimensional data and + * bilinear interpolation is used for 2-dimensional data. + */ + + + /** + * @brief Instance structure for the floating-point Linear Interpolate function. + */ + typedef struct + { + uint32_t nValues; /**< nValues */ + float32_t x1; /**< x1 */ + float32_t xSpacing; /**< xSpacing */ + float32_t *pYData; /**< pointer to the table of Y values */ + } arm_linear_interp_instance_f32; + + /** + * @brief Instance structure for the floating-point bilinear interpolation function. + */ + typedef struct + { + uint16_t numRows; /**< number of rows in the data table. */ + uint16_t numCols; /**< number of columns in the data table. */ + float32_t *pData; /**< points to the data table. */ + } arm_bilinear_interp_instance_f32; + + /** + * @brief Instance structure for the Q31 bilinear interpolation function. + */ + typedef struct + { + uint16_t numRows; /**< number of rows in the data table. */ + uint16_t numCols; /**< number of columns in the data table. */ + q31_t *pData; /**< points to the data table. 
+ */
+ } arm_bilinear_interp_instance_q31;
+
+ /**
+ * @brief Instance structure for the Q15 bilinear interpolation function.
+ */
+ typedef struct
+ {
+   uint16_t numRows; /**< number of rows in the data table. */
+   uint16_t numCols; /**< number of columns in the data table. */
+   q15_t *pData;     /**< points to the data table. */
+ } arm_bilinear_interp_instance_q15;
+
+ /**
+ * @brief Instance structure for the Q7 bilinear interpolation function.
+ */
+ typedef struct
+ {
+   uint16_t numRows; /**< number of rows in the data table. */
+   uint16_t numCols; /**< number of columns in the data table. */
+   q7_t *pData;      /**< points to the data table. */
+ } arm_bilinear_interp_instance_q7;
+
+
+ /**
+ * @brief Enum for specifying the cubic spline type
+ */
+ typedef enum
+ {
+   ARM_SPLINE_NATURAL = 0,          /**< Natural spline */
+   ARM_SPLINE_PARABOLIC_RUNOUT = 1  /**< Parabolic runout spline */
+ } arm_spline_type;
+
+ /**
+ * @brief Instance structure for the floating-point cubic spline interpolation.
+ */
+ typedef struct
+ {
+   arm_spline_type type;  /**< Type (boundary conditions) */
+   const float32_t * x;   /**< x values */
+   const float32_t * y;   /**< y values */
+   uint32_t n_x;          /**< Number of known data points */
+   float32_t * coeffs;    /**< Coefficients buffer (b,c, and d) */
+ } arm_spline_instance_f32;
+
+
+ /**
+ * @ingroup groupInterpolation
+ */
+
+ /**
+ * @addtogroup SplineInterpolate
+ * @{
+ */
+
+ /**
+ * @brief Processing function for the floating-point cubic spline interpolation.
+ * @param[in]  S          points to an instance of the floating-point spline structure.
+ * @param[in]  xq         points to the x values of the interpolated data points.
+ * @param[out] pDst       points to the block of output data.
+ * @param[in]  blockSize  number of samples of output data.
+ */
+ void arm_spline_f32(
+   arm_spline_instance_f32 * S,
+   const float32_t * xq,
+   float32_t * pDst,
+   uint32_t blockSize);
+
+ /**
+ * @brief Initialization function for the floating-point cubic spline interpolation.
+ * @param[in,out] S           points to an instance of the floating-point spline structure.
+ * @param[in]     type        type of cubic spline interpolation (boundary conditions)
+ * @param[in]     x           points to the x values of the known data points.
+ * @param[in]     y           points to the y values of the known data points.
+ * @param[in]     n           number of known data points.
+ * @param[in]     coeffs      coefficients array for b, c, and d
+ * @param[in]     tempBuffer  buffer array for internal computations
+ */
+ void arm_spline_init_f32(
+   arm_spline_instance_f32 * S,
+   arm_spline_type type,
+   const float32_t * x,
+   const float32_t * y,
+   uint32_t n,
+   float32_t * coeffs,
+   float32_t * tempBuffer);
+
+ /**
+ * @} end of SplineInterpolate group
+ */
+
+
+ /**
+ * @addtogroup LinearInterpolate
+ * @{
+ */
+
+ /**
+ * @brief Process function for the floating-point Linear Interpolation Function.
+ * @param[in,out] S  is an instance of the floating-point Linear Interpolation structure
+ * @param[in]     x  input sample to process
+ * @return y processed output sample.
+ *
+ */
+ float32_t arm_linear_interp_f32(
+   arm_linear_interp_instance_f32 * S,
+   float32_t x);
+
+ /**
+ *
+ * @brief Process function for the Q31 Linear Interpolation Function.
+ * @param[in] pYData   pointer to Q31 Linear Interpolation table
+ * @param[in] x        input sample to process
+ * @param[in] nValues  number of table values
+ * @return y processed output sample.
+ *
+ * \par
+ * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+ * This function supports a maximum table size of 2^12.
+ *
+ */
+ q31_t arm_linear_interp_q31(
+   q31_t * pYData,
+   q31_t x,
+   uint32_t nValues);
+
+ /**
+ *
+ * @brief Process function for the Q15 Linear Interpolation Function.
+ * @param[in] pYData   pointer to Q15 Linear Interpolation table
+ * @param[in] x        input sample to process
+ * @param[in] nValues  number of table values
+ * @return y processed output sample.
+ *
+ * \par
+ * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+ * This function supports a maximum table size of 2^12.
+ *
+ */
+ q15_t arm_linear_interp_q15(
+   q15_t * pYData,
+   q31_t x,
+   uint32_t nValues);
+
+ /**
+ *
+ * @brief Process function for the Q7 Linear Interpolation Function.
+ * @param[in] pYData   pointer to Q7 Linear Interpolation table
+ * @param[in] x        input sample to process
+ * @param[in] nValues  number of table values
+ * @return y processed output sample.
+ *
+ * \par
+ * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+ * This function supports a maximum table size of 2^12.
+ */
+q7_t arm_linear_interp_q7(
+   q7_t * pYData,
+   q31_t x,
+   uint32_t nValues);
+
+ /**
+ * @} end of LinearInterpolate group
+ */
+
+
+ /**
+ * @ingroup groupInterpolation
+ */
+
+
+ /**
+ * @addtogroup BilinearInterpolate
+ * @{
+ */
+
+ /**
+ * @brief Floating-point bilinear interpolation.
+ * @param[in,out] S  points to an instance of the interpolation structure.
+ * @param[in]     X  interpolation coordinate.
+ * @param[in]     Y  interpolation coordinate.
+ * @return out interpolated value.
+ */
+ float32_t arm_bilinear_interp_f32(
+   const arm_bilinear_interp_instance_f32 * S,
+   float32_t X,
+   float32_t Y);
+
+ /**
+ * @brief Q31 bilinear interpolation.
+ * @param[in,out] S  points to an instance of the interpolation structure.
+ * @param[in]     X  interpolation coordinate in 12.20 format.
+ * @param[in]     Y  interpolation coordinate in 12.20 format.
+ * @return out interpolated value.
+ */
+ q31_t arm_bilinear_interp_q31(
+   arm_bilinear_interp_instance_q31 * S,
+   q31_t X,
+   q31_t Y);
+
+
+ /**
+ * @brief Q15 bilinear interpolation.
+ * @param[in,out] S  points to an instance of the interpolation structure.
+ * @param[in]     X  interpolation coordinate in 12.20 format.
+ * @param[in]     Y  interpolation coordinate in 12.20 format.
+ * @return out interpolated value.
+ */
+ q15_t arm_bilinear_interp_q15(
+   arm_bilinear_interp_instance_q15 * S,
+   q31_t X,
+   q31_t Y);
+
+ /**
+ * @brief Q7 bilinear interpolation.
+ * @param[in,out] S  points to an instance of the interpolation structure.
+ * @param[in]     X  interpolation coordinate in 12.20 format.
+ * @param[in]     Y  interpolation coordinate in 12.20 format.
+ * @return out interpolated value.
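+ * \par
+ * The 12.20 coordinates match the linear interpolation functions above:
+ * the upper 12 bits select the table cell and the lower 20 bits give the
+ * fractional position within it. A sketch of building such a coordinate
+ * (illustrative only):
+ * <pre>
+ *     // column 5, fractional position 0.25 (0.25 * 2^20 = 0x40000)
+ *     q31_t X = ((q31_t)5 << 20) | 0x40000;
+ * </pre>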
+ */ + q7_t arm_bilinear_interp_q7( + arm_bilinear_interp_instance_q7 * S, + q31_t X, + q31_t Y); + /** + * @} end of BilinearInterpolate group + */ + + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _INTERPOLATION_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/interpolation_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/interpolation_functions_f16.h new file mode 100644 index 000000000..01fd87acc --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/interpolation_functions_f16.h @@ -0,0 +1,107 @@ +/****************************************************************************** + * @file interpolation_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _INTERPOLATION_FUNCTIONS_F16_H_ +#define _INTERPOLATION_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + +typedef struct +{ + uint32_t nValues; /**< nValues */ + float16_t x1; /**< x1 */ + float16_t xSpacing; /**< xSpacing */ + float16_t *pYData; /**< pointer to the table of Y values */ +} arm_linear_interp_instance_f16; + +/** + * @brief Instance structure for the floating-point bilinear interpolation function. + */ +typedef struct +{ + uint16_t numRows;/**< number of rows in the data table. */ + uint16_t numCols;/**< number of columns in the data table. */ + float16_t *pData; /**< points to the data table. */ +} arm_bilinear_interp_instance_f16; + + /** + * @addtogroup LinearInterpolate + * @{ + */ + + /** + * @brief Process function for the floating-point Linear Interpolation Function. + * @param[in,out] S is an instance of the floating-point Linear Interpolation structure + * @param[in] x input sample to process + * @return y processed output sample. + * + */ + float16_t arm_linear_interp_f16( + arm_linear_interp_instance_f16 * S, + float16_t x); + + /** + * @} end of LinearInterpolate group + */ + +/** + * @addtogroup BilinearInterpolate + * @{ + */ + + /** + * @brief Floating-point bilinear interpolation. + * @param[in,out] S points to an instance of the interpolation structure. + * @param[in] X interpolation coordinate. + * @param[in] Y interpolation coordinate. + * @return out interpolated value. 
+ */ + float16_t arm_bilinear_interp_f16( + const arm_bilinear_interp_instance_f16 * S, + float16_t X, + float16_t Y); + + + /** + * @} end of BilinearInterpolate group + */ +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _INTERPOLATION_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/matrix_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/matrix_functions.h new file mode 100644 index 000000000..e03a2f18b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/matrix_functions.h @@ -0,0 +1,742 @@ +/****************************************************************************** + * @file matrix_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _MATRIX_FUNCTIONS_H_ +#define _MATRIX_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @defgroup groupMatrix Matrix Functions + * + * This set of functions provides basic matrix math operations. + * The functions operate on matrix data structures. For example, + * the type + * definition for the floating-point matrix structure is shown + * below: + *
+ *     typedef struct
+ *     {
+ *       uint16_t numRows;     // number of rows of the matrix.
+ *       uint16_t numCols;     // number of columns of the matrix.
+ *       float32_t *pData;     // points to the data of the matrix.
+ *     } arm_matrix_instance_f32;
+ * 
+ * There are similar definitions for Q15 and Q31 data types. + * + * The structure specifies the size of the matrix and then points to + * an array of data. The array is of size numRows X numCols + * and the values are arranged in row order. That is, the + * matrix element (i, j) is stored at: + *
+ *     pData[i*numCols + j]
+ * 
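+ * For example, element (1, 2) of a 2 x 3 matrix is stored at
+ * <pre>
+ *     pData[1*3 + 2] == pData[5]
+ * </pre>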
+ *
+ * \par Init Functions
+ * There is an associated initialization function for each type of matrix
+ * data structure.
+ * The initialization function sets the values of the internal structure fields.
+ * Refer to \ref arm_mat_init_f32(), \ref arm_mat_init_q31() and \ref arm_mat_init_q15()
+ * for floating-point, Q31 and Q15 types, respectively.
+ *
+ * \par
+ * Use of the initialization function is optional. However, if the initialization function is used,
+ * the instance structure cannot be placed into a const data section.
+ * To place the instance structure in a const data
+ * section, initialize the data structure manually. For example:
+ * <pre>
+ * arm_matrix_instance_f32 S = {nRows, nColumns, pData};
+ * arm_matrix_instance_q31 S = {nRows, nColumns, pData};
+ * arm_matrix_instance_q15 S = {nRows, nColumns, pData};
+ * 
+ * where nRows specifies the number of rows, nColumns + * specifies the number of columns, and pData points to the + * data array. + * + * \par Size Checking + * By default all of the matrix functions perform size checking on the input and + * output matrices. For example, the matrix addition function verifies that the + * two input matrices and the output matrix all have the same number of rows and + * columns. If the size check fails the functions return: + *
+ *     ARM_MATH_SIZE_MISMATCH
+ * 
+ * Otherwise the functions return + *
+ *     ARM_MATH_SUCCESS
+ * 
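+ * \par
+ * A typical status check (sketch):
+ * <pre>
+ *     if (arm_mat_add_f32(&A, &B, &C) != ARM_MATH_SUCCESS)
+ *     {
+ *         // sizes of A, B and C disagree - handle the error
+ *     }
+ * </pre>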
+ * There is some overhead associated with this matrix size checking. + * The matrix size checking is enabled via the \#define + *
+ *     ARM_MATH_MATRIX_CHECK
+ * 
+ * within the library project settings. By default this macro is defined + * and size checking is enabled. By changing the project settings and + * undefining this macro size checking is eliminated and the functions + * run a bit faster. With size checking disabled the functions always + * return ARM_MATH_SUCCESS. + */ + + /** + * @brief Instance structure for the floating-point matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + float32_t *pData; /**< points to the data of the matrix. */ + } arm_matrix_instance_f32; + + /** + * @brief Instance structure for the floating-point matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + float64_t *pData; /**< points to the data of the matrix. */ + } arm_matrix_instance_f64; + + /** + * @brief Instance structure for the Q7 matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + q7_t *pData; /**< points to the data of the matrix. */ + } arm_matrix_instance_q7; + + /** + * @brief Instance structure for the Q15 matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + q15_t *pData; /**< points to the data of the matrix. */ + } arm_matrix_instance_q15; + + /** + * @brief Instance structure for the Q31 matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + q31_t *pData; /**< points to the data of the matrix. */ + } arm_matrix_instance_q31; + + /** + * @brief Floating-point matrix addition. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_add_f32( + const arm_matrix_instance_f32 * pSrcA, + const arm_matrix_instance_f32 * pSrcB, + arm_matrix_instance_f32 * pDst); + + /** + * @brief Q15 matrix addition. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_add_q15( + const arm_matrix_instance_q15 * pSrcA, + const arm_matrix_instance_q15 * pSrcB, + arm_matrix_instance_q15 * pDst); + + /** + * @brief Q31 matrix addition. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_add_q31( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point, complex, matrix multiplication. 
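+ * \par
+ * The matrices are assumed here to hold interleaved real/imaginary pairs,
+ * with numRows and numCols counting complex elements; e.g. a 1 x 2
+ * complex matrix occupies four values:
+ * <pre>
+ *     float32_t data[4] = { re0, im0, re1, im1 };  // placeholder values
+ * </pre>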
+ * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_cmplx_mult_f32( + const arm_matrix_instance_f32 * pSrcA, + const arm_matrix_instance_f32 * pSrcB, + arm_matrix_instance_f32 * pDst); + + /** + * @brief Q15, complex, matrix multiplication. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_cmplx_mult_q15( + const arm_matrix_instance_q15 * pSrcA, + const arm_matrix_instance_q15 * pSrcB, + arm_matrix_instance_q15 * pDst, + q15_t * pScratch); + + /** + * @brief Q31, complex, matrix multiplication. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_cmplx_mult_q31( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_trans_f32( + const arm_matrix_instance_f32 * pSrc, + arm_matrix_instance_f32 * pDst); + +/** + * @brief Floating-point matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_trans_f64( + const arm_matrix_instance_f64 * pSrc, + arm_matrix_instance_f64 * pDst); + + /** + * @brief Floating-point complex matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_cmplx_trans_f32( + const arm_matrix_instance_f32 * pSrc, + arm_matrix_instance_f32 * pDst); + + + /** + * @brief Q15 matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_trans_q15( + const arm_matrix_instance_q15 * pSrc, + arm_matrix_instance_q15 * pDst); + + /** + * @brief Q15 complex matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_cmplx_trans_q15( + const arm_matrix_instance_q15 * pSrc, + arm_matrix_instance_q15 * pDst); + + /** + * @brief Q7 matrix transpose. 
+ * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_trans_q7( + const arm_matrix_instance_q7 * pSrc, + arm_matrix_instance_q7 * pDst); + + /** + * @brief Q31 matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_trans_q31( + const arm_matrix_instance_q31 * pSrc, + arm_matrix_instance_q31 * pDst); + + /** + * @brief Q31 complex matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_cmplx_trans_q31( + const arm_matrix_instance_q31 * pSrc, + arm_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_mult_f32( + const arm_matrix_instance_f32 * pSrcA, + const arm_matrix_instance_f32 * pSrcB, + arm_matrix_instance_f32 * pDst); + + /** + * @brief Floating-point matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_mult_f64( + const arm_matrix_instance_f64 * pSrcA, + const arm_matrix_instance_f64 * pSrcB, + arm_matrix_instance_f64 * pDst); + + /** + * @brief Floating-point matrix and vector multiplication + * @param[in] pSrcMat points to the input matrix structure + * @param[in] pVec points to vector + * @param[out] pDst points to output vector + */ +void arm_mat_vec_mult_f32( + const arm_matrix_instance_f32 *pSrcMat, + const float32_t *pVec, + float32_t *pDst); + + /** + * @brief Q7 matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @param[in] pState points to the array for storing intermediate results + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
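+ * \par
+ * Sketch of providing the scratch buffer (sizing it to hold a full copy
+ * of pSrcB is an assumption made here for illustration; names are
+ * placeholders):
+ * <pre>
+ *     static q7_t state[B_ROWS * B_COLS];  // intermediate storage
+ *     arm_status st = arm_mat_mult_q7(&A, &B, &C, state);
+ * </pre>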
+ */ +arm_status arm_mat_mult_q7( + const arm_matrix_instance_q7 * pSrcA, + const arm_matrix_instance_q7 * pSrcB, + arm_matrix_instance_q7 * pDst, + q7_t * pState); + + /** + * @brief Q7 matrix and vector multiplication + * @param[in] pSrcMat points to the input matrix structure + * @param[in] pVec points to vector + * @param[out] pDst points to output vector + */ +void arm_mat_vec_mult_q7( + const arm_matrix_instance_q7 *pSrcMat, + const q7_t *pVec, + q7_t *pDst); + + /** + * @brief Q15 matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @param[in] pState points to the array for storing intermediate results + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_mult_q15( + const arm_matrix_instance_q15 * pSrcA, + const arm_matrix_instance_q15 * pSrcB, + arm_matrix_instance_q15 * pDst, + q15_t * pState); + + /** + * @brief Q15 matrix and vector multiplication + * @param[in] pSrcMat points to the input matrix structure + * @param[in] pVec points to vector + * @param[out] pDst points to output vector + */ +void arm_mat_vec_mult_q15( + const arm_matrix_instance_q15 *pSrcMat, + const q15_t *pVec, + q15_t *pDst); + + /** + * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4 + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @param[in] pState points to the array for storing intermediate results + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_mult_fast_q15( + const arm_matrix_instance_q15 * pSrcA, + const arm_matrix_instance_q15 * pSrcB, + arm_matrix_instance_q15 * pDst, + q15_t * pState); + + /** + * @brief Q31 matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_mult_q31( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst); + + /** + * @brief Q31 matrix and vector multiplication + * @param[in] pSrcMat points to the input matrix structure + * @param[in] pVec points to vector + * @param[out] pDst points to output vector + */ +void arm_mat_vec_mult_q31( + const arm_matrix_instance_q31 *pSrcMat, + const q31_t *pVec, + q31_t *pDst); + + /** + * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4 + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
+ */ +arm_status arm_mat_mult_fast_q31( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point matrix subtraction + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_sub_f32( + const arm_matrix_instance_f32 * pSrcA, + const arm_matrix_instance_f32 * pSrcB, + arm_matrix_instance_f32 * pDst); + + /** + * @brief Floating-point matrix subtraction + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_sub_f64( + const arm_matrix_instance_f64 * pSrcA, + const arm_matrix_instance_f64 * pSrcB, + arm_matrix_instance_f64 * pDst); + + /** + * @brief Q15 matrix subtraction + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_sub_q15( + const arm_matrix_instance_q15 * pSrcA, + const arm_matrix_instance_q15 * pSrcB, + arm_matrix_instance_q15 * pDst); + + /** + * @brief Q31 matrix subtraction + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_sub_q31( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point matrix scaling. + * @param[in] pSrc points to the input matrix + * @param[in] scale scale factor + * @param[out] pDst points to the output matrix + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_scale_f32( + const arm_matrix_instance_f32 * pSrc, + float32_t scale, + arm_matrix_instance_f32 * pDst); + + /** + * @brief Q15 matrix scaling. + * @param[in] pSrc points to input matrix + * @param[in] scaleFract fractional portion of the scale factor + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to output matrix + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_scale_q15( + const arm_matrix_instance_q15 * pSrc, + q15_t scaleFract, + int32_t shift, + arm_matrix_instance_q15 * pDst); + + /** + * @brief Q31 matrix scaling. + * @param[in] pSrc points to input matrix + * @param[in] scaleFract fractional portion of the scale factor + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
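+ * \par
+ * The fast variant typically uses a narrower accumulator than
+ * arm_mat_mult_q31(), trading precision for speed, so prefer it only when
+ * the inputs are well scaled. Illustrative choice (the build switch is a
+ * placeholder):
+ * <pre>
+ *     #if defined(USE_FAST_MAT_MULT)
+ *     arm_status st = arm_mat_mult_fast_q31(&A, &B, &C);
+ *     #else
+ *     arm_status st = arm_mat_mult_q31(&A, &B, &C);
+ *     #endif
+ * </pre>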
+ */ +arm_status arm_mat_scale_q31( + const arm_matrix_instance_q31 * pSrc, + q31_t scaleFract, + int32_t shift, + arm_matrix_instance_q31 * pDst); + + /** + * @brief Q31 matrix initialization. + * @param[in,out] S points to an instance of the floating-point matrix structure. + * @param[in] nRows number of rows in the matrix. + * @param[in] nColumns number of columns in the matrix. + * @param[in] pData points to the matrix data array. + */ +void arm_mat_init_q31( + arm_matrix_instance_q31 * S, + uint16_t nRows, + uint16_t nColumns, + q31_t * pData); + + /** + * @brief Q15 matrix initialization. + * @param[in,out] S points to an instance of the floating-point matrix structure. + * @param[in] nRows number of rows in the matrix. + * @param[in] nColumns number of columns in the matrix. + * @param[in] pData points to the matrix data array. + */ +void arm_mat_init_q15( + arm_matrix_instance_q15 * S, + uint16_t nRows, + uint16_t nColumns, + q15_t * pData); + + /** + * @brief Floating-point matrix initialization. + * @param[in,out] S points to an instance of the floating-point matrix structure. + * @param[in] nRows number of rows in the matrix. + * @param[in] nColumns number of columns in the matrix. + * @param[in] pData points to the matrix data array. + */ +void arm_mat_init_f32( + arm_matrix_instance_f32 * S, + uint16_t nRows, + uint16_t nColumns, + float32_t * pData); + + + + /** + * @brief Floating-point matrix inverse. + * @param[in] src points to the instance of the input floating-point matrix structure. + * @param[out] dst points to the instance of the output floating-point matrix structure. + * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match. + * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR. + */ + arm_status arm_mat_inverse_f32( + const arm_matrix_instance_f32 * src, + arm_matrix_instance_f32 * dst); + + + /** + * @brief Floating-point matrix inverse. + * @param[in] src points to the instance of the input floating-point matrix structure. + * @param[out] dst points to the instance of the output floating-point matrix structure. + * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match. + * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR. + */ + arm_status arm_mat_inverse_f64( + const arm_matrix_instance_f64 * src, + arm_matrix_instance_f64 * dst); + + /** + * @brief Floating-point Cholesky decomposition of Symmetric Positive Definite Matrix. + * @param[in] src points to the instance of the input floating-point matrix structure. + * @param[out] dst points to the instance of the output floating-point matrix structure. + * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match. + * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE. + * If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition. + * The decomposition is returning a lower triangular matrix. + */ + arm_status arm_mat_cholesky_f64( + const arm_matrix_instance_f64 * src, + arm_matrix_instance_f64 * dst); + + /** + * @brief Floating-point Cholesky decomposition of Symmetric Positive Definite Matrix. + * @param[in] src points to the instance of the input floating-point matrix structure. 
+ * @param[out] dst points to the instance of the output floating-point matrix structure. + * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match. + * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE. + * If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition. + * The decomposition is returning a lower triangular matrix. + */ + arm_status arm_mat_cholesky_f32( + const arm_matrix_instance_f32 * src, + arm_matrix_instance_f32 * dst); + + /** + * @brief Solve UT . X = A where UT is an upper triangular matrix + * @param[in] ut The upper triangular matrix + * @param[in] a The matrix a + * @param[out] dst The solution X of UT . X = A + * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. + */ + arm_status arm_mat_solve_upper_triangular_f32( + const arm_matrix_instance_f32 * ut, + const arm_matrix_instance_f32 * a, + arm_matrix_instance_f32 * dst); + + /** + * @brief Solve LT . X = A where LT is a lower triangular matrix + * @param[in] lt The lower triangular matrix + * @param[in] a The matrix a + * @param[out] dst The solution X of LT . X = A + * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. + */ + arm_status arm_mat_solve_lower_triangular_f32( + const arm_matrix_instance_f32 * lt, + const arm_matrix_instance_f32 * a, + arm_matrix_instance_f32 * dst); + + + /** + * @brief Solve UT . X = A where UT is an upper triangular matrix + * @param[in] ut The upper triangular matrix + * @param[in] a The matrix a + * @param[out] dst The solution X of UT . X = A + * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. + */ + arm_status arm_mat_solve_upper_triangular_f64( + const arm_matrix_instance_f64 * ut, + const arm_matrix_instance_f64 * a, + arm_matrix_instance_f64 * dst); + + /** + * @brief Solve LT . X = A where LT is a lower triangular matrix + * @param[in] lt The lower triangular matrix + * @param[in] a The matrix a + * @param[out] dst The solution X of LT . X = A + * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. + */ + arm_status arm_mat_solve_lower_triangular_f64( + const arm_matrix_instance_f64 * lt, + const arm_matrix_instance_f64 * a, + arm_matrix_instance_f64 * dst); + + + /** + * @brief Floating-point LDL decomposition of Symmetric Positive Semi-Definite Matrix. + * @param[in] src points to the instance of the input floating-point matrix structure. + * @param[out] l points to the instance of the output floating-point triangular matrix structure. + * @param[out] d points to the instance of the output floating-point diagonal matrix structure. + * @param[out] p points to the instance of the output floating-point permutation vector. + * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match. + * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE. + * The decomposition is returning a lower triangular matrix. + */ + arm_status arm_mat_ldlt_f32( + const arm_matrix_instance_f32 * src, + arm_matrix_instance_f32 * l, + arm_matrix_instance_f32 * d, + uint16_t * pp); + + /** + * @brief Floating-point LDL decomposition of Symmetric Positive Semi-Definite Matrix. + * @param[in] src points to the instance of the input floating-point matrix structure. 
+   * @param[out] l     points to the instance of the output floating-point triangular matrix structure.
+   * @param[out] d     points to the instance of the output floating-point diagonal matrix structure.
+   * @param[out] pp    points to the output permutation vector.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * The decomposition returns a lower triangular matrix.
+   */
+  arm_status arm_mat_ldlt_f64(
+  const arm_matrix_instance_f64 * src,
+  arm_matrix_instance_f64 * l,
+  arm_matrix_instance_f64 * d,
+  uint16_t * pp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _MATRIX_FUNCTIONS_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/matrix_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/matrix_functions_f16.h
new file mode 100644
index 000000000..62876a76b
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/matrix_functions_f16.h
@@ -0,0 +1,221 @@
+/******************************************************************************
+ * @file     matrix_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef _MATRIX_FUNCTIONS_F16_H_
+#define _MATRIX_FUNCTIONS_F16_H_
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+  /**
+   * @brief Instance structure for the floating-point matrix.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    float16_t *pData;     /**< points to the data of the matrix. */
+  } arm_matrix_instance_f16;
+
+  /**
+   * @brief Floating-point matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return The function returns either
+   * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+   */
+arm_status arm_mat_add_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst);
+
+  /**
+   * @brief Floating-point complex matrix multiplication.
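+   * The input matrices are complex: assuming the same convention as the f32
+   * variant, the real and imaginary parts are interleaved in pData, so each
+   * instance stores 2 * numRows * numCols float16_t values.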
+ * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_cmplx_mult_f16( + const arm_matrix_instance_f16 * pSrcA, + const arm_matrix_instance_f16 * pSrcB, + arm_matrix_instance_f16 * pDst); + + /** + * @brief Floating-point matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_trans_f16( + const arm_matrix_instance_f16 * pSrc, + arm_matrix_instance_f16 * pDst); + + /** + * @brief Floating-point complex matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either ARM_MATH_SIZE_MISMATCH + * or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_cmplx_trans_f16( + const arm_matrix_instance_f16 * pSrc, + arm_matrix_instance_f16 * pDst); + + /** + * @brief Floating-point matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_mult_f16( + const arm_matrix_instance_f16 * pSrcA, + const arm_matrix_instance_f16 * pSrcB, + arm_matrix_instance_f16 * pDst); + /** + * @brief Floating-point matrix and vector multiplication + * @param[in] pSrcMat points to the input matrix structure + * @param[in] pVec points to vector + * @param[out] pDst points to output vector + */ +void arm_mat_vec_mult_f16( + const arm_matrix_instance_f16 *pSrcMat, + const float16_t *pVec, + float16_t *pDst); + + /** + * @brief Floating-point matrix subtraction + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_sub_f16( + const arm_matrix_instance_f16 * pSrcA, + const arm_matrix_instance_f16 * pSrcB, + arm_matrix_instance_f16 * pDst); + + /** + * @brief Floating-point matrix scaling. + * @param[in] pSrc points to the input matrix + * @param[in] scale scale factor + * @param[out] pDst points to the output matrix + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_scale_f16( + const arm_matrix_instance_f16 * pSrc, + float16_t scale, + arm_matrix_instance_f16 * pDst); + + /** + * @brief Floating-point matrix initialization. + * @param[in,out] S points to an instance of the floating-point matrix structure. + * @param[in] nRows number of rows in the matrix. + * @param[in] nColumns number of columns in the matrix. + * @param[in] pData points to the matrix data array. + */ +void arm_mat_init_f16( + arm_matrix_instance_f16 * S, + uint16_t nRows, + uint16_t nColumns, + float16_t * pData); + + + /** + * @brief Floating-point matrix inverse. 
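+   * A minimal usage sketch (hypothetical 2x2 buffers), using arm_mat_init_f16
+   * declared above:
+   * <pre>
+   *     float16_t inData[4] = { 4, 7, 2, 6 };   // an invertible 2x2 matrix
+   *     float16_t outData[4];
+   *     arm_matrix_instance_f16 in, out;
+   *     arm_mat_init_f16(&in,  2, 2, inData);
+   *     arm_mat_init_f16(&out, 2, 2, outData);
+   *     arm_status status = arm_mat_inverse_f16(&in, &out);  // check status before using out
+   * </pre>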
+ * @param[in]  src   points to the instance of the input floating-point matrix structure.
+ * @param[out] dst   points to the instance of the output floating-point matrix structure.
+ * @return The function returns ARM_MATH_SIZE_MISMATCH if the dimensions do not match.
+ * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
+ */
+  arm_status arm_mat_inverse_f16(
+  const arm_matrix_instance_f16 * src,
+  arm_matrix_instance_f16 * dst);
+
+
+  /**
+   * @brief Floating-point Cholesky decomposition of Symmetric Positive Definite Matrix.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * If the matrix is ill-conditioned or only semi-definite, it is better to use the LDL^t decomposition.
+   * The decomposition returns a lower triangular matrix.
+   */
+  arm_status arm_mat_cholesky_f16(
+  const arm_matrix_instance_f16 * src,
+  arm_matrix_instance_f16 * dst);
+
+  /**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut   The upper triangular matrix
+   * @param[in]  a    The matrix a
+   * @param[out] dst  The solution X of UT . X = A
+   * @return The function returns ARM_MATH_SINGULAR if the system can't be solved.
+   */
+  arm_status arm_mat_solve_upper_triangular_f16(
+  const arm_matrix_instance_f16 * ut,
+  const arm_matrix_instance_f16 * a,
+  arm_matrix_instance_f16 * dst);
+
+  /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt   The lower triangular matrix
+   * @param[in]  a    The matrix a
+   * @param[out] dst  The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR if the system can't be solved.
+   */
+  arm_status arm_mat_solve_lower_triangular_f16(
+  const arm_matrix_instance_f16 * lt,
+  const arm_matrix_instance_f16 * a,
+  arm_matrix_instance_f16 * dst);
+
+
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _MATRIX_FUNCTIONS_F16_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/none.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/none.h
new file mode 100644
index 000000000..62f2d144a
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/none.h
@@ -0,0 +1,576 @@
+/******************************************************************************
+ * @file     none.h
+ * @brief    Intrinsics when no DSP extension available
+ * @version  V1.9.0
+ * @date     20. July 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+
+The definitions in this file allow some versions of CMSIS-DSP to be
+built for a core (an M0, for instance) or a host where the DSP
+extension is not available.
+
+Ideally a pure C version would be used instead, but such versions
+are not always available, or they rely on a restricted set of
+intrinsics.
+
+*/
+
+#ifndef _NONE_H_
+#define _NONE_H_
+
+#include "arm_math_types.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+
+/*
+
+Normally this kind of definition lives in a compiler file in
+Core or Core_A.
+
+The MSVC compiler is a special case: the goal here is specific to
+CMSIS-DSP and is only to allow the use of this library from other
+systems like Python or Matlab.
+
+MSVC is not used to cross-compile to Arm, so an MSVC compiler file
+in Core or Core_A would not make sense.
+
+*/
+#if defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
+    __STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data)
+    {
+      if (data == 0U) { return 32U; }
+
+      uint32_t count = 0U;
+      uint32_t mask = 0x80000000U;
+
+      while ((data & mask) == 0U)
+      {
+        count += 1U;
+        mask = mask >> 1U;
+      }
+      return count;
+    }
+
+  __STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+  {
+    if ((sat >= 1U) && (sat <= 32U))
+    {
+      const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+      const int32_t min = -1 - max;
+      if (val > max)
+      {
+        return max;
+      }
+      else if (val < min)
+      {
+        return min;
+      }
+    }
+    return val;
+  }
+
+  __STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+  {
+    if (sat <= 31U)
+    {
+      const uint32_t max = ((1U << sat) - 1U);
+      if (val > (int32_t)max)
+      {
+        return max;
+      }
+      else if (val < 0)
+      {
+        return 0U;
+      }
+    }
+    return (uint32_t)val;
+  }
+
+  /**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+  */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  op2 %= 32U;
+  if (op2 == 0U)
+  {
+    return op1;
+  }
+  return (op1 >> op2) | (op1 << (32U - op2));
+}
+
+
+#endif
+
+/**
+ * @brief Clips Q63 to Q31 values.
+ */
+  __STATIC_FORCEINLINE q31_t clip_q63_to_q31(
+  q63_t x)
+  {
+    return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
+      ((0x7FFFFFFF ^ ((q31_t) (x >> 63)))) : (q31_t) x;
+  }
+
+  /**
+   * @brief Clips Q63 to Q15 values.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q63_to_q15(
+  q63_t x)
+  {
+    return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
+      ((0x7FFF ^ ((q15_t) (x >> 63)))) : (q15_t) (x >> 15);
+  }
+
+  /**
+   * @brief Clips Q31 to Q7 values.
+   */
+  __STATIC_FORCEINLINE q7_t clip_q31_to_q7(
+  q31_t x)
+  {
+    return ((q31_t) (x >> 24) != ((q31_t) x >> 23)) ?
+      ((0x7F ^ ((q7_t) (x >> 31)))) : (q7_t) x;
+  }
+
+  /**
+   * @brief Clips Q31 to Q15 values.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q31_to_q15(
+  q31_t x)
+  {
+    return ((q31_t) (x >> 16) != ((q31_t) x >> 15)) ?
+      ((0x7FFF ^ ((q15_t) (x >> 31)))) : (q15_t) x;
+  }
+
+  /**
+   * @brief Multiplies 32 X 64 and returns 32 bit result in 2.30 format.
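+   *
+   * As plain C has no 64 x 32 -> 96 bit multiply, the implementation below
+   * effectively computes ((q63_t) x * y) >> 32 by splitting x into its low
+   * and high 32-bit halves and summing the two partial products.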
+ */ + __STATIC_FORCEINLINE q63_t mult32x64( + q63_t x, + q31_t y) + { + return ((((q63_t) (x & 0x00000000FFFFFFFF) * y) >> 32) + + (((q63_t) (x >> 32) * y) ) ); + } + +/* SMMLAR */ +#define multAcc_32x32_keep32_R(a, x, y) \ + a = (q31_t) (((((q63_t) a) << 32) + ((q63_t) x * y) + 0x80000000LL ) >> 32) + +/* SMMLSR */ +#define multSub_32x32_keep32_R(a, x, y) \ + a = (q31_t) (((((q63_t) a) << 32) - ((q63_t) x * y) + 0x80000000LL ) >> 32) + +/* SMMULR */ +#define mult_32x32_keep32_R(a, x, y) \ + a = (q31_t) (((q63_t) x * y + 0x80000000LL ) >> 32) + +/* SMMLA */ +#define multAcc_32x32_keep32(a, x, y) \ + a += (q31_t) (((q63_t) x * y) >> 32) + +/* SMMLS */ +#define multSub_32x32_keep32(a, x, y) \ + a -= (q31_t) (((q63_t) x * y) >> 32) + +/* SMMUL */ +#define mult_32x32_keep32(a, x, y) \ + a = (q31_t) (((q63_t) x * y ) >> 32) + +#ifndef ARM_MATH_DSP + /** + * @brief definition to pack two 16 bit values. + */ + #define __PKHBT(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) << 0) & (int32_t)0x0000FFFF) | \ + (((int32_t)(ARG2) << ARG3) & (int32_t)0xFFFF0000) ) + #define __PKHTB(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) << 0) & (int32_t)0xFFFF0000) | \ + (((int32_t)(ARG2) >> ARG3) & (int32_t)0x0000FFFF) ) +#endif + + /** + * @brief definition to pack four 8 bit values. + */ +#ifndef ARM_MATH_BIG_ENDIAN + #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) << 0) & (int32_t)0x000000FF) | \ + (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \ + (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \ + (((int32_t)(v3) << 24) & (int32_t)0xFF000000) ) +#else + #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v3) << 0) & (int32_t)0x000000FF) | \ + (((int32_t)(v2) << 8) & (int32_t)0x0000FF00) | \ + (((int32_t)(v1) << 16) & (int32_t)0x00FF0000) | \ + (((int32_t)(v0) << 24) & (int32_t)0xFF000000) ) +#endif + + + + +/* + * @brief C custom defined intrinsic functions + */ +#if !defined (ARM_MATH_DSP) + + + /* + * @brief C custom defined QADD8 + */ + __STATIC_FORCEINLINE uint32_t __QADD8( + uint32_t x, + uint32_t y) + { + q31_t r, s, t, u; + + r = __SSAT(((((q31_t)x << 24) >> 24) + (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF; + s = __SSAT(((((q31_t)x << 16) >> 24) + (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF; + t = __SSAT(((((q31_t)x << 8) >> 24) + (((q31_t)y << 8) >> 24)), 8) & (int32_t)0x000000FF; + u = __SSAT(((((q31_t)x ) >> 24) + (((q31_t)y ) >> 24)), 8) & (int32_t)0x000000FF; + + return ((uint32_t)((u << 24) | (t << 16) | (s << 8) | (r ))); + } + + + /* + * @brief C custom defined QSUB8 + */ + __STATIC_FORCEINLINE uint32_t __QSUB8( + uint32_t x, + uint32_t y) + { + q31_t r, s, t, u; + + r = __SSAT(((((q31_t)x << 24) >> 24) - (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF; + s = __SSAT(((((q31_t)x << 16) >> 24) - (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF; + t = __SSAT(((((q31_t)x << 8) >> 24) - (((q31_t)y << 8) >> 24)), 8) & (int32_t)0x000000FF; + u = __SSAT(((((q31_t)x ) >> 24) - (((q31_t)y ) >> 24)), 8) & (int32_t)0x000000FF; + + return ((uint32_t)((u << 24) | (t << 16) | (s << 8) | (r ))); + } + + + /* + * @brief C custom defined QADD16 + */ + __STATIC_FORCEINLINE uint32_t __QADD16( + uint32_t x, + uint32_t y) + { +/* q31_t r, s; without initialisation 'arm_offset_q15 test' fails but 'intrinsic' tests pass! 
for armCC */ + q31_t r = 0, s = 0; + + r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; + s = __SSAT(((((q31_t)x ) >> 16) + (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SHADD16 + */ + __STATIC_FORCEINLINE uint32_t __SHADD16( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = (((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF; + s = (((((q31_t)x ) >> 16) + (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined QSUB16 + */ + __STATIC_FORCEINLINE uint32_t __QSUB16( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; + s = __SSAT(((((q31_t)x ) >> 16) - (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SHSUB16 + */ + __STATIC_FORCEINLINE uint32_t __SHSUB16( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = (((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF; + s = (((((q31_t)x ) >> 16) - (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined QASX + */ + __STATIC_FORCEINLINE uint32_t __QASX( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF; + s = __SSAT(((((q31_t)x ) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SHASX + */ + __STATIC_FORCEINLINE uint32_t __SHASX( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = (((((q31_t)x << 16) >> 16) - (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF; + s = (((((q31_t)x ) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined QSAX + */ + __STATIC_FORCEINLINE uint32_t __QSAX( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF; + s = __SSAT(((((q31_t)x ) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SHSAX + */ + __STATIC_FORCEINLINE uint32_t __SHSAX( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = (((((q31_t)x << 16) >> 16) + (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF; + s = (((((q31_t)x ) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SMUSDX + */ + __STATIC_FORCEINLINE uint32_t __SMUSDX( + uint32_t x, + uint32_t y) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) - + ((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) )); + } + + /* + * @brief C custom defined SMUADX + */ + __STATIC_FORCEINLINE uint32_t __SMUADX( + uint32_t x, + uint32_t y) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) + + ((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) )); + } + + + /* + * @brief C custom defined QADD + */ + __STATIC_FORCEINLINE int32_t __QADD( + int32_t x, + int32_t y) + { + return ((int32_t)(clip_q63_to_q31((q63_t)x + (q31_t)y))); + } + + + /* + * @brief C custom defined QSUB + */ + __STATIC_FORCEINLINE int32_t __QSUB( + 
int32_t x,
+  int32_t y)
+  {
+    return ((int32_t)(clip_q63_to_q31((q63_t)x - (q31_t)y)));
+  }
+
+
+  /*
+   * @brief C custom defined SMLAD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLAD(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLADX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLADX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLSDX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLSDX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) -
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLALD
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALD(
+  uint32_t x,
+  uint32_t y,
+  uint64_t sum)
+  {
+/*  return (sum + ((q15_t) (x >> 16) * (q15_t) (y >> 16)) + ((q15_t) x * (q15_t) y)); */
+    return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ( ((q63_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLALDX
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALDX(
+  uint32_t x,
+  uint32_t y,
+  uint64_t sum)
+  {
+/*  return (sum + ((q15_t) (x >> 16) * (q15_t) y)) + ((q15_t) x * (q15_t) (y >> 16)); */
+    return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q63_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMUAD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUAD(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16))   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMUSD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSD(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) -
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16))   ));
+  }
+
+
+  /*
+   * @brief C custom defined SXTB16
+   */
+  __STATIC_FORCEINLINE uint32_t __SXTB16(
+  uint32_t x)
+  {
+    return ((uint32_t)(((((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF) |
+                       ((((q31_t)x <<  8) >>  8) & (q31_t)0xFFFF0000)   ));
+  }
+
+  /*
+   * @brief C custom defined SMMLA
+   */
+  __STATIC_FORCEINLINE int32_t __SMMLA(
+  int32_t x,
+  int32_t y,
+  int32_t sum)
+  {
+    return (sum + (int32_t) (((int64_t) x * y) >> 32));
+  }
+
+#endif /* !defined (ARM_MATH_DSP) */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _NONE_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/quaternion_math_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/quaternion_math_functions.h
new file mode 100644
index 000000000..2e1f2e0af
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/quaternion_math_functions.h
@@ -0,0 +1,159 @@
+/******************************************************************************
+ * @file     quaternion_math_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates.
All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef _QUATERNION_MATH_FUNCTIONS_H_
+#define _QUATERNION_MATH_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @defgroup groupQuaternionMath Quaternion Math Functions
+ * Functions to operate on quaternions and convert between a
+ * rotation and a quaternion representation.
+ */
+
+
+/**
+  @brief         Floating-point quaternion norm.
+  @param[in]     pInputQuaternions       points to the input vector of quaternions
+  @param[out]    pNorms                  points to the output vector of norms
+  @param[in]     nbQuaternions           number of quaternions in each vector
+  @return        none
+ */
+
+
+
+void arm_quaternion_norm_f32(const float32_t *pInputQuaternions,
+    float32_t *pNorms,
+    uint32_t nbQuaternions);
+
+
+/**
+  @brief         Floating-point quaternion inverse.
+  @param[in]     pInputQuaternions     points to the input vector of quaternions
+  @param[out]    pInverseQuaternions   points to the output vector of inverse quaternions
+  @param[in]     nbQuaternions         number of quaternions in each vector
+  @return        none
+ */
+
+void arm_quaternion_inverse_f32(const float32_t *pInputQuaternions,
+    float32_t *pInverseQuaternions,
+    uint32_t nbQuaternions);
+
+/**
+  @brief         Floating-point quaternion conjugates.
+  @param[in]     inputQuaternions           points to the input vector of quaternions
+  @param[out]    pConjugateQuaternions      points to the output vector of conjugate quaternions
+  @param[in]     nbQuaternions              number of quaternions in each vector
+  @return        none
+ */
+void arm_quaternion_conjugate_f32(const float32_t *inputQuaternions,
+    float32_t *pConjugateQuaternions,
+    uint32_t nbQuaternions);
+
+/**
+  @brief         Floating-point normalization of quaternions.
+  @param[in]     inputQuaternions             points to the input vector of quaternions
+  @param[out]    pNormalizedQuaternions       points to the output vector of normalized quaternions
+  @param[in]     nbQuaternions                number of quaternions in each vector
+  @return        none
+ */
+void arm_quaternion_normalize_f32(const float32_t *inputQuaternions,
+    float32_t *pNormalizedQuaternions,
+    uint32_t nbQuaternions);
+
+
+/**
+  @brief         Floating-point product of two quaternions.
+  @param[in]     qa                  First quaternion
+  @param[in]     qb                  Second quaternion
+  @param[out]    r                   Product of two quaternions
+  @return        none
+ */
+void arm_quaternion_product_single_f32(const float32_t *qa,
+    const float32_t *qb,
+    float32_t *r);
+
+/**
+  @brief         Floating-point elementwise product of two quaternions.
+  @param[in]     qa                  First array of quaternions
+  @param[in]     qb                  Second array of quaternions
+  @param[out]    r                   Elementwise product of quaternions
+  @param[in]     nbQuaternions       Number of quaternions in the array
+  @return        none
+ */
+void arm_quaternion_product_f32(const float32_t *qa,
+    const float32_t *qb,
+    float32_t *r,
+    uint32_t nbQuaternions);
+
+/**
+ * @brief Conversion of quaternion to equivalent rotation matrix.
+ * @param[in]  pInputQuaternions   points to an array of normalized quaternions
+ * @param[out] pOutputRotations    points to an array of 3x3 rotations (in row order)
+ * @param[in]  nbQuaternions       number of quaternions in the array
+ * @return none.
+ *
+ * Format of rotation matrix
+ * \par
+ * The quaternion a + ib + jc + kd is converted into the rotation matrix:
+ * <pre>
+ *     a^2 + b^2 - c^2 - d^2     2bc - 2ad                 2bd + 2ac
+ *     2bc + 2ad                 a^2 - b^2 + c^2 - d^2     2cd - 2ab
+ *     2bd - 2ac                 2cd + 2ab                 a^2 - b^2 - c^2 + d^2
+ * </pre>
+ * The rotation matrix is saved in row order: R00 R01 R02 R10 R11 R12 R20 R21 R22
+ */
+void arm_quaternion2rotation_f32(const float32_t *pInputQuaternions,
+    float32_t *pOutputRotations,
+    uint32_t nbQuaternions);
+
+/**
+ * @brief Conversion of a rotation matrix to an equivalent quaternion.
+ * @param[in]  pInputRotations     points to an array of 3x3 rotation matrices (in row order)
+ * @param[out] pOutputQuaternions  points to an array of quaternions
+ * @param[in]  nbQuaternions       number of quaternions in the array
+ * @return none.
+*/
+void arm_rotation2quaternion_f32(const float32_t *pInputRotations,
+    float32_t *pOutputQuaternions,
+    uint32_t nbQuaternions);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _QUATERNION_MATH_FUNCTIONS_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/statistics_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/statistics_functions.h
new file mode 100644
index 000000000..ee5c69265
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/statistics_functions.h
@@ -0,0 +1,586 @@
+/******************************************************************************
+ * @file     statistics_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef _STATISTICS_FUNCTIONS_H_
+#define _STATISTICS_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#include "dsp/basic_math_functions.h"
+#include "dsp/fast_math_functions.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+/**
+ * @defgroup groupStats Statistics Functions
+ */
+
+/**
+ * @brief Computation of the LogSumExp
+ *
+ * In probabilistic computations, the dynamic range of the probability values can be
+ * very wide because they come from Gaussian functions.
+ * To avoid underflow and overflow issues, the values are represented by their log.
+ * In this representation, multiplying the original exp values is easy: their logs are added.
+ * But adding the original exp values requires some special handling, and that is the
+ * goal of the LogSumExp function.
+ *
+ * If the values are x1...xn, the function computes:
+ *
+ * ln(exp(x1) + ...
+ exp(xn)) and the computation is done in such a way that + * rounding issues are minimised. + * + * The max xm of the values is extracted and the function is computing: + * xm + ln(exp(x1 - xm) + ... + exp(xn - xm)) + * + * @param[in] *in Pointer to an array of input values. + * @param[in] blockSize Number of samples in the input array. + * @return LogSumExp + * + */ + + +float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize); + +/** + * @brief Dot product with log arithmetic + * + * Vectors are containing the log of the samples + * + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[in] pTmpBuffer temporary buffer of length blockSize + * @return The log of the dot product . + * + */ + + +float32_t arm_logsumexp_dot_prod_f32(const float32_t * pSrcA, + const float32_t * pSrcB, + uint32_t blockSize, + float32_t *pTmpBuffer); + +/** + * @brief Entropy + * + * @param[in] pSrcA Array of input values. + * @param[in] blockSize Number of samples in the input array. + * @return Entropy -Sum(p ln p) + * + */ + + +float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize); + + +/** + * @brief Entropy + * + * @param[in] pSrcA Array of input values. + * @param[in] blockSize Number of samples in the input array. + * @return Entropy -Sum(p ln p) + * + */ + + +float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize); + + +/** + * @brief Kullback-Leibler + * + * @param[in] pSrcA Pointer to an array of input values for probability distribution A. + * @param[in] pSrcB Pointer to an array of input values for probability distribution B. + * @param[in] blockSize Number of samples in the input array. + * @return Kullback-Leibler Divergence D(A || B) + * + */ +float32_t arm_kullback_leibler_f32(const float32_t * pSrcA + ,const float32_t * pSrcB + ,uint32_t blockSize); + + +/** + * @brief Kullback-Leibler + * + * @param[in] pSrcA Pointer to an array of input values for probability distribution A. + * @param[in] pSrcB Pointer to an array of input values for probability distribution B. + * @param[in] blockSize Number of samples in the input array. + * @return Kullback-Leibler Divergence D(A || B) + * + */ +float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, + const float64_t * pSrcB, + uint32_t blockSize); + + + /** + * @brief Sum of the squares of the elements of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_power_q31( + const q31_t * pSrc, + uint32_t blockSize, + q63_t * pResult); + + + /** + * @brief Sum of the squares of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_power_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Sum of the squares of the elements of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_power_q15( + const q15_t * pSrc, + uint32_t blockSize, + q63_t * pResult); + + + /** + * @brief Sum of the squares of the elements of a Q7 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. 
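+   *
+   * @par Example (an illustrative sketch; the buffer contents are hypothetical):
+   * <pre>
+   *     q7_t  samples[4] = { 10, -20, 30, -40 };
+   *     q31_t sumSq;
+   *     arm_power_q7(samples, 4, &sumSq);  // raw sum of squares: 100 + 400 + 900 + 1600 = 3000
+   * </pre>
+   * The value accumulated is the integer sum of squared samples; its fixed-point
+   * interpretation follows the accumulator format documented by the library.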
+ */ + void arm_power_q7( + const q7_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Mean value of a Q7 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_mean_q7( + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult); + + + /** + * @brief Mean value of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_mean_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult); + + + /** + * @brief Mean value of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_mean_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Mean value of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_mean_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Variance of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_var_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Variance of the elements of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_var_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Variance of the elements of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_var_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult); + + + /** + * @brief Root Mean Square of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_rms_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Root Mean Square of the elements of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_rms_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Root Mean Square of the elements of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_rms_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult); + + + /** + * @brief Standard deviation of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_std_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Standard deviation of the elements of a Q31 vector. 
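+   *
+   * Assuming the same definition as the other std variants in the library, the
+   * result is the unbiased estimator:
+   * sqrt((sumOfSquares - sum * sum / blockSize) / (blockSize - 1))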
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_std_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief Standard deviation of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void arm_std_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+
+  /**
+   * @brief Minimum value of a Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] result     is output pointer
+   * @param[out] index      is the array index of the minimum value in the input buffer.
+   */
+  void arm_min_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * result,
+        uint32_t * index);
+
+  /**
+   * @brief Minimum value of absolute values of a Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] result     is output pointer
+   * @param[out] index      is the array index of the minimum value in the input buffer.
+   */
+  void arm_absmin_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * result,
+        uint32_t * index);
+
+
+  /**
+   * @brief Minimum value of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void arm_min_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Minimum value of absolute values of a Q15 vector.
+ * @param[in]  pSrc       is input pointer
+ * @param[in]  blockSize  is the number of samples to process
+ * @param[out] pResult    is output pointer
+ * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+ */
+  void arm_absmin_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief Minimum value of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void arm_min_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+  /**
+   * @brief Minimum value of absolute values of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void arm_absmin_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief Minimum value of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void arm_min_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+  /**
+   * @brief Minimum value of absolute values of a floating-point vector.
+ * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output pointer + * @param[out] pIndex is the array index of the minimum value in the input buffer. + */ + void arm_absmin_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex); + + +/** + * @brief Maximum value of a Q7 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_max_q7( + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult, + uint32_t * pIndex); + +/** + * @brief Maximum value of absolute values of a Q7 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_absmax_q7( + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult, + uint32_t * pIndex); + + +/** + * @brief Maximum value of a Q15 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_max_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult, + uint32_t * pIndex); + +/** + * @brief Maximum value of absolute values of a Q15 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_absmax_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult, + uint32_t * pIndex); + +/** + * @brief Maximum value of a Q31 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_max_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex); + +/** + * @brief Maximum value of absolute values of a Q31 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_absmax_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex); + +/** + * @brief Maximum value of a floating-point vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_max_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex); + +/** + * @brief Maximum value of absolute values of a floating-point vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_absmax_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex); + + /** + @brief Maximum value of a floating-point vector. 
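+  Unlike arm_max_f32 above, this variant returns only the maximum value and
+  does not report the index at which it occurs.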
+ @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult maximum value returned here + @return none + */ + void arm_max_no_idx_f32( + const float32_t *pSrc, + uint32_t blockSize, + float32_t *pResult); + + + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _STATISTICS_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/statistics_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/statistics_functions_f16.h new file mode 100644 index 000000000..8ed3a844c --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/statistics_functions_f16.h @@ -0,0 +1,218 @@ +/****************************************************************************** + * @file statistics_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _STATISTICS_FUNCTIONS_F16_H_ +#define _STATISTICS_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#include "dsp/basic_math_functions_f16.h" +#include "dsp/fast_math_functions_f16.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + + /** + * @brief Sum of the squares of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_power_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult); + + /** + * @brief Mean value of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_mean_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult); + + /** + * @brief Variance of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_var_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult); + + /** + * @brief Root Mean Square of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_rms_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult); + + /** + * @brief Standard deviation of the elements of a floating-point vector. 
+ * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void arm_std_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult); + + /** + * @brief Minimum value of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output pointer + * @param[out] pIndex is the array index of the minimum value in the input buffer. + */ + void arm_min_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult, + uint32_t * pIndex); + + /** + * @brief Minimum value of absolute values of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output pointer + * @param[out] pIndex is the array index of the minimum value in the input buffer. + */ + void arm_absmin_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult, + uint32_t * pIndex); + +/** + * @brief Maximum value of a floating-point vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_max_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult, + uint32_t * pIndex); + +/** + * @brief Maximum value of absolute values of a floating-point vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void arm_absmax_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult, + uint32_t * pIndex); + +/** + * @brief Entropy + * + * @param[in] pSrcA Array of input values. + * @param[in] blockSize Number of samples in the input array. + * @return Entropy -Sum(p ln p) + * + */ + + +float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize); + +float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize); + +/** + * @brief Dot product with log arithmetic + * + * Vectors are containing the log of the samples + * + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[in] pTmpBuffer temporary buffer of length blockSize + * @return The log of the dot product . + * + */ + + +float16_t arm_logsumexp_dot_prod_f16(const float16_t * pSrcA, + const float16_t * pSrcB, + uint32_t blockSize, + float16_t *pTmpBuffer); + +/** + * @brief Kullback-Leibler + * + * @param[in] pSrcA Pointer to an array of input values for probability distribution A. + * @param[in] pSrcB Pointer to an array of input values for probability distribution B. + * @param[in] blockSize Number of samples in the input array. + * @return Kullback-Leibler Divergence D(A || B) + * + */ +float16_t arm_kullback_leibler_f16(const float16_t * pSrcA + ,const float16_t * pSrcB + ,uint32_t blockSize); + +/** + @brief Maximum value of a floating-point vector. 
+ @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult maximum value returned here + @return none + */ + void arm_max_no_idx_f16( + const float16_t *pSrc, + uint32_t blockSize, + float16_t *pResult); + + + +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _STATISTICS_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/support_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/support_functions.h new file mode 100644 index 000000000..3c2a7de3e --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/support_functions.h @@ -0,0 +1,427 @@ +/****************************************************************************** + * @file support_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _SUPPORT_FUNCTIONS_H_ +#define _SUPPORT_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @defgroup groupSupport Support Functions + */ + + +/** + * @brief Converts the elements of the floating-point vector to Q31 vector. + * @param[in] pSrc points to the floating-point input vector + * @param[out] pDst points to the Q31 output vector + * @param[in] blockSize length of the input vector + */ + void arm_float_to_q31( + const float32_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the floating-point vector to Q15 vector. + * @param[in] pSrc points to the floating-point input vector + * @param[out] pDst points to the Q15 output vector + * @param[in] blockSize length of the input vector + */ + void arm_float_to_q15( + const float32_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the floating-point vector to Q7 vector. + * @param[in] pSrc points to the floating-point input vector + * @param[out] pDst points to the Q7 output vector + * @param[in] blockSize length of the input vector + */ + void arm_float_to_q7( + const float32_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q31 vector to floating-point vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void arm_q31_to_float( + const q31_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q31 vector to Q15 vector. 
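+   * Each element is converted by keeping the 16 most significant bits, i.e.
+   * pDst[n] = (q15_t) (pSrc[n] >> 16) for 0 <= n < blockSize.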
+ * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void arm_q31_to_q15( + const q31_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q31 vector to Q7 vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void arm_q31_to_q7( + const q31_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q15 vector to floating-point vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void arm_q15_to_float( + const q15_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q15 vector to Q31 vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void arm_q15_to_q31( + const q15_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q15 vector to Q7 vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void arm_q15_to_q7( + const q15_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q7 vector to floating-point vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void arm_q7_to_float( + const q7_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q7 vector to Q31 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_q7_to_q31( + const q7_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q7 vector to Q15 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_q7_to_q15( + const q7_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + + + + /** + * @brief Struct for specifying sorting algorithm + */ + typedef enum + { + ARM_SORT_BITONIC = 0, + /**< Bitonic sort */ + ARM_SORT_BUBBLE = 1, + /**< Bubble sort */ + ARM_SORT_HEAP = 2, + /**< Heap sort */ + ARM_SORT_INSERTION = 3, + /**< Insertion sort */ + ARM_SORT_QUICK = 4, + /**< Quick sort */ + ARM_SORT_SELECTION = 5 + /**< Selection sort */ + } arm_sort_alg; + + /** + * @brief Struct for specifying sorting algorithm + */ + typedef enum + { + ARM_SORT_DESCENDING = 0, + /**< Descending order (9 to 0) */ + ARM_SORT_ASCENDING = 1 + /**< Ascending order (0 to 9) */ + } arm_sort_dir; + + /** + * @brief Instance structure for the sorting algorithms. + */ + typedef struct + { + arm_sort_alg alg; /**< Sorting algorithm selected */ + arm_sort_dir dir; /**< Sorting order (direction) */ + } arm_sort_instance_f32; + + /** + * @param[in] S points to an instance of the sorting structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. 
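+   *
+   * @par Usage (a minimal sketch; the data shown is illustrative):
+   * <pre>
+   *     arm_sort_instance_f32 S;
+   *     float32_t in[6]  = { 3.0f, 1.0f, 4.0f, 1.0f, 5.0f, 9.0f };
+   *     float32_t out[6];
+   *     arm_sort_init_f32(&S, ARM_SORT_QUICK, ARM_SORT_ASCENDING);
+   *     arm_sort_f32(&S, in, out, 6);
+   * </pre>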
+ */ + void arm_sort_f32( + const arm_sort_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + /** + * @param[in,out] S points to an instance of the sorting structure. + * @param[in] alg Selected algorithm. + * @param[in] dir Sorting order. + */ + void arm_sort_init_f32( + arm_sort_instance_f32 * S, + arm_sort_alg alg, + arm_sort_dir dir); + + /** + * @brief Instance structure for the sorting algorithms. + */ + typedef struct + { + arm_sort_dir dir; /**< Sorting order (direction) */ + float32_t * buffer; /**< Working buffer */ + } arm_merge_sort_instance_f32; + + /** + * @param[in] S points to an instance of the sorting structure. + * @param[in,out] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void arm_merge_sort_f32( + const arm_merge_sort_instance_f32 * S, + float32_t *pSrc, + float32_t *pDst, + uint32_t blockSize); + + /** + * @param[in,out] S points to an instance of the sorting structure. + * @param[in] dir Sorting order. + * @param[in] buffer Working buffer. + */ + void arm_merge_sort_init_f32( + arm_merge_sort_instance_f32 * S, + arm_sort_dir dir, + float32_t * buffer); + + + + /** + * @brief Copies the elements of a floating-point vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_copy_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Copies the elements of a Q7 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_copy_q7( + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Copies the elements of a Q15 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_copy_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Copies the elements of a Q31 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_copy_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Fills a constant value into a floating-point vector. + * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_fill_f32( + float32_t value, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Fills a constant value into a Q7 vector. + * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_fill_q7( + q7_t value, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Fills a constant value into a Q15 vector. + * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_fill_q15( + q15_t value, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Fills a constant value into a Q31 vector. 
+ * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void arm_fill_q31( + q31_t value, + q31_t * pDst, + uint32_t blockSize); + + + + + + + +/** + * @brief Weighted sum + * + * + * @param[in] *in Array of input values. + * @param[in] *weights Weights + * @param[in] blockSize Number of samples in the input array. + * @return Weighted sum + * + */ +float32_t arm_weighted_sum_f32(const float32_t *in + , const float32_t *weights + , uint32_t blockSize); + + +/** + * @brief Barycenter + * + * + * @param[in] in List of vectors + * @param[in] weights Weights of the vectors + * @param[out] out Barycenter + * @param[in] nbVectors Number of vectors + * @param[in] vecDim Dimension of space (vector dimension) + * @return None + * + */ +void arm_barycenter_f32(const float32_t *in + , const float32_t *weights + , float32_t *out + , uint32_t nbVectors + , uint32_t vecDim); + + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _SUPPORT_FUNCTIONS_H_ */
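Before moving on to the f16 variants, a small sketch of the calling convention of the two helpers above may help. The samples and weights are made up, and the exact normalization applied by arm_weighted_sum_f32 is left to the implementation; this only illustrates how the arguments line up:

#include "arm_math.h"

void weighted_example(void)
{
    const float32_t samples[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    const float32_t weights[4] = { 0.1f, 0.2f, 0.3f, 0.4f };

    /* Weighted combination of the 4 scalar samples. */
    float32_t ws = arm_weighted_sum_f32(samples, weights, 4);

    /* Barycenter of two 2-D vectors (nbVectors = 2, vecDim = 2). */
    const float32_t vecs[2 * 2] = { 0.0f, 0.0f, 2.0f, 4.0f };
    const float32_t w2[2]       = { 1.0f, 3.0f };
    float32_t center[2];
    arm_barycenter_f32(vecs, w2, center, 2, 2);
    (void)ws;
}

diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/support_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/support_functions_f16.h new file mode 100644 index 000000000..47b6535f1 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/support_functions_f16.h @@ -0,0 +1,129 @@ +/****************************************************************************** + * @file support_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _SUPPORT_FUNCTIONS_F16_H_ +#define _SUPPORT_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + + /** + * @brief Copies the elements of a floating-point vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ +void arm_copy_f16(const float16_t * pSrc, float16_t * pDst, uint32_t blockSize); + + /** + * @brief Fills a constant value into a floating-point vector. + * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ +void arm_fill_f16(float16_t value, float16_t * pDst, uint32_t blockSize); + +/** + * @brief Converts the elements of the f16 vector to Q15 vector.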
+ * @param[in] pSrc points to the f16 input vector + * @param[out] pDst points to the q15 output vector + * @param[in] blockSize length of the input vector + */ +void arm_f16_to_q15(const float16_t * pSrc, q15_t * pDst, uint32_t blockSize); + +/** + * @brief Converts the elements of the Q15 vector to f16 vector. + * @param[in] pSrc points to the q15 input vector + * @param[out] pDst points to the f16 output vector + * @param[in] blockSize length of the input vector + */ +void arm_q15_to_f16(const q15_t * pSrc, float16_t * pDst, uint32_t blockSize); + + +/** + * @brief Converts the elements of the f32 vector to f16 vector. + * @param[in] pSrc points to the f32 input vector + * @param[out] pDst points to the f16 output vector + * @param[in] blockSize length of the input vector + */ +void arm_float_to_f16(const float32_t * pSrc, float16_t * pDst, uint32_t blockSize); + +/** + * @brief Converts the elements of the f16 vector to f32 vector. + * @param[in] pSrc points to the f16 input vector + * @param[out] pDst points to the f32 output vector + * @param[in] blockSize length of the input vector + */ +void arm_f16_to_float(const float16_t * pSrc, float32_t * pDst, uint32_t blockSize); + +/** + * @brief Weighted sum + * + * + * @param[in] *in Array of input values. + * @param[in] *weights Weights + * @param[in] blockSize Number of samples in the input array. + * @return Weighted sum + * + */ +float16_t arm_weighted_sum_f16(const float16_t *in + , const float16_t *weights + , uint32_t blockSize); + +/** + * @brief Barycenter + * + * + * @param[in] in List of vectors + * @param[in] weights Weights of the vectors + * @param[out] out Barycenter + * @param[in] nbVectors Number of vectors + * @param[in] vecDim Dimension of space (vector dimension) + * @return None + * + */ +void arm_barycenter_f16(const float16_t *in + , const float16_t *weights + , float16_t *out + , uint32_t nbVectors + , uint32_t vecDim); + +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _SUPPORT_FUNCTIONS_F16_H_ */
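A short round-trip sketch for the conversion helpers above. The values are illustrative; the code is guarded by ARM_FLOAT16_SUPPORTED like the declarations themselves, and it assumes arm_math_f16.h exposes the f16 prototypes:

#include "arm_math_f16.h"

#if defined(ARM_FLOAT16_SUPPORTED)
void f16_roundtrip_example(void)
{
    const float32_t in[4] = { 1.0f, -0.5f, 0.25f, 3.0f };
    float16_t half[4];
    float32_t back[4];

    /* Narrow to half precision, then widen back: values within f16
       range survive, extra f32 precision is rounded away. */
    arm_float_to_f16(in, half, 4);
    arm_f16_to_float(half, back, 4);
}
#endif

diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_defines.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_defines.h new file mode 100644 index 000000000..1f6001f32 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_defines.h @@ -0,0 +1,46 @@ +/****************************************************************************** + * @file svm_defines.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.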
+ */ + + +#ifndef _SVM_DEFINES_H_ +#define _SVM_DEFINES_H_ + +/** + * @brief Enum for specifying the SVM kernel + */ +typedef enum +{ + ARM_ML_KERNEL_LINEAR = 0, + /**< Linear kernel */ + ARM_ML_KERNEL_POLYNOMIAL = 1, + /**< Polynomial kernel */ + ARM_ML_KERNEL_RBF = 2, + /**< Radial Basis Function kernel */ + ARM_ML_KERNEL_SIGMOID = 3 + /**< Sigmoid kernel */ +} arm_ml_kernel_type; + +#endif diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_functions.h new file mode 100644 index 000000000..8fdcb13e1 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_functions.h @@ -0,0 +1,299 @@ +/****************************************************************************** + * @file svm_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _SVM_FUNCTIONS_H_ +#define _SVM_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" +#include "dsp/svm_defines.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define STEP(x) ((x) <= 0 ? 0 : 1) + +/** + * @defgroup groupSVM SVM Functions + * This set of functions implements SVM classification for 2 classes. + * The training must be done with scikit-learn; the parameters can easily + * be generated from the trained scikit-learn object. Some examples are given in + * DSP/Testing/PatternGeneration/SVM.py + * + * If more than 2 classes are needed, the functions in this folder + * have to be used, as building blocks, to do multi-class classification. + * + * No multi-class classification is provided in this SVM folder. + * + */ + +/** + * @brief Integer exponentiation + * @param[in] x value + * @param[in] nb integer exponent >= 1 + * @return x^nb + * + */ +__STATIC_INLINE float32_t arm_exponent_f32(float32_t x, int32_t nb) +{ + float32_t r = x; + nb --; + while(nb > 0) + { + r = r * x; + nb--; + } + return(r); +} + + + + + +/** + * @brief Instance structure for linear SVM prediction function. + */ +typedef struct +{ + uint32_t nbOfSupportVectors; /**< Number of support vectors */ + uint32_t vectorDimension; /**< Dimension of vector space */ + float32_t intercept; /**< Intercept */ + const float32_t *dualCoefficients; /**< Dual coefficients */ + const float32_t *supportVectors; /**< Support vectors */ + const int32_t *classes; /**< The two SVM classes */ +} arm_svm_linear_instance_f32;
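As a sanity check on the small exponentiation helper above, a one-line worked example (wrapped in a function for completeness):

#include "arm_math.h"

void exponent_example(void)
{
    /* arm_exponent_f32 multiplies x into the result nb-1 times,
       so nb must be >= 1: here 2^5 = 32. */
    float32_t y = arm_exponent_f32(2.0f, 5);
    (void)y;
}

+ + +/** + * @brief Instance structure for polynomial SVM prediction function.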
+ */ +typedef struct +{ + uint32_t nbOfSupportVectors; /**< Number of support vectors */ + uint32_t vectorDimension; /**< Dimension of vector space */ + float32_t intercept; /**< Intercept */ + const float32_t *dualCoefficients; /**< Dual coefficients */ + const float32_t *supportVectors; /**< Support vectors */ + const int32_t *classes; /**< The two SVM classes */ + int32_t degree; /**< Polynomial degree */ + float32_t coef0; /**< Polynomial constant */ + float32_t gamma; /**< Gamma factor */ +} arm_svm_polynomial_instance_f32; + +/** + * @brief Instance structure for rbf SVM prediction function. + */ +typedef struct +{ + uint32_t nbOfSupportVectors; /**< Number of support vectors */ + uint32_t vectorDimension; /**< Dimension of vector space */ + float32_t intercept; /**< Intercept */ + const float32_t *dualCoefficients; /**< Dual coefficients */ + const float32_t *supportVectors; /**< Support vectors */ + const int32_t *classes; /**< The two SVM classes */ + float32_t gamma; /**< Gamma factor */ +} arm_svm_rbf_instance_f32; + +/** + * @brief Instance structure for sigmoid SVM prediction function. + */ +typedef struct +{ + uint32_t nbOfSupportVectors; /**< Number of support vectors */ + uint32_t vectorDimension; /**< Dimension of vector space */ + float32_t intercept; /**< Intercept */ + const float32_t *dualCoefficients; /**< Dual coefficients */ + const float32_t *supportVectors; /**< Support vectors */ + const int32_t *classes; /**< The two SVM classes */ + float32_t coef0; /**< Independent constant */ + float32_t gamma; /**< Gamma factor */ +} arm_svm_sigmoid_instance_f32; + +/** + * @brief SVM linear instance init function + * @param[in] S Parameters for SVM functions + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @return none. + * + */ + + +void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float32_t intercept, + const float32_t *dualCoefficients, + const float32_t *supportVectors, + const int32_t *classes); + +/** + * @brief SVM linear prediction + * @param[in] S Pointer to an instance of the linear SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ + +void arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 *S, + const float32_t * in, + int32_t * pResult); + + +/** + * @brief SVM polynomial instance init function + * @param[in] S points to an instance of the polynomial SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] degree Polynomial degree + * @param[in] coef0 coeff0 (scikit-learn terminology) + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. 
+ * + */ + + +void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float32_t intercept, + const float32_t *dualCoefficients, + const float32_t *supportVectors, + const int32_t *classes, + int32_t degree, + float32_t coef0, + float32_t gamma + ); + +/** + * @brief SVM polynomial prediction + * @param[in] S Pointer to an instance of the polynomial SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ +void arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 *S, + const float32_t * in, + int32_t * pResult); + + +/** + * @brief SVM radial basis function instance init function + * @param[in] S points to an instance of the rbf SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + +void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float32_t intercept, + const float32_t *dualCoefficients, + const float32_t *supportVectors, + const int32_t *classes, + float32_t gamma + ); + +/** + * @brief SVM rbf prediction + * @param[in] S Pointer to an instance of the rbf SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult decision value + * @return none. + * + */ +void arm_svm_rbf_predict_f32(const arm_svm_rbf_instance_f32 *S, + const float32_t * in, + int32_t * pResult); + +/** + * @brief SVM sigmoid instance init function + * @param[in] S points to an instance of the sigmoid SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] coef0 coeff0 (scikit-learn terminology) + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + +void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float32_t intercept, + const float32_t *dualCoefficients, + const float32_t *supportVectors, + const int32_t *classes, + float32_t coef0, + float32_t gamma + );
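To make the init/predict pairing concrete, here is a sketch for the linear variant declared earlier. The dual coefficients, support vectors, intercept, and class labels are toy values standing in for parameters exported from a trained scikit-learn model:

#include "arm_math.h"

void svm_linear_example(void)
{
    /* Toy parameters for a 2-class, 2-D linear SVM. */
    static const float32_t dualCoefs[2]   = { 0.5f, -0.5f };
    static const float32_t supVecs[2 * 2] = { 1.0f, 1.0f, -1.0f, -1.0f };
    static const int32_t   classes[2]     = { 0, 1 };
    arm_svm_linear_instance_f32 S;
    int32_t predicted;

    /* 2 support vectors, vector dimension 2, intercept 0. */
    arm_svm_linear_init_f32(&S, 2, 2, 0.0f, dualCoefs, supVecs, classes);

    const float32_t sample[2] = { 0.8f, 0.9f };
    arm_svm_linear_predict_f32(&S, sample, &predicted);  /* class label lands in predicted */
}

+ +/** + * @brief SVM sigmoid prediction + * @param[in] S Pointer to an instance of the sigmoid SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none.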
+ * + */ +void arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 *S, + const float32_t * in, + int32_t * pResult); + + + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _SVM_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_functions_f16.h new file mode 100644 index 000000000..b80ed7cf9 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/svm_functions_f16.h @@ -0,0 +1,298 @@ +/****************************************************************************** + * @file svm_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _SVM_FUNCTIONS_F16_H_ +#define _SVM_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" +#include "dsp/svm_defines.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + +#define STEP(x) ((x) <= 0 ? 0 : 1) + +/** + * @defgroup groupSVM SVM Functions + * This set of functions implements SVM classification for 2 classes. + * The training must be done with scikit-learn; the parameters can easily + * be generated from the trained scikit-learn object. Some examples are given in + * DSP/Testing/PatternGeneration/SVM.py + * + * If more than 2 classes are needed, the functions in this folder + * have to be used, as building blocks, to do multi-class classification. + * + * No multi-class classification is provided in this SVM folder. + * + */ + +/** + * @brief Integer exponentiation + * @param[in] x value + * @param[in] nb integer exponent >= 1 + * @return x^nb + * + */ +__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb) +{ + float16_t r = x; + nb --; + while(nb > 0) + { + r = r * x; + nb--; + } + return(r); +} + + +/** + * @brief Instance structure for linear SVM prediction function. + */ +typedef struct +{ + uint32_t nbOfSupportVectors; /**< Number of support vectors */ + uint32_t vectorDimension; /**< Dimension of vector space */ + float16_t intercept; /**< Intercept */ + const float16_t *dualCoefficients; /**< Dual coefficients */ + const float16_t *supportVectors; /**< Support vectors */ + const int32_t *classes; /**< The two SVM classes */ +} arm_svm_linear_instance_f16; + + +/** + * @brief Instance structure for polynomial SVM prediction function.
+ */ +typedef struct +{ + uint32_t nbOfSupportVectors; /**< Number of support vectors */ + uint32_t vectorDimension; /**< Dimension of vector space */ + float16_t intercept; /**< Intercept */ + const float16_t *dualCoefficients; /**< Dual coefficients */ + const float16_t *supportVectors; /**< Support vectors */ + const int32_t *classes; /**< The two SVM classes */ + int32_t degree; /**< Polynomial degree */ + float16_t coef0; /**< Polynomial constant */ + float16_t gamma; /**< Gamma factor */ +} arm_svm_polynomial_instance_f16; + +/** + * @brief Instance structure for rbf SVM prediction function. + */ +typedef struct +{ + uint32_t nbOfSupportVectors; /**< Number of support vectors */ + uint32_t vectorDimension; /**< Dimension of vector space */ + float16_t intercept; /**< Intercept */ + const float16_t *dualCoefficients; /**< Dual coefficients */ + const float16_t *supportVectors; /**< Support vectors */ + const int32_t *classes; /**< The two SVM classes */ + float16_t gamma; /**< Gamma factor */ +} arm_svm_rbf_instance_f16; + +/** + * @brief Instance structure for sigmoid SVM prediction function. + */ +typedef struct +{ + uint32_t nbOfSupportVectors; /**< Number of support vectors */ + uint32_t vectorDimension; /**< Dimension of vector space */ + float16_t intercept; /**< Intercept */ + const float16_t *dualCoefficients; /**< Dual coefficients */ + const float16_t *supportVectors; /**< Support vectors */ + const int32_t *classes; /**< The two SVM classes */ + float16_t coef0; /**< Independent constant */ + float16_t gamma; /**< Gamma factor */ +} arm_svm_sigmoid_instance_f16; + +/** + * @brief SVM linear instance init function + * @param[in] S Parameters for SVM functions + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @return none. + * + */ + + +void arm_svm_linear_init_f16(arm_svm_linear_instance_f16 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float16_t intercept, + const float16_t *dualCoefficients, + const float16_t *supportVectors, + const int32_t *classes); + +/** + * @brief SVM linear prediction + * @param[in] S Pointer to an instance of the linear SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ + +void arm_svm_linear_predict_f16(const arm_svm_linear_instance_f16 *S, + const float16_t * in, + int32_t * pResult); + + +/** + * @brief SVM polynomial instance init function + * @param[in] S points to an instance of the polynomial SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] degree Polynomial degree + * @param[in] coef0 coeff0 (scikit-learn terminology) + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. 
+ * + */ + + +void arm_svm_polynomial_init_f16(arm_svm_polynomial_instance_f16 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float16_t intercept, + const float16_t *dualCoefficients, + const float16_t *supportVectors, + const int32_t *classes, + int32_t degree, + float16_t coef0, + float16_t gamma + ); + +/** + * @brief SVM polynomial prediction + * @param[in] S Pointer to an instance of the polynomial SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ +void arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 *S, + const float16_t * in, + int32_t * pResult); + + +/** + * @brief SVM radial basis function instance init function + * @param[in] S points to an instance of the rbf SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + +void arm_svm_rbf_init_f16(arm_svm_rbf_instance_f16 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float16_t intercept, + const float16_t *dualCoefficients, + const float16_t *supportVectors, + const int32_t *classes, + float16_t gamma + ); + +/** + * @brief SVM rbf prediction + * @param[in] S Pointer to an instance of the rbf SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult decision value + * @return none. + * + */ +void arm_svm_rbf_predict_f16(const arm_svm_rbf_instance_f16 *S, + const float16_t * in, + int32_t * pResult); + +/** + * @brief SVM sigmoid instance init function + * @param[in] S points to an instance of the sigmoid SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] coef0 coeff0 (scikit-learn terminology) + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + +void arm_svm_sigmoid_init_f16(arm_svm_sigmoid_instance_f16 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float16_t intercept, + const float16_t *dualCoefficients, + const float16_t *supportVectors, + const int32_t *classes, + float16_t coef0, + float16_t gamma + ); + +/** + * @brief SVM sigmoid prediction + * @param[in] S Pointer to an instance of the sigmoid SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none.
+ * + */ +void arm_svm_sigmoid_predict_f16(const arm_svm_sigmoid_instance_f16 *S, + const float16_t * in, + int32_t * pResult); + + + +#endif /*defined(ARM_FLOAT16_SUPPORTED)*/ +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _SVM_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/transform_functions.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/transform_functions.h new file mode 100644 index 000000000..bf9c43c87 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/transform_functions.h @@ -0,0 +1,592 @@ +/****************************************************************************** + * @file transform_functions.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _TRANSFORM_FUNCTIONS_H_ +#define _TRANSFORM_FUNCTIONS_H_ + +#include "arm_math_types.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#include "dsp/basic_math_functions.h" +#include "dsp/complex_math_functions.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + +/** + * @defgroup groupTransforms Transform Functions + */ + + + /** + * @brief Instance structure for the Q15 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const q15_t *pTwiddle; /**< points to the Sin twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + } arm_cfft_radix2_instance_q15; + +/* Deprecated */ + arm_status arm_cfft_radix2_init_q15( + arm_cfft_radix2_instance_q15 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void arm_cfft_radix2_q15( + const arm_cfft_radix2_instance_q15 * S, + q15_t * pSrc); + + + /** + * @brief Instance structure for the Q15 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const q15_t *pTwiddle; /**< points to the twiddle factor table. 
*/ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + } arm_cfft_radix4_instance_q15; + +/* Deprecated */ + arm_status arm_cfft_radix4_init_q15( + arm_cfft_radix4_instance_q15 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void arm_cfft_radix4_q15( + const arm_cfft_radix4_instance_q15 * S, + q15_t * pSrc); + + /** + * @brief Instance structure for the Radix-2 Q31 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const q31_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + } arm_cfft_radix2_instance_q31; + +/* Deprecated */ + arm_status arm_cfft_radix2_init_q31( + arm_cfft_radix2_instance_q31 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void arm_cfft_radix2_q31( + const arm_cfft_radix2_instance_q31 * S, + q31_t * pSrc); + + /** + * @brief Instance structure for the Q31 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const q31_t *pTwiddle; /**< points to the twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + } arm_cfft_radix4_instance_q31; + +/* Deprecated */ + void arm_cfft_radix4_q31( + const arm_cfft_radix4_instance_q31 * S, + q31_t * pSrc); + +/* Deprecated */ + arm_status arm_cfft_radix4_init_q31( + arm_cfft_radix4_instance_q31 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const float32_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. 
*/ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + float32_t onebyfftLen; /**< value of 1/fftLen. */ + } arm_cfft_radix2_instance_f32; + + +/* Deprecated */ + arm_status arm_cfft_radix2_init_f32( + arm_cfft_radix2_instance_f32 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void arm_cfft_radix2_f32( + const arm_cfft_radix2_instance_f32 * S, + float32_t * pSrc); + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const float32_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + float32_t onebyfftLen; /**< value of 1/fftLen. */ + } arm_cfft_radix4_instance_f32; + + + +/* Deprecated */ + arm_status arm_cfft_radix4_init_f32( + arm_cfft_radix4_instance_f32 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void arm_cfft_radix4_f32( + const arm_cfft_radix4_instance_f32 * S, + float32_t * pSrc); + + /** + * @brief Instance structure for the Q15 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + const q15_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t bitRevLength; /**< bit reversal table length. */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + const uint32_t *rearranged_twiddle_tab_stride1_arr; /**< Per stage reordered twiddle pointer (offset 1) */ \ + const uint32_t *rearranged_twiddle_tab_stride2_arr; /**< Per stage reordered twiddle pointer (offset 2) */ \ + const uint32_t *rearranged_twiddle_tab_stride3_arr; /**< Per stage reordered twiddle pointer (offset 3) */ \ + const q15_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */ \ + const q15_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */ \ + const q15_t *rearranged_twiddle_stride3; +#endif + } arm_cfft_instance_q15; + +arm_status arm_cfft_init_q15( + arm_cfft_instance_q15 * S, + uint16_t fftLen); + +void arm_cfft_q15( + const arm_cfft_instance_q15 * S, + q15_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag);
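Since the radix-generic pair above is the non-deprecated Q15 entry point, a brief usage sketch may help. The 64-point length is assumed to be one of the supported power-of-two sizes, and the zeroed buffer is a placeholder for real interleaved complex data:

#include "arm_math.h"

void cfft_q15_example(void)
{
    /* 64-point complex FFT: interleaved real/imag samples, so 2*64 q15
       values, transformed in place. */
    static q15_t buf[2 * 64];
    arm_cfft_instance_q15 S;

    if (arm_cfft_init_q15(&S, 64) == ARM_MATH_SUCCESS)
    {
        arm_cfft_q15(&S, buf, 0 /* forward */, 1 /* enable bit reversal */);
    }
}

+ + /** + * @brief Instance structure for the Q31 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + const q31_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t bitRevLength; /**< bit reversal table length.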
*/ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + const uint32_t *rearranged_twiddle_tab_stride1_arr; /**< Per stage reordered twiddle pointer (offset 1) */ \ + const uint32_t *rearranged_twiddle_tab_stride2_arr; /**< Per stage reordered twiddle pointer (offset 2) */ \ + const uint32_t *rearranged_twiddle_tab_stride3_arr; /**< Per stage reordered twiddle pointer (offset 3) */ \ + const q31_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */ \ + const q31_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */ \ + const q31_t *rearranged_twiddle_stride3; +#endif + } arm_cfft_instance_q31; + +arm_status arm_cfft_init_q31( + arm_cfft_instance_q31 * S, + uint16_t fftLen); + +void arm_cfft_q31( + const arm_cfft_instance_q31 * S, + q31_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + const float32_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t bitRevLength; /**< bit reversal table length. */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + const uint32_t *rearranged_twiddle_tab_stride1_arr; /**< Per stage reordered twiddle pointer (offset 1) */ \ + const uint32_t *rearranged_twiddle_tab_stride2_arr; /**< Per stage reordered twiddle pointer (offset 2) */ \ + const uint32_t *rearranged_twiddle_tab_stride3_arr; /**< Per stage reordered twiddle pointer (offset 3) */ \ + const float32_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */ \ + const float32_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */ \ + const float32_t *rearranged_twiddle_stride3; +#endif + } arm_cfft_instance_f32; + + + + arm_status arm_cfft_init_f32( + arm_cfft_instance_f32 * S, + uint16_t fftLen); + + void arm_cfft_f32( + const arm_cfft_instance_f32 * S, + float32_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + + /** + * @brief Instance structure for the Double Precision Floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + const float64_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t bitRevLength; /**< bit reversal table length. */ + } arm_cfft_instance_f64; + + arm_status arm_cfft_init_f64( + arm_cfft_instance_f64 * S, + uint16_t fftLen); + + void arm_cfft_f64( + const arm_cfft_instance_f64 * S, + float64_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + /** + * @brief Instance structure for the Q15 RFFT/RIFFT function. + */ + typedef struct + { + uint32_t fftLenReal; /**< length of the real FFT. */ + uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */ + uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */ + uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + const q15_t *pTwiddleAReal; /**< points to the real twiddle factor table. */ + const q15_t *pTwiddleBReal; /**< points to the imag twiddle factor table. 
*/ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + arm_cfft_instance_q15 cfftInst; +#else + const arm_cfft_instance_q15 *pCfft; /**< points to the complex FFT instance. */ +#endif + } arm_rfft_instance_q15; + + arm_status arm_rfft_init_q15( + arm_rfft_instance_q15 * S, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag); + + void arm_rfft_q15( + const arm_rfft_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst); + + /** + * @brief Instance structure for the Q31 RFFT/RIFFT function. + */ + typedef struct + { + uint32_t fftLenReal; /**< length of the real FFT. */ + uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */ + uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */ + uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + const q31_t *pTwiddleAReal; /**< points to the real twiddle factor table. */ + const q31_t *pTwiddleBReal; /**< points to the imag twiddle factor table. */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + arm_cfft_instance_q31 cfftInst; +#else + const arm_cfft_instance_q31 *pCfft; /**< points to the complex FFT instance. */ +#endif + } arm_rfft_instance_q31; + + arm_status arm_rfft_init_q31( + arm_rfft_instance_q31 * S, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag); + + void arm_rfft_q31( + const arm_rfft_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst); + + /** + * @brief Instance structure for the floating-point RFFT/RIFFT function. + */ + typedef struct + { + uint32_t fftLenReal; /**< length of the real FFT. */ + uint16_t fftLenBy2; /**< length of the complex FFT. */ + uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */ + uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */ + uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + const float32_t *pTwiddleAReal; /**< points to the real twiddle factor table. */ + const float32_t *pTwiddleBReal; /**< points to the imag twiddle factor table. */ + arm_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */ + } arm_rfft_instance_f32; + + arm_status arm_rfft_init_f32( + arm_rfft_instance_f32 * S, + arm_cfft_radix4_instance_f32 * S_CFFT, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag); + + void arm_rfft_f32( + const arm_rfft_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst); + + /** + * @brief Instance structure for the Double Precision Floating-point RFFT/RIFFT function. + */ +typedef struct + { + arm_cfft_instance_f64 Sint; /**< Internal CFFT structure. */ + uint16_t fftLenRFFT; /**< length of the real sequence */ + const float64_t * pTwiddleRFFT; /**< Twiddle factors real stage */ + } arm_rfft_fast_instance_f64 ; + +arm_status arm_rfft_fast_init_f64 ( + arm_rfft_fast_instance_f64 * S, + uint16_t fftLen); + + +void arm_rfft_fast_f64( + arm_rfft_fast_instance_f64 * S, + float64_t * p, float64_t * pOut, + uint8_t ifftFlag); + + + /** + * @brief Instance structure for the floating-point RFFT/RIFFT function. + */ +typedef struct + { + arm_cfft_instance_f32 Sint; /**< Internal CFFT structure. 
*/ + uint16_t fftLenRFFT; /**< length of the real sequence */ + const float32_t * pTwiddleRFFT; /**< Twiddle factors real stage */ + } arm_rfft_fast_instance_f32 ; + +arm_status arm_rfft_fast_init_f32 ( + arm_rfft_fast_instance_f32 * S, + uint16_t fftLen); + + + void arm_rfft_fast_f32( + const arm_rfft_fast_instance_f32 * S, + float32_t * p, float32_t * pOut, + uint8_t ifftFlag); + + /** + * @brief Instance structure for the floating-point DCT4/IDCT4 function. + */ + typedef struct + { + uint16_t N; /**< length of the DCT4. */ + uint16_t Nby2; /**< half of the length of the DCT4. */ + float32_t normalize; /**< normalizing factor. */ + const float32_t *pTwiddle; /**< points to the twiddle factor table. */ + const float32_t *pCosFactor; /**< points to the cosFactor table. */ + arm_rfft_instance_f32 *pRfft; /**< points to the real FFT instance. */ + arm_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */ + } arm_dct4_instance_f32; + + + /** + * @brief Initialization function for the floating-point DCT4/IDCT4. + * @param[in,out] S points to an instance of floating-point DCT4/IDCT4 structure. + * @param[in] S_RFFT points to an instance of floating-point RFFT/RIFFT structure. + * @param[in] S_CFFT points to an instance of floating-point CFFT/CIFFT structure. + * @param[in] N length of the DCT4. + * @param[in] Nby2 half of the length of the DCT4. + * @param[in] normalize normalizing factor. + * @return arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if N is not a supported transform length. + */ + arm_status arm_dct4_init_f32( + arm_dct4_instance_f32 * S, + arm_rfft_instance_f32 * S_RFFT, + arm_cfft_radix4_instance_f32 * S_CFFT, + uint16_t N, + uint16_t Nby2, + float32_t normalize); + + + /** + * @brief Processing function for the floating-point DCT4/IDCT4. + * @param[in] S points to an instance of the floating-point DCT4/IDCT4 structure. + * @param[in] pState points to state buffer. + * @param[in,out] pInlineBuffer points to the in-place input and output buffer. + */ + void arm_dct4_f32( + const arm_dct4_instance_f32 * S, + float32_t * pState, + float32_t * pInlineBuffer);
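Before the fixed-point DCT4 variants, a short sketch of the floating-point real FFT declared above; 256 is assumed to be one of the supported lengths, and the input buffer stands in for real time-domain samples (note the transform modifies its input):

#include "arm_math.h"

void rfft_f32_example(void)
{
    /* 256-point real FFT: 256 real inputs in, packed complex spectrum out. */
    static float32_t in[256];
    static float32_t out[256];
    arm_rfft_fast_instance_f32 S;

    if (arm_rfft_fast_init_f32(&S, 256) == ARM_MATH_SUCCESS)
    {
        arm_rfft_fast_f32(&S, in, out, 0 /* forward */);
    }
}

+ + /** + * @brief Instance structure for the Q31 DCT4/IDCT4 function. + */ + typedef struct + { + uint16_t N; /**< length of the DCT4. */ + uint16_t Nby2; /**< half of the length of the DCT4. */ + q31_t normalize; /**< normalizing factor. */ + const q31_t *pTwiddle; /**< points to the twiddle factor table. */ + const q31_t *pCosFactor; /**< points to the cosFactor table. */ + arm_rfft_instance_q31 *pRfft; /**< points to the real FFT instance. */ + arm_cfft_radix4_instance_q31 *pCfft; /**< points to the complex FFT instance. */ + } arm_dct4_instance_q31; + + + /** + * @brief Initialization function for the Q31 DCT4/IDCT4. + * @param[in,out] S points to an instance of Q31 DCT4/IDCT4 structure. + * @param[in] S_RFFT points to an instance of Q31 RFFT/RIFFT structure + * @param[in] S_CFFT points to an instance of Q31 CFFT/CIFFT structure + * @param[in] N length of the DCT4. + * @param[in] Nby2 half of the length of the DCT4. + * @param[in] normalize normalizing factor. + * @return arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if N is not a supported transform length.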
+ */ + arm_status arm_dct4_init_q31( + arm_dct4_instance_q31 * S, + arm_rfft_instance_q31 * S_RFFT, + arm_cfft_radix4_instance_q31 * S_CFFT, + uint16_t N, + uint16_t Nby2, + q31_t normalize); + + + /** + * @brief Processing function for the Q31 DCT4/IDCT4. + * @param[in] S points to an instance of the Q31 DCT4 structure. + * @param[in] pState points to state buffer. + * @param[in,out] pInlineBuffer points to the in-place input and output buffer. + */ + void arm_dct4_q31( + const arm_dct4_instance_q31 * S, + q31_t * pState, + q31_t * pInlineBuffer); + + + /** + * @brief Instance structure for the Q15 DCT4/IDCT4 function. + */ + typedef struct + { + uint16_t N; /**< length of the DCT4. */ + uint16_t Nby2; /**< half of the length of the DCT4. */ + q15_t normalize; /**< normalizing factor. */ + const q15_t *pTwiddle; /**< points to the twiddle factor table. */ + const q15_t *pCosFactor; /**< points to the cosFactor table. */ + arm_rfft_instance_q15 *pRfft; /**< points to the real FFT instance. */ + arm_cfft_radix4_instance_q15 *pCfft; /**< points to the complex FFT instance. */ + } arm_dct4_instance_q15; + + + /** + * @brief Initialization function for the Q15 DCT4/IDCT4. + * @param[in,out] S points to an instance of Q15 DCT4/IDCT4 structure. + * @param[in] S_RFFT points to an instance of Q15 RFFT/RIFFT structure. + * @param[in] S_CFFT points to an instance of Q15 CFFT/CIFFT structure. + * @param[in] N length of the DCT4. + * @param[in] Nby2 half of the length of the DCT4. + * @param[in] normalize normalizing factor. + * @return arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if N is not a supported transform length. + */ + arm_status arm_dct4_init_q15( + arm_dct4_instance_q15 * S, + arm_rfft_instance_q15 * S_RFFT, + arm_cfft_radix4_instance_q15 * S_CFFT, + uint16_t N, + uint16_t Nby2, + q15_t normalize); + + + /** + * @brief Processing function for the Q15 DCT4/IDCT4. + * @param[in] S points to an instance of the Q15 DCT4 structure. + * @param[in] pState points to state buffer. + * @param[in,out] pInlineBuffer points to the in-place input and output buffer. + */ + void arm_dct4_q15( + const arm_dct4_instance_q15 * S, + q15_t * pState, + q15_t * pInlineBuffer); + + + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _TRANSFORM_FUNCTIONS_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/transform_functions_f16.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/transform_functions_f16.h new file mode 100644 index 000000000..67f1adc21 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/transform_functions_f16.h @@ -0,0 +1,157 @@ +/****************************************************************************** + * @file transform_functions_f16.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 23 April 2021 + * Target Processor: Cortex-M and Cortex-A cores + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _TRANSFORM_FUNCTIONS_F16_H_ +#define _TRANSFORM_FUNCTIONS_F16_H_ + +#include "arm_math_types_f16.h" +#include "arm_math_memory.h" + +#include "dsp/none.h" +#include "dsp/utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + + +#if defined(ARM_FLOAT16_SUPPORTED) + + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const float16_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + float16_t onebyfftLen; /**< value of 1/fftLen. */ + } arm_cfft_radix2_instance_f16; + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const float16_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + float16_t onebyfftLen; /**< value of 1/fftLen. */ + } arm_cfft_radix4_instance_f16; + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + const float16_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t bitRevLength; /**< bit reversal table length. 
*/ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + const uint32_t *rearranged_twiddle_tab_stride1_arr; /**< Per stage reordered twiddle pointer (offset 1) */ \ + const uint32_t *rearranged_twiddle_tab_stride2_arr; /**< Per stage reordered twiddle pointer (offset 2) */ \ + const uint32_t *rearranged_twiddle_tab_stride3_arr; /**< Per stage reordered twiddle pointer (offset 3) */ \ + const float16_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */ \ + const float16_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */ \ + const float16_t *rearranged_twiddle_stride3; +#endif + } arm_cfft_instance_f16; + + + arm_status arm_cfft_init_f16( + arm_cfft_instance_f16 * S, + uint16_t fftLen); + + void arm_cfft_f16( + const arm_cfft_instance_f16 * S, + float16_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + /** + * @brief Instance structure for the floating-point RFFT/RIFFT function. + */ +typedef struct + { + arm_cfft_instance_f16 Sint; /**< Internal CFFT structure. */ + uint16_t fftLenRFFT; /**< length of the real sequence */ + const float16_t * pTwiddleRFFT; /**< Twiddle factors real stage */ + } arm_rfft_fast_instance_f16 ; + +arm_status arm_rfft_fast_init_f16 ( + arm_rfft_fast_instance_f16 * S, + uint16_t fftLen); + + + void arm_rfft_fast_f16( + const arm_rfft_fast_instance_f16 * S, + float16_t * p, float16_t * pOut, + uint8_t ifftFlag); + +/* Deprecated */ + arm_status arm_cfft_radix4_init_f16( + arm_cfft_radix4_instance_f16 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void arm_cfft_radix4_f16( + const arm_cfft_radix4_instance_f16 * S, + float16_t * pSrc); + + +/* Deprecated */ + arm_status arm_cfft_radix2_init_f16( + arm_cfft_radix2_instance_f16 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void arm_cfft_radix2_f16( + const arm_cfft_radix2_instance_f16 * S, + float16_t * pSrc); + +#endif /* defined(ARM_FLOAT16_SUPPORTED)*/ + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _TRANSFORM_FUNCTIONS_F16_H_ */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/utils.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/utils.h new file mode 100644 index 000000000..7f5acb374 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/utils.h @@ -0,0 +1,240 @@ +/****************************************************************************** + * @file arm_math_utils.h + * @brief Public header file for CMSIS DSP Library + * @version V1.9.0 + * @date 20. July 2020 + ******************************************************************************/ +/* + * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+
+#endif /* defined(ARM_FLOAT16_SUPPORTED)*/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _TRANSFORM_FUNCTIONS_F16_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/utils.h b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/utils.h
new file mode 100644
index 000000000..7f5acb374
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/DSP/Include/dsp/utils.h
@@ -0,0 +1,240 @@
+/******************************************************************************
+ * @file     arm_math_utils.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     20. July 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_UTILS_H_
+
+#define _ARM_MATH_UTILS_H_
+
+#include "arm_math_types.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * @brief Macros required for reciprocal calculation in Normalized LMS
+   */
+
+#define INDEX_MASK 0x0000003F
+
+#define SQ(x) ((x) * (x))
+
+#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
+
+  /**
+   * @brief Function to calculate the 1/in (reciprocal) value of Q31 data.
+   */
+  __STATIC_FORCEINLINE uint32_t arm_recip_q31(
+        q31_t in,
+        q31_t * dst,
+  const q31_t * pRecipTable)
+  {
+    q31_t out;
+    uint32_t tempVal;
+    uint32_t index, i;
+    uint32_t signBits;
+
+    if (in > 0)
+    {
+      signBits = ((uint32_t) (__CLZ( in) - 1));
+    }
+    else
+    {
+      signBits = ((uint32_t) (__CLZ(-in) - 1));
+    }
+
+    /* Convert input sample to 1.31 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated value */
+    index = (uint32_t)(in >> 24);
+    index = (index & INDEX_MASK);
+
+    /* 1.31 with exp 1 */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q63_t) in * out) >> 31);
+      tempVal = 0x7FFFFFFFu - tempVal;
+      /* 1.31 with exp 1 */
+      /* out = (q31_t) (((q63_t) out * tempVal) >> 30); */
+      out = clip_q63_to_q31(((q63_t) out * tempVal) >> 30);
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1U);
+  }
+
+  /**
+   * @brief Function to calculate the 1/in (reciprocal) value of Q15 data.
+   */
+  __STATIC_FORCEINLINE uint32_t arm_recip_q15(
+        q15_t in,
+        q15_t * dst,
+  const q15_t * pRecipTable)
+  {
+    q15_t out = 0;
+    uint32_t tempVal = 0;
+    uint32_t index = 0, i = 0;
+    uint32_t signBits = 0;
+
+    if (in > 0)
+    {
+      signBits = ((uint32_t)(__CLZ( in) - 17));
+    }
+    else
+    {
+      signBits = ((uint32_t)(__CLZ(-in) - 17));
+    }
+
+    /* Convert input sample to 1.15 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated value */
+    index = (uint32_t)(in >> 8);
+    index = (index & INDEX_MASK);
+
+    /* 1.15 with exp 1 */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q31_t) in * out) >> 15);
+      tempVal = 0x7FFFu - tempVal;
+      /* 1.15 with exp 1 */
+      out = (q15_t) (((q31_t) out * tempVal) >> 14);
+      /* out = clip_q31_to_q15(((q31_t) out * tempVal) >> 14); */
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1);
+  }
+
+/**
+ * @brief  64-bit to 32-bit unsigned normalization
+ * @param[in]  in           is input unsigned long long value
+ * @param[out] normalized   is the 32-bit normalized value
+ * @param[out] norm         is norm scale
+ */
+__STATIC_INLINE void arm_norm_64_to_32u(uint64_t in, int32_t * normalized, int32_t *norm)
+{
+    int32_t n1;
+    int32_t hi = (int32_t) (in >> 32);
+    int32_t lo = (int32_t) ((in << 32) >> 32);
+
+    n1 = __CLZ(hi) - 32;
+    if (!n1)
+    {
+        /*
+         * input fits in 32-bit
+         */
+        n1 = __CLZ(lo);
+        if (!n1)
+        {
+            /*
+             * MSB set, need to scale down by 1
+             */
+            *norm = -1;
+            *normalized = (((uint32_t) lo) >> 1);
+        } else
+        {
+            if (n1 == 32)
+            {
+                /*
+                 * input is zero
+                 */
+                *norm = 0;
+                *normalized = 0;
+            } else
+            {
+                /*
+                 * 32-bit normalization
+                 */
+                *norm = n1 - 1;
+                *normalized = lo << *norm;
+            }
+        }
+    } else
+    {
+        /*
+         * input fits in 64-bit
+         */
+        n1 = 1 - n1;
+        *norm = -n1;
+        /*
+         * 64 bit normalization
+         */
+        *normalized = (((uint32_t) lo) >> n1)
+            | (hi << (32 - n1));
+    }
+}
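+/*
+ * Editorial worked example (not part of the original header):
+ *
+ *   uint64_t in = 0x300000000ULL;   // hi = 3, lo = 0
+ *   int32_t normalized, norm;
+ *   arm_norm_64_to_32u(in, &normalized, &norm);
+ *   // normalized == 0x60000000, norm == -3: the 64-bit input was scaled
+ *   // down by 2^3 so that the result fits a 32-bit value with its
+ *   // second-most-significant bit set.
+ *
+ * The reciprocal helpers above refine a table estimate with two
+ * Newton-Raphson steps, x(k+1) = x(k) * (2 - in * x(k)), which roughly
+ * doubles the number of correct bits per iteration.
+ */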
+
+__STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
+{
+    q31_t result;
+    uint64_t absNum;
+    int32_t normalized;
+    int32_t norm;
+
+    /*
+     * if the numerator fits in 32 bits,
+     * avoid the costly 64-bit division
+     */
+    absNum = num > 0 ? num : -num;
+    arm_norm_64_to_32u(absNum, &normalized, &norm);
+    if (norm > 0)
+        /*
+         * 32-bit division
+         */
+        result = (q31_t) num / den;
+    else
+        /*
+         * 64-bit division
+         */
+        result = (q31_t) (num / den);
+
+    return result;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*ifndef _ARM_MATH_UTILS_H_ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nn_tables.h b/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nn_tables.h
new file mode 100644
index 000000000..35dfc3ba7
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nn_tables.h
@@ -0,0 +1,56 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_tables.h
+ * Description:  Extern declaration for NN tables
+ *
+ * $Date:        09. October 2020
+ * $Revision:    V.1.0.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_NN_TABLES_H
+#define _ARM_NN_TABLES_H
+
+#include "arm_math_types.h"
+
+/**
+* @brief tables for various activation functions
+*
+*/
+
+extern const q15_t sigmoidTable_q15[256];
+extern const q7_t sigmoidTable_q7[256];
+
+extern const q7_t tanhTable_q7[256];
+extern const q15_t tanhTable_q15[256];
+
+/**
+ * @brief 2-way tables for various activation functions
+ *
+ * 2-way table, H table for value larger than 1/4
+ * L table for value smaller than 1/4, H table for remaining
+ * We have this only for the q15_t version. It does not make
+ * sense to have it for q7_t type
+ */
+extern const q15_t sigmoidHTable_q15[192];
+extern const q15_t sigmoidLTable_q15[128];
+
+#endif /* _ARM_NN_TABLES_H */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nn_types.h b/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nn_types.h
new file mode 100644
index 000000000..c37123612
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nn_types.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_types.h
+ * Description:  Public header file to contain the CMSIS-NN structs for the
+ *               TensorFlowLite micro compliant functions
+ *
+ * $Date:        19. March 2021
+ * $Revision:    V.2.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+
+#ifndef _ARM_NN_TYPES_H
+#define _ARM_NN_TYPES_H
+
+#include <stdint.h>
+
+/** CMSIS-NN object to contain the width and height of a tile */
+typedef struct
+{
+    int32_t w; /**< Width */
+    int32_t h; /**< Height */
+} cmsis_nn_tile;
+
+/** CMSIS-NN object used for the function context. */
+typedef struct
+{
+    void *buf;    /**< Pointer to a buffer needed for the optimization */
+    int32_t size; /**< Buffer size */
+} cmsis_nn_context;
+
+/** CMSIS-NN object to contain the dimensions of the tensors */
+typedef struct
+{
+    int32_t n; /**< Generic dimension to contain either the batch size or output channels.
+                    Please refer to the function documentation for more information */
+    int32_t h; /**< Height */
+    int32_t w; /**< Width */
+    int32_t c; /**< Input channels */
+} cmsis_nn_dims;
+
+/** CMSIS-NN object for the per-channel quantization parameters */
+typedef struct
+{
+    int32_t *multiplier; /**< Multiplier values */
+    int32_t *shift;      /**< Shift values */
+} cmsis_nn_per_channel_quant_params;
+
+/** CMSIS-NN object for the per-tensor quantization parameters */
+typedef struct
+{
+    int32_t multiplier; /**< Multiplier value */
+    int32_t shift;      /**< Shift value */
+} cmsis_nn_per_tensor_quant_params;
+
+/** CMSIS-NN object for the quantized Relu activation */
+typedef struct
+{
+    int32_t min; /**< Min value used to clamp the result */
+    int32_t max; /**< Max value used to clamp the result */
+} cmsis_nn_activation;
+
+/** CMSIS-NN object for the convolution layer parameters */
+typedef struct
+{
+    int32_t input_offset;  /**< Zero value for the input tensor */
+    int32_t output_offset; /**< Zero value for the output tensor */
+    cmsis_nn_tile stride;
+    cmsis_nn_tile padding;
+    cmsis_nn_tile dilation;
+    cmsis_nn_activation activation;
+} cmsis_nn_conv_params;
+
+/** CMSIS-NN object for Depthwise convolution layer parameters */
+typedef struct
+{
+    int32_t input_offset;  /**< Zero value for the input tensor */
+    int32_t output_offset; /**< Zero value for the output tensor */
+    int32_t ch_mult;       /**< Channel Multiplier. ch_mult * in_ch = out_ch */
+    cmsis_nn_tile stride;
+    cmsis_nn_tile padding;
+    cmsis_nn_tile dilation;
+    cmsis_nn_activation activation;
+} cmsis_nn_dw_conv_params;
+
+/** CMSIS-NN object for pooling layer parameters */
+typedef struct
+{
+    cmsis_nn_tile stride;
+    cmsis_nn_tile padding;
+    cmsis_nn_activation activation;
+} cmsis_nn_pool_params;
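+/**
+ * @par Editorial sketch (not part of the original header)
+ * Populating the structs above for a 3x3, stride-1, same-padding s8
+ * convolution; every numeric value is an illustrative assumption.
+ * @code
+ * cmsis_nn_dims input_dims  = {.n = 1, .h = 32, .w = 32, .c = 16};
+ * cmsis_nn_dims filter_dims = {.n = 8, .h = 3, .w = 3, .c = 16};  // n = C_OUT
+ * cmsis_nn_dims output_dims = {.n = 1, .h = 32, .w = 32, .c = 8};
+ *
+ * cmsis_nn_conv_params conv_params = {
+ *     .input_offset  = 128,               // zero point of the input tensor
+ *     .output_offset = -128,              // zero point of the output tensor
+ *     .stride        = {.w = 1, .h = 1},
+ *     .padding       = {.w = 1, .h = 1},
+ *     .dilation      = {.w = 1, .h = 1},
+ *     .activation    = {.min = -128, .max = 127},
+ * };
+ * @endcode
+ */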
+
+/** CMSIS-NN object for Fully Connected layer parameters */
+typedef struct
+{
+    int32_t input_offset;  /**< Zero value for the input tensor */
+    int32_t filter_offset; /**< Zero value for the filter tensor. Not used */
+    int32_t output_offset; /**< Zero value for the output tensor */
+    cmsis_nn_activation activation;
+} cmsis_nn_fc_params;
+
+/** CMSIS-NN object for SVDF layer parameters */
+typedef struct
+{
+    int32_t rank;
+    int32_t input_offset;  /**< Zero value for the input tensor */
+    int32_t output_offset; /**< Zero value for the output tensor */
+    cmsis_nn_activation input_activation;
+    cmsis_nn_activation output_activation;
+} cmsis_nn_svdf_params;
+
+#endif // _ARM_NN_TYPES_H
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nnfunctions.h b/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nnfunctions.h
new file mode 100644
index 000000000..fc7824272
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nnfunctions.h
@@ -0,0 +1,2097 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nnfunctions.h
+ * Description:  Public header file for CMSIS NN Library
+ *
+ * $Date:        19 March 2021
+ * $Revision:    V.7.0.0
+ *
+ * Target Processor: Cortex-M CPUs
+ * -------------------------------------------------------------------- */
+
+/**
+   \mainpage CMSIS NN Software Library
+   *
+   * Introduction
+   * ------------
+   *
+   * This user manual describes the CMSIS NN software library,
+   * a collection of efficient neural network kernels developed to maximize the
+   * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
+   *
+   * The library is divided into a number of functions each covering a specific category:
+   * - Convolution Functions
+   * - Activation Functions
+   * - Fully-connected Layer Functions
+   * - SVDF Layer Functions
+   * - Pooling Functions
+   * - Softmax Functions
+   * - Basic math Functions
+   *
+   * The library has separate functions for operating on different weight and activation data
+   * types, including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
+   * kernels is included in the function description. The implementation details are also
+   * described in this paper [1].
+   *
+   * Function Classification
+   * --------
+   * The functions can be classified into two segments:
+   * - Legacy functions supporting ARM's internal symmetric quantization (8 bits).
+   * - Functions that support the TensorFlow Lite framework with symmetric quantization (8 bits).
+   *
+   * The legacy functions can be identified by their _q7 or _q15 suffix; no new development is
+   * done there. The article in [2] describes in detail how to run a network using the legacy
+   * functions.
+   *
+   * The functions supporting the TensorFlow Lite framework are identified by the _s8 suffix and
+   * can be invoked from TFL micro. The functions are bit exact to TensorFlow Lite.
+   * Refer to TensorFlow's documentation in [3] on how to run
+   * a TensorFlow Lite model using optimized CMSIS-NN kernels.
+   *
+   * Block Diagram
+   * --------
+   * \image html CMSIS-NN-OVERVIEW.PNG
+   *
+   * Examples
+   * --------
+   *
+   * The library ships with a number of examples which demonstrate how to use the library functions.
+   *
+   * Pre-processor Macros
+   * ------------
+   *
+   * Each library project has different pre-processor macros.
+   *
+   * - ARM_MATH_DSP:
+   *
+   * Define the macro ARM_MATH_DSP if the silicon supports DSP instructions (DSP extension).
+   *
+   * - ARM_MATH_MVEI:
+   *
+   * Define the macro ARM_MATH_MVEI if the silicon supports the M-Profile Vector Extension.
+   *
+   * - ARM_MATH_AUTOVECTORIZE
+   * Used in conjunction with ARM_MATH_MVEI to let the compiler auto-vectorize the functions that
+   * use inline assembly. It does not affect functions that use C or intrinsics.
+   * - ARM_MATH_BIG_ENDIAN:
+   *
+   * Define the macro ARM_MATH_BIG_ENDIAN to build the library for big-endian targets. This is
+   * supported only for the legacy functions, i.e., functions targeted at TensorFlow Lite do not
+   * support big endianness. By default, the library builds for little-endian targets.
+   *
+   * - ARM_NN_TRUNCATE:
+   *
+   * Define the macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
+   *
+   *
+   * Copyright Notice
+   * ------------
+   *
+   * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
+   *
+   * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
+   *
+   * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
+   *     https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
+   * [3] https://www.tensorflow.org/lite/microcontrollers/library
+   *
+   * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
+   */
+
+/**
+ * @defgroup groupNN Neural Network Functions
+ * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
+ * the TensorFlow Lite framework.
+ */
+
+#ifndef _ARM_NNFUNCTIONS_H
+#define _ARM_NNFUNCTIONS_H
+
+#include "arm_math_types.h"
+#include "arm_nn_types.h"
+
+#define USE_INTRINSIC
+
+//#define ARM_NN_TRUNCATE /* This configures the rounding model: floor, or round to the nearest int */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Struct for specifying activation function types
+ *
+ */
+typedef enum
+{
+    ARM_SIGMOID = 0,
+    /**< Sigmoid activation function */
+    ARM_TANH = 1,
+    /**< Tanh activation function */
+} arm_nn_activation_type;
+
+/**
+ * @defgroup NNConv Convolution Functions
+ *
+ * Collection of convolution, depthwise convolution functions and their variants.
+ *
+ * The convolution is implemented in 2 steps: im2col and GEMM
+ *
+ * im2col is a process of converting each patch of image data into
+ * a column. After im2col, the convolution is computed as matrix-matrix
+ * multiplication.
+ *
+ * To reduce the memory footprint, the im2col is performed partially.
+ * In each iteration, only a few columns (i.e., patches) are generated and
+ * computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
+ *
+ */
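+/**
+ * @par im2col illustration (editorial sketch, not part of the library)
+ * One im2col column gathers the receptive field of a single output pixel;
+ * the convolution then reduces to a dot product of that column with each
+ * filter row. All variable names below are assumptions for illustration;
+ * padded positions would read the input zero point instead.
+ * @code
+ * // Gather the column for output position (oy, ox), HWC layout, no padding.
+ * int idx = 0;
+ * for (int ky = 0; ky < kh; ky++)
+ *     for (int kx = 0; kx < kw; kx++)
+ *         for (int c = 0; c < ch_in; c++)
+ *             col[idx++] = input[((oy * stride_h + ky) * w_in +
+ *                                 (ox * stride_w + kx)) * ch_in + c];
+ * // output[oy][ox][f] = dot(col, filter_row[f]) + bias[f], for each filter f
+ * @endcode
+ */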
+
+/**
+ * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
+ *        cmsis-nn to perform the convolution.
+ *
+ * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
+ *                                arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required
+ * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
+ *                                Range of conv_params->input_offset  : [-127, 128]
+ *                                Range of conv_params->output_offset : [-128, 127]
+ * @param[in]      quant_params   Per-channel quantization info.
+ *                                It contains the multiplier and shift values to be applied to each output channel
+ * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]      input_data     Input (activation) data pointer. Data type: int8
+ * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
+ *                                spatial filter dimensions
+ * @param[in]      filter_data    Filter data pointer. Data type: int8
+ * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data      Bias data pointer. Data type: int32
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out]     output_data    Output data pointer. Data type: int8
+ *
+ * @return     The function returns either
+ *                  ARM_MATH_SIZE_MISMATCH if argument constraints fail, or
+ *                  ARM_MATH_SUCCESS on successful completion.
+ *
+ */
+arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
+                                   const cmsis_nn_conv_params *conv_params,
+                                   const cmsis_nn_per_channel_quant_params *quant_params,
+                                   const cmsis_nn_dims *input_dims,
+                                   const q7_t *input_data,
+                                   const cmsis_nn_dims *filter_dims,
+                                   const q7_t *filter_data,
+                                   const cmsis_nn_dims *bias_dims,
+                                   const int32_t *bias_data,
+                                   const cmsis_nn_dims *output_dims,
+                                   q7_t *output_data);
+
+/**
+ * @brief Get the required buffer size for arm_convolve_wrapper_s8
+ *
+ * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
+ *                                Range of conv_params->input_offset  : [-127, 128]
+ *                                Range of conv_params->output_offset : [-128, 127]
+ * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
+ * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
+ *                                filter dimensions
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
+ *
+ * @return         The function returns the required buffer size (bytes)
+ *
+ */
+int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
+                                                const cmsis_nn_dims *input_dims,
+                                                const cmsis_nn_dims *filter_dims,
+                                                const cmsis_nn_dims *output_dims);
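+/**
+ * @par Usage sketch (editorial illustration, not part of the original header)
+ * Typical call sequence: query the scratch size, provide it through the
+ * context, then invoke the wrapper. The allocation strategy and the
+ * surrounding variables (conv_params, dims, data pointers) are assumptions.
+ * @code
+ * cmsis_nn_context ctx;
+ * ctx.size = arm_convolve_wrapper_s8_get_buffer_size(&conv_params, &input_dims,
+ *                                                    &filter_dims, &output_dims);
+ * ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;  // or a static arena
+ *
+ * arm_status status = arm_convolve_wrapper_s8(&ctx, &conv_params, &quant_params,
+ *                                             &input_dims, input_data,
+ *                                             &filter_dims, filter_data,
+ *                                             &bias_dims, bias_data,
+ *                                             &output_dims, output_data);
+ * @endcode
+ */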
+
+/**
+ * @brief Basic s8 convolution function
+ * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
+ *                                arm_convolve_s8_get_buffer_size will return the buffer_size if required
+ * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
+ *                                Range of conv_params->input_offset  : [-127, 128]
+ *                                Range of conv_params->output_offset : [-128, 127]
+ * @param[in]      quant_params   Per-channel quantization info.
+ *                                It contains the multiplier and shift values to be applied to each output channel
+ * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]      input_data     Input (activation) data pointer. Data type: int8
+ * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
+ *                                spatial filter dimensions
+ * @param[in]      filter_data    Filter data pointer. Data type: int8
+ * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data      Optional bias data pointer. Data type: int32
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out]     output_data    Output data pointer. Data type: int8
+
+ * @return     The function returns ARM_MATH_SUCCESS
+ *
+ * @details
+ *    1. Supported framework: TensorFlow Lite micro
+ *    2. q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
+ *    3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
+ *
+ */
+arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
+                           const cmsis_nn_conv_params *conv_params,
+                           const cmsis_nn_per_channel_quant_params *quant_params,
+                           const cmsis_nn_dims *input_dims,
+                           const q7_t *input_data,
+                           const cmsis_nn_dims *filter_dims,
+                           const q7_t *filter_data,
+                           const cmsis_nn_dims *bias_dims,
+                           const int32_t *bias_data,
+                           const cmsis_nn_dims *output_dims,
+                           q7_t *output_data);
+
+/**
+ * @brief Get the required buffer size for the s8 convolution function
+ *
+ * @param[in]    input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]    filter_dims   Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
+ *                             are the spatial filter dimensions
+ * @return       The function returns the required buffer size (bytes)
+ *
+ */
+int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
+
+/**
+ * @brief Basic Q7 convolution function
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       wt          pointer to kernel weights
+ * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       bias        pointer to bias
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   bufferB     pointer to buffer space for output
+ * @return     The function returns ARM_MATH_SUCCESS
+ *
+ */
+arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
+                                     const uint16_t dim_im_in,
+                                     const uint16_t ch_im_in,
+                                     const q7_t *wt,
+                                     const uint16_t ch_im_out,
+                                     const uint16_t dim_kernel,
+                                     const uint16_t padding,
+                                     const uint16_t stride,
+                                     const q7_t *bias,
+                                     const uint16_t bias_shift,
+                                     const uint16_t out_shift,
+                                     q7_t *Im_out,
+                                     const uint16_t dim_im_out,
+                                     q15_t *bufferA,
+                                     q7_t *bufferB);
+
+/**
+ * @brief Basic Q7 convolution function (non-square shape)
+ * @param[in]       Im_in        pointer to input tensor
+ * @param[in]       dim_im_in_x  input tensor dimension x
+ * @param[in]       dim_im_in_y  input tensor dimension y
+ * @param[in]       ch_im_in     number of input tensor channels
+ * @param[in]       wt           pointer to kernel weights
+ * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel_x filter kernel size x
+ * @param[in]       dim_kernel_y filter kernel size y
+ * @param[in]       padding_x    padding size x
+ * @param[in]       padding_y    padding size y
+ * @param[in]       stride_x     convolution stride x
+ * @param[in]       stride_y     convolution stride y
+ * @param[in]       bias         pointer to bias
+ * @param[in]       bias_shift   amount of left-shift for bias
+ * @param[in]       out_shift    amount of right-shift for output
+ * @param[in,out]   Im_out       pointer to output tensor
+ * @param[in]       dim_im_out_x output tensor dimension x
+ * @param[in]       dim_im_out_y output tensor dimension y
+ * @param[in,out]   bufferA      pointer to buffer space for input
+ * @param[in,out]   bufferB      pointer to buffer space for output
+ * @return     The function returns ARM_MATH_SUCCESS
+ */
+arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
+                                               const uint16_t dim_im_in_x,
+                                               const uint16_t dim_im_in_y,
+                                               const uint16_t ch_im_in,
+                                               const q7_t *wt,
+                                               const uint16_t ch_im_out,
+                                               const uint16_t dim_kernel_x,
+                                               const uint16_t dim_kernel_y,
+                                               const uint16_t padding_x,
+                                               const uint16_t padding_y,
+                                               const uint16_t stride_x,
+                                               const uint16_t stride_y,
+                                               const q7_t *bias,
+                                               const uint16_t bias_shift,
+                                               const uint16_t out_shift,
+                                               q7_t *Im_out,
+                                               const uint16_t dim_im_out_x,
+                                               const uint16_t dim_im_out_y,
+                                               q15_t *bufferA,
+                                               q7_t *bufferB);
+
+/**
+ * @brief Basic Q15 convolution function
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       wt          pointer to kernel weights
+ * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       bias        pointer to bias
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   bufferB     pointer to buffer space for output
+ * @return     The function returns ARM_MATH_SUCCESS
+ *
+ */
+arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
+                                      const uint16_t dim_im_in,
+                                      const uint16_t ch_im_in,
+                                      const q15_t *wt,
+                                      const uint16_t ch_im_out,
+                                      const uint16_t dim_kernel,
+                                      const uint16_t padding,
+                                      const uint16_t stride,
+                                      const q15_t *bias,
+                                      const uint16_t bias_shift,
+                                      const uint16_t out_shift,
+                                      q15_t *Im_out,
+                                      const uint16_t dim_im_out,
+                                      q15_t *bufferA,
+                                      q7_t *bufferB);
+
+/**
+ * @brief Fast Q7 convolution function
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       wt          pointer to kernel weights
+ * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       bias        pointer to bias
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   bufferB     pointer to buffer space for output
+ * @return     The function returns either
+ *             ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * This function is the version with the full list of optimization tricks, but with
+ * some constraints:
+ *   ch_im_in is multiple of 4
+ *   ch_im_out is multiple of 2
+ */
+arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
+                                    const uint16_t dim_im_in,
+                                    const uint16_t ch_im_in,
+                                    const q7_t *wt,
+                                    const uint16_t ch_im_out,
+                                    const uint16_t dim_kernel,
+                                    const uint16_t padding,
+                                    const uint16_t stride,
+                                    const q7_t *bias,
+                                    const uint16_t bias_shift,
+                                    const uint16_t out_shift,
+                                    q7_t *Im_out,
+                                    const uint16_t dim_im_out,
+                                    q15_t *bufferA,
+                                    q7_t *bufferB);
+
+/**
+ * @brief Fast Q7 convolution function (non-square shape)
+ * @param[in]       Im_in        pointer to input tensor
+ * @param[in]       dim_im_in_x  input tensor dimension x
+ * @param[in]       dim_im_in_y  input tensor dimension y
+ * @param[in]       ch_im_in     number of input tensor channels
+ * @param[in]       wt           pointer to kernel weights
+ * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel_x filter kernel size x
+ * @param[in]       dim_kernel_y filter kernel size y
+ * @param[in]       padding_x    padding size x
+ * @param[in]       padding_y    padding size y
+ * @param[in]       stride_x     convolution stride x
+ * @param[in]       stride_y     convolution stride y
+ * @param[in]       bias         pointer to bias
+ * @param[in]       bias_shift   amount of left-shift for bias
+ * @param[in]       out_shift    amount of right-shift for output
+ * @param[in,out]   Im_out       pointer to output tensor
+ * @param[in]       dim_im_out_x output tensor dimension x
+ * @param[in]       dim_im_out_y output tensor dimension y
+ * @param[in,out]   bufferA      pointer to buffer space for input
+ * @param[in,out]   bufferB      pointer to buffer space for output
+ * @return     The function returns either
+ *             ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * This function is the version with the full list of optimization tricks, but with
+ * some constraints:
+ *   ch_im_in is multiple of 4
+ *   ch_im_out is multiple of 2
+ */
+
+arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
+                                              const uint16_t dim_im_in_x,
+                                              const uint16_t dim_im_in_y,
+                                              const uint16_t ch_im_in,
+                                              const q7_t *wt,
+                                              const uint16_t ch_im_out,
+                                              const uint16_t dim_kernel_x,
+                                              const uint16_t dim_kernel_y,
+                                              const uint16_t padding_x,
+                                              const uint16_t padding_y,
+                                              const uint16_t stride_x,
+                                              const uint16_t stride_y,
+                                              const q7_t *bias,
+                                              const uint16_t bias_shift,
+                                              const uint16_t out_shift,
+                                              q7_t *Im_out,
+                                              const uint16_t dim_im_out_x,
+                                              const uint16_t dim_im_out_y,
+                                              q15_t *bufferA,
+                                              q7_t *bufferB);
+
+/**
+ * @brief Fast Q7 version of 1x1 convolution (non-square shape)
+ * @param[in]       Im_in        pointer to input tensor
+ * @param[in]       dim_im_in_x  input tensor dimension x
+ * @param[in]       dim_im_in_y  input tensor dimension y
+ * @param[in]       ch_im_in     number of input tensor channels
+ * @param[in]       wt           pointer to kernel weights
+ * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel_x filter kernel size x
+ * @param[in]       dim_kernel_y filter kernel size y
+ * @param[in]       padding_x    padding size x
+ * @param[in]       padding_y    padding size y
+ * @param[in]       stride_x     convolution stride x
+ * @param[in]       stride_y     convolution stride y
+ * @param[in]       bias         pointer to bias
+ * @param[in]       bias_shift   amount of left-shift for bias
+ * @param[in]       out_shift    amount of right-shift for output
+ * @param[in,out]   Im_out       pointer to output tensor
+ * @param[in]       dim_im_out_x output tensor dimension x
+ * @param[in]       dim_im_out_y output tensor dimension y
+ * @param[in,out]   bufferA      pointer to buffer space for input
+ * @param[in,out]   bufferB      pointer to buffer space for output
+ * @return     The function returns either
+ *                  ARM_MATH_SIZE_MISMATCH if argument constraints fail, or
+ *                  ARM_MATH_SUCCESS on successful completion.
+ *
+ * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
+ * and dim_kernel_y=1). It can be used for the
+ * second half of MobileNets after depthwise separable convolution.
+ *
+ * This function is the version with the full list of optimization tricks, but with
+ * some constraints:
+ *   ch_im_in is multiple of 4
+ *   ch_im_out is multiple of 2
+ */
+arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
+                                                  const uint16_t dim_im_in_x,
+                                                  const uint16_t dim_im_in_y,
+                                                  const uint16_t ch_im_in,
+                                                  const q7_t *wt,
+                                                  const uint16_t ch_im_out,
+                                                  const uint16_t dim_kernel_x,
+                                                  const uint16_t dim_kernel_y,
+                                                  const uint16_t padding_x,
+                                                  const uint16_t padding_y,
+                                                  const uint16_t stride_x,
+                                                  const uint16_t stride_y,
+                                                  const q7_t *bias,
+                                                  const uint16_t bias_shift,
+                                                  const uint16_t out_shift,
+                                                  q7_t *Im_out,
+                                                  const uint16_t dim_im_out_x,
+                                                  const uint16_t dim_im_out_y,
+                                                  q15_t *bufferA,
+                                                  q7_t *bufferB);
+
+/**
+ * @brief Fast s8 version for 1x1 convolution (non-square shape)
+ *
+ * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
+ *                                arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required
+ * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
+ *                                Range of conv_params->input_offset  : [-127, 128]
+ *                                Range of conv_params->output_offset : [-128, 127]
+ * @param[in]      quant_params   Per-channel quantization info.
+ *                                It contains the multiplier and shift values to be applied to each output channel
+ * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]      input_data     Input (activation) data pointer. Data type: int8
+ * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
+ * @param[in]      filter_data    Filter data pointer. Data type: int8
+ * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data      Optional bias data pointer. Data type: int32
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out]     output_data    Output data pointer. Data type: int8
+ *
+ * @return     The function returns either
+ *                  ARM_MATH_SIZE_MISMATCH if argument constraints fail, or
+ *                  ARM_MATH_SUCCESS on successful completion.
+ *
+ * @details
+ *   - Supported framework : TensorFlow Lite Micro
+ *   - The following constraints on the arguments apply
+ *      -# input_dims->c is a multiple of 4
+ *      -# conv_params->padding.w = conv_params->padding.h = 0
+ *      -# conv_params->stride.w = conv_params->stride.h = 1
+ *
+ */
+arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
+                                    const cmsis_nn_conv_params *conv_params,
+                                    const cmsis_nn_per_channel_quant_params *quant_params,
+                                    const cmsis_nn_dims *input_dims,
+                                    const q7_t *input_data,
+                                    const cmsis_nn_dims *filter_dims,
+                                    const q7_t *filter_data,
+                                    const cmsis_nn_dims *bias_dims,
+                                    const int32_t *bias_data,
+                                    const cmsis_nn_dims *output_dims,
+                                    q7_t *output_data);
+
+/**
+ * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
+ *
+ * @param[in]       input_dims            Input (activation) dimensions
+ * @return          The function returns the required buffer size in bytes
+ *
+ */
+int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
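+/**
+ * @par Editorial note (not part of the original header)
+ * The constraints documented for arm_convolve_1x1_s8_fast() can be checked
+ * up front before selecting this kernel; arm_convolve_wrapper_s8() performs
+ * an equivalent dispatch internally. Variable names are assumptions.
+ * @code
+ * int use_1x1_fast = (input_dims.c % 4 == 0) &&
+ *                    (conv_params.padding.w == 0) && (conv_params.padding.h == 0) &&
+ *                    (conv_params.stride.w == 1) && (conv_params.stride.h == 1);
+ * @endcode
+ */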
+
+/**
+ * @brief 1xn convolution
+ *
+ * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
+ *                                arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required
+ * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
+ *                                Range of conv_params->input_offset  : [-127, 128]
+ *                                Range of conv_params->output_offset : [-128, 127]
+ * @param[in]      quant_params   Per-channel quantization info.
+ *                                It contains the multiplier and shift values to be applied to each output channel
+ * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]      input_data     Input (activation) data pointer. Data type: int8
+ * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
+ *                                spatial filter dimension
+ * @param[in]      filter_data    Filter data pointer. Data type: int8
+ * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data      Optional bias data pointer. Data type: int32
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out]     output_data    Output data pointer. Data type: int8
+ *
+ * @return     The function returns either
+ *                  ARM_MATH_SIZE_MISMATCH if argument constraints fail, or
+ *                  ARM_MATH_SUCCESS on successful completion.
+ *
+ * @details
+ *   - Supported framework : TensorFlow Lite Micro
+ *   - The following constraints on the arguments apply
+ *      -# input_dims->n equals 1
+ *      -# output_dims->w is a multiple of 4
+ *      -# Explicit constraints (since it is for 1xN convolution)
+ *      -## input_dims->h equals 1
+ *      -## output_dims->h equals 1
+ *      -## filter_dims->h equals 1
+ * @todo  Remove constraint on output_dims->w to make the function generic.
+ *
+ */
+arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
+                                 const cmsis_nn_conv_params *conv_params,
+                                 const cmsis_nn_per_channel_quant_params *quant_params,
+                                 const cmsis_nn_dims *input_dims,
+                                 const q7_t *input_data,
+                                 const cmsis_nn_dims *filter_dims,
+                                 const q7_t *filter_data,
+                                 const cmsis_nn_dims *bias_dims,
+                                 const int32_t *bias_data,
+                                 const cmsis_nn_dims *output_dims,
+                                 q7_t *output_data);
+
+/**
+ * @brief Get the required additional buffer size for 1xn convolution
+ *
+ * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
+ *                                        horizontal spatial filter dimension
+ * @return          The function returns the required buffer size (bytes)
+ *
+ */
+int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
+
+/**
+ * @brief Q7 version of convolution for RGB image
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       wt          pointer to kernel weights
+ * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       bias        pointer to bias
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   bufferB     pointer to buffer space for output
+ * @return     The function returns either
+ *             ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * This kernel is written exclusively for convolution with ch_im_in
+ * equal to 3. This applies to the first layer of CNNs, which has an input
+ * image in RGB format.
+ */
+
+arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
+                                   const uint16_t dim_im_in,
+                                   const uint16_t ch_im_in,
+                                   const q7_t *wt,
+                                   const uint16_t ch_im_out,
+                                   const uint16_t dim_kernel,
+                                   const uint16_t padding,
+                                   const uint16_t stride,
+                                   const q7_t *bias,
+                                   const uint16_t bias_shift,
+                                   const uint16_t out_shift,
+                                   q7_t *Im_out,
+                                   const uint16_t dim_im_out,
+                                   q15_t *bufferA,
+                                   q7_t *bufferB);
+
+/**
+ * @brief Fast Q15 convolution function
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       wt          pointer to kernel weights
+ * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       bias        pointer to bias
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   bufferB     pointer to buffer space for output
+ * @return     The function returns either
+ *             ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * This function is the version with the full list of optimization tricks, but with
+ * some constraints:
+ *   ch_im_in is multiple of 2
+ *   ch_im_out is multiple of 2
+ */
+
+arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
+                                     const uint16_t dim_im_in,
+                                     const uint16_t ch_im_in,
+                                     const q15_t *wt,
+                                     const uint16_t ch_im_out,
+                                     const uint16_t dim_kernel,
+                                     const uint16_t padding,
+                                     const uint16_t stride,
+                                     const q15_t *bias,
+                                     const uint16_t bias_shift,
+                                     const uint16_t out_shift,
+                                     q15_t *Im_out,
+                                     const uint16_t dim_im_out,
+                                     q15_t *bufferA,
+                                     q7_t *bufferB);
+
+/**
+ * @brief Fast Q15 convolution function (non-square shape)
+ * @param[in]       Im_in        pointer to input tensor
+ * @param[in]       dim_im_in_x  input tensor dimension x
+ * @param[in]       dim_im_in_y  input tensor dimension y
+ * @param[in]       ch_im_in     number of input tensor channels
+ * @param[in]       wt           pointer to kernel weights
+ * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel_x filter kernel size x
+ * @param[in]       dim_kernel_y filter kernel size y
+ * @param[in]       padding_x    padding size x
+ * @param[in]       padding_y    padding size y
+ * @param[in]       stride_x     convolution stride x
+ * @param[in]       stride_y     convolution stride y
+ * @param[in]       bias         pointer to bias
+ * @param[in]       bias_shift   amount of left-shift for bias
+ * @param[in]       out_shift    amount of right-shift for output
+ * @param[in,out]   Im_out       pointer to output tensor
+ * @param[in]       dim_im_out_x output tensor dimension x
+ * @param[in]       dim_im_out_y output tensor dimension y
+ * @param[in,out]   bufferA      pointer to buffer space for input
+ * @param[in,out]   bufferB      pointer to buffer space for output
+ * @return     The function returns either
+ *             ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * @details
+ *
+ * Buffer size:
+ *
+ * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+ *
+ * bufferB size: 0
+ *
+ * Input dimension constraints:
+ *
+ * ch_im_in is multiple of 2
+ *
+ * ch_im_out is multiple of 2
+ *
+ */
+
+arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
+                                               const uint16_t dim_im_in_x,
+                                               const uint16_t dim_im_in_y,
+                                               const uint16_t ch_im_in,
+                                               const q15_t *wt,
+                                               const uint16_t ch_im_out,
+                                               const uint16_t dim_kernel_x,
+                                               const uint16_t dim_kernel_y,
+                                               const uint16_t padding_x,
+                                               const uint16_t padding_y,
+                                               const uint16_t stride_x,
+                                               const uint16_t stride_y,
+                                               const q15_t *bias,
+                                               const uint16_t bias_shift,
+                                               const uint16_t out_shift,
+                                               q15_t *Im_out,
+                                               const uint16_t dim_im_out_x,
+                                               const uint16_t dim_im_out_y,
+                                               q15_t *bufferA,
+                                               q7_t *bufferB);
+
+/**
+ * @brief Q7 depthwise separable convolution function
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       wt          pointer to kernel weights
+ * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       bias        pointer to bias
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   bufferB     pointer to buffer space for output
+ * @return     The function returns either
+ *             ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * This function is the version with the full list of optimization tricks, but with
+ * some constraints:
+ *   ch_im_in is multiple of 2
+ *   ch_im_out is multiple of 2
+ */
+
+arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
+                                               const uint16_t dim_im_in,
+                                               const uint16_t ch_im_in,
+                                               const q7_t *wt,
+                                               const uint16_t ch_im_out,
+                                               const uint16_t dim_kernel,
+                                               const uint16_t padding,
+                                               const uint16_t stride,
+                                               const q7_t *bias,
+                                               const uint16_t bias_shift,
+                                               const uint16_t out_shift,
+                                               q7_t *Im_out,
+                                               const uint16_t dim_im_out,
+                                               q15_t *bufferA,
+                                               q7_t *bufferB);
+
+/**
+ * @brief Q7 depthwise separable convolution function (non-square shape)
+ * @param[in]       Im_in        pointer to input tensor
+ * @param[in]       dim_im_in_x  input tensor dimension x
+ * @param[in]       dim_im_in_y  input tensor dimension y
+ * @param[in]       ch_im_in     number of input tensor channels
+ * @param[in]       wt           pointer to kernel weights
+ * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel_x filter kernel size x
+ * @param[in]       dim_kernel_y filter kernel size y
+ * @param[in]       padding_x    padding sizes x
+ * @param[in]       padding_y    padding sizes y
+ * @param[in]       stride_x     convolution stride x
+ * @param[in]       stride_y     convolution stride y
+ * @param[in]       bias         pointer to bias
+ * @param[in]       bias_shift   amount of left-shift for bias
+ * @param[in]       out_shift    amount of right-shift for output
+ * @param[in,out]   Im_out       pointer to output tensor
+ * @param[in]       dim_im_out_x output tensor dimension x
+ * @param[in]       dim_im_out_y output tensor dimension y
+ * @param[in,out]   bufferA      pointer to buffer space for input
+ * @param[in,out]   bufferB      pointer to buffer space for output
+ * @return     The function returns either
+ *             ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * This function is the version with the full list of optimization tricks, but with
+ * some constraints:
+ *   ch_im_in is multiple of 2
+ *   ch_im_out is multiple of 2
+ */
+arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
+                                                         const uint16_t dim_im_in_x,
+                                                         const uint16_t dim_im_in_y,
+                                                         const uint16_t ch_im_in,
+                                                         const q7_t *wt,
+                                                         const uint16_t ch_im_out,
+                                                         const uint16_t dim_kernel_x,
+                                                         const uint16_t dim_kernel_y,
+                                                         const uint16_t padding_x,
+                                                         const uint16_t padding_y,
+                                                         const uint16_t stride_x,
+                                                         const uint16_t stride_y,
+                                                         const q7_t *bias,
+                                                         const uint16_t bias_shift,
+                                                         const uint16_t out_shift,
+                                                         q7_t *Im_out,
+                                                         const uint16_t dim_im_out_x,
+                                                         const uint16_t dim_im_out_y,
+                                                         q15_t *bufferA,
+                                                         q7_t *bufferB);
+
+/**
+ * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
+ *
+ * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
+ *                                 definition file to see if an additional buffer is required.
+ *                                 Optional function {API}_get_buffer_size() provides the buffer
+ *                                 size if required.
+ * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
+ *                                 dw_conv_params->dilation is not used.
+ *                                 Range of dw_conv_params->input_offset  : [-127, 128]
+ *                                 Range of dw_conv_params->output_offset : [-128, 127]
+ * @param[in]      quant_params    Per-channel quantization info.
+ *                                 It contains the multiplier and shift values to be applied to each
+ *                                 output channel
+ * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
+ *                                 Batch argument N is not used and assumed to be 1.
+ * @param[in]      input_data      Input (activation) data pointer. Data type: int8
+ * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
+ * @param[in]      filter_data     Filter data pointer. Data type: int8
+ * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data       Bias data pointer. Data type: int32
+ * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
+ * @param[in, out] output_data     Output data pointer. Data type: int8
+ * @return     The function returns
+ *                 ARM_MATH_SUCCESS   -  Successful completion.
+ *
+ * @details
+ *    - Supported framework: TensorFlow Lite
+ *    - Picks one of the following functions
+ *        -# arm_depthwise_conv_s8()
+ *        -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
+ *        -# arm_depthwise_conv_s8_opt()
+ *    - q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
+ *    - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
+ *      boundary.
+ */
+arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
+                                         const cmsis_nn_dw_conv_params *dw_conv_params,
+                                         const cmsis_nn_per_channel_quant_params *quant_params,
+                                         const cmsis_nn_dims *input_dims,
+                                         const q7_t *input_data,
+                                         const cmsis_nn_dims *filter_dims,
+                                         const q7_t *filter_data,
+                                         const cmsis_nn_dims *bias_dims,
+                                         const int32_t *bias_data,
+                                         const cmsis_nn_dims *output_dims,
+                                         q7_t *output_data);
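+/**
+ * @par Usage sketch (editorial illustration, not part of the original header)
+ * Same pattern as the convolution wrapper: size the scratch buffer first,
+ * then dispatch. Note that ch_mult * C_IN = C_OUT for the depthwise
+ * parameters; the surrounding variables are assumptions.
+ * @code
+ * cmsis_nn_context ctx;
+ * ctx.size = arm_depthwise_conv_wrapper_s8_get_buffer_size(&dw_conv_params,
+ *                                                          &input_dims,
+ *                                                          &filter_dims,
+ *                                                          &output_dims);
+ * ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
+ *
+ * arm_status status = arm_depthwise_conv_wrapper_s8(&ctx, &dw_conv_params,
+ *                                                   &quant_params,
+ *                                                   &input_dims, input_data,
+ *                                                   &filter_dims, filter_data,
+ *                                                   &bias_dims, bias_data,
+ *                                                   &output_dims, output_data);
+ * @endcode
+ */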
+
+/**
+ * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
+ *
+ * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
+ *                                 dw_conv_params->dilation is not used.
+ *                                 Range of dw_conv_params->input_offset  : [-127, 128]
+ *                                 Range of dw_conv_params->output_offset : [-128, 127]
+ * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
+ *                                 Batch argument N is not used and assumed to be 1.
+ * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
+ * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
+ * @return         Size of additional memory required for optimizations in bytes.
+ *
+ */
+int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
+                                                      const cmsis_nn_dims *input_dims,
+                                                      const cmsis_nn_dims *filter_dims,
+                                                      const cmsis_nn_dims *output_dims);
+
+/**
+ * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
+ *
+ * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
+ *                                 definition file to see if an additional buffer is required.
+ *                                 Optional function {API}_get_buffer_size() provides the buffer
+ *                                 size if an additional buffer is required.
+ * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
+ *                                 dw_conv_params->dilation is not used.
+ *                                 Range of dw_conv_params->input_offset  : [-127, 128]
+ *                                 Range of dw_conv_params->output_offset : [-128, 127]
+ * @param[in]      quant_params    Per-channel quantization info.
+ *                                 It contains the multiplier and shift values to be applied to each
+ *                                 output channel
+ * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
+ *                                 Batch argument N is not used.
+ * @param[in]      input_data      Input (activation) data pointer. Data type: int8
+ * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
+ * @param[in]      filter_data     Filter data pointer. Data type: int8
+ * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data       Bias data pointer. Data type: int32
+ * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
+ * @param[in, out] output_data     Output data pointer. Data type: int8
+ * @return     The function returns ARM_MATH_SUCCESS
+ *
+ * @details
+ *    - Supported framework: TensorFlow Lite
+ *    - q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
+ */
+arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
+                                 const cmsis_nn_dw_conv_params *dw_conv_params,
+                                 const cmsis_nn_per_channel_quant_params *quant_params,
+                                 const cmsis_nn_dims *input_dims,
+                                 const q7_t *input_data,
+                                 const cmsis_nn_dims *filter_dims,
+                                 const q7_t *filter_data,
+                                 const cmsis_nn_dims *bias_dims,
+                                 const int32_t *bias_data,
+                                 const cmsis_nn_dims *output_dims,
+                                 q7_t *output_data);
+
+/**
+ * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
+ *        the input arguments (documented below). Refer to arm_depthwise_conv_s8() for function
+ *        argument details.
+ *
+ * @return     The function returns one of the following
+ *                ARM_MATH_SIZE_MISMATCH  - Unsupported dimension of tensors
+ *                ARM_MATH_ARGUMENT_ERROR - Unsupported pad size along the x axis
+ *                ARM_MATH_SUCCESS        - Successful operation
+ *
+ * @details
+ *   - Supported framework : TensorFlow Lite Micro
+ *   - The following constraints on the arguments apply
+ *      -# Number of input channels equals number of output channels
+ *      -# Filter height and width equals 3
+ *      -# Padding along x is either 0 or 1.
+ *
+ */
+arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
+                                     const cmsis_nn_dw_conv_params *dw_conv_params,
+                                     const cmsis_nn_per_channel_quant_params *quant_params,
+                                     const cmsis_nn_dims *input_dims,
+                                     const q7_t *input_data,
+                                     const cmsis_nn_dims *filter_dims,
+                                     const q7_t *filter_data,
+                                     const cmsis_nn_dims *bias_dims,
+                                     const int32_t *bias_data,
+                                     const cmsis_nn_dims *output_dims,
+                                     q7_t *output_data);
+
+/**
+ * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
+ *        Refer to arm_depthwise_conv_s8() for function argument details.
+ *
+ * @return     The function returns one of the following
+ *                ARM_MATH_SIZE_MISMATCH - input channel != output channel or
+ *                                         ch_mult != 1
+ *                ARM_MATH_SUCCESS       - Successful operation
+ *
+ * @note       If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
+ *             out for the following if MVE optimizations (Arm Helium Technology) are used.
+ *               - Output shift
+ *               - Output multiplier
+ *               - Output bias
+ *               - kernel
+ * @details
+ *    - Supported framework: TensorFlow Lite
+ *    - The following constraints on the arguments apply
+ *        -# Number of input channels equals number of output channels or ch_mult equals 1
+ *    - q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
+ *    - Recommended when the number of channels is 4 or greater.
+ *
+ */
+arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
+                                     const cmsis_nn_dw_conv_params *dw_conv_params,
+                                     const cmsis_nn_per_channel_quant_params *quant_params,
+                                     const cmsis_nn_dims *input_dims,
+                                     const q7_t *input_data,
+                                     const cmsis_nn_dims *filter_dims,
+                                     const q7_t *filter_data,
+                                     const cmsis_nn_dims *bias_dims,
+                                     const int32_t *bias_data,
+                                     const cmsis_nn_dims *output_dims,
+                                     q7_t *output_data);
+
+/**
+ * @brief Get the required buffer size for the optimized s8 depthwise convolution
+ *        function with constraint that in_channel equals out_channel.
+ * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
+ *                               Batch argument N is not used.
+ * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
+ * @return          The function returns the required buffer size in bytes
+ *
+ */
+int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
+
+/**
+ * @defgroup FC Fully-connected Layer Functions
+ *
+ * Collection of fully-connected and matrix multiplication functions.
+ *
+ * Fully-connected layer is basically a matrix-vector multiplication
+ * with bias. The matrix is the weights and the input/output vectors
+ * are the activation values. Supported {weight, activation} precisions
+ * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
+ *
+ * Here we have two types of kernel functions: the basic function
+ * implements the function using the regular GEMV approach, while the
+ * opt functions operate on weights in interleaved formats.
+ *
+ */
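+/**
+ * @par GEMV illustration (editorial sketch, not part of the library)
+ * Each output of the basic fully-connected kernels is a dot product of one
+ * weight row with the input vector plus a shifted bias; the names below are
+ * assumptions that mirror the q7 function arguments.
+ * @code
+ * for (int r = 0; r < num_of_rows; r++)
+ * {
+ *     int32_t acc = ((int32_t) bias[r]) << bias_shift;
+ *     for (int i = 0; i < dim_vec; i++)
+ *         acc += weights[r * dim_vec + i] * input[i];
+ *     output[r] = (q7_t) __SSAT(acc >> out_shift, 8);  // saturate to 8 bits
+ * }
+ * @endcode
+ */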
+
+/**
+ * @brief Q7 basic fully-connected layer function
+ * @param[in]       pV          pointer to input vector
+ * @param[in]       pM          pointer to matrix weights
+ * @param[in]       dim_vec     length of the vector
+ * @param[in]       num_of_rows number of rows in weight matrix
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in]       bias        pointer to bias
+ * @param[in,out]   pOut        pointer to output vector
+ * @param[in,out]   vec_buffer  pointer to buffer space for input
+ * @return     The function returns ARM_MATH_SUCCESS
+ *
+ */
+
+arm_status arm_fully_connected_q7(const q7_t *pV,
+                                  const q7_t *pM,
+                                  const uint16_t dim_vec,
+                                  const uint16_t num_of_rows,
+                                  const uint16_t bias_shift,
+                                  const uint16_t out_shift,
+                                  const q7_t *bias,
+                                  q7_t *pOut,
+                                  q15_t *vec_buffer);
+
+/**
+ * @brief Basic s8 Fully Connected function.
+ *
+ * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
+ *                               definition file to see if an additional buffer is required.
+ *                               Optional function {API}_get_buffer_size() provides the buffer
+ *                               size if an additional buffer is required.
+ * @param[in]      fc_params     Fully Connected layer parameters (e.g. strides, dilations, pads,...)
+ *                               Range of fc_params->input_offset  : [-127, 128]
+ *                               fc_params->filter_offset : 0
+ *                               Range of fc_params->output_offset : [-128, 127]
+ * @param[in]      quant_params  Per-tensor quantization info.
+ *                               It contains the multiplier and shift values to be applied to the output tensor.
+ * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ *                               Input dimension is taken as Nx(H * W * C_IN)
+ * @param[in]      input_data    Input (activation) data pointer. Data type: int8
+ * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
+ *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
+ *                               C : output depth and equals C_OUT in output_dims
+ *                               H & W : Not used
+ * @param[in]      filter_data   Filter data pointer. Data type: int8
+ * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
+ *                               N, H, W : Not used
+ * @param[in]      bias_data     Bias data pointer. Data type: int32
+ * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
+ *                               N : Batches
+ *                               C_OUT : Output depth
+ *                               H & W : Not used.
+ * @param[in, out] output_data   Output data pointer. Data type: int8
+ * @return     The function returns ARM_MATH_SUCCESS
+ *
+ * @details
+ *    - Supported framework: TensorFlow Lite
+ *    - q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
+ */
+arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
+                                  const cmsis_nn_fc_params *fc_params,
+                                  const cmsis_nn_per_tensor_quant_params *quant_params,
+                                  const cmsis_nn_dims *input_dims,
+                                  const q7_t *input_data,
+                                  const cmsis_nn_dims *filter_dims,
+                                  const q7_t *filter_data,
+                                  const cmsis_nn_dims *bias_dims,
+                                  const int32_t *bias_data,
+                                  const cmsis_nn_dims *output_dims,
+                                  q7_t *output_data);
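+/**
+ * @par Editorial sketch (not part of the original header)
+ * Dimension bookkeeping for arm_fully_connected_s8 with an assumed batch of
+ * one and 64 inputs mapped to 10 outputs; all values are illustrative.
+ * @code
+ * cmsis_nn_dims input_dims  = {.n = 1, .h = 1, .w = 1, .c = 64};
+ * cmsis_nn_dims filter_dims = {.n = 64, .c = 10};  // n = H*W*C_IN, c = C_OUT
+ * cmsis_nn_dims output_dims = {.n = 1, .c = 10};
+ *
+ * cmsis_nn_context ctx;
+ * ctx.size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
+ * ctx.buf  = (ctx.size > 0) ? malloc(ctx.size) : NULL;
+ * @endcode
+ */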
+ */ +arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx, + const cmsis_nn_fc_params *fc_params, + const cmsis_nn_per_tensor_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required buffer size for S8 basic fully-connected and + * matrix multiplication layer function for TF Lite + * @param[in] filter_dims dimension of filter + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); + +/** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + */ + +arm_status arm_fully_connected_q7_opt(const q7_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Q15 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + */ + +arm_status arm_fully_connected_q15(const q15_t *pV, + const q15_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t *bias, + q15_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + */ + +arm_status arm_fully_connected_q15_opt(const q15_t *pV, + const q15_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t *bias, + q15_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * 
@param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + */ + +arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q15_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + */ + +arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q15_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Matrix-Multiplication Kernels for Convolution + * + * These functions are used within convolution layer functions for + * matrix multiplication. + * + * The implementation is similar to the CMSIS-DSP arm_mat_mult functions + * with one Q7 and one Q15 operand. The Q15 operand is the im2col + * output, which always has 2 columns. + * + */ + +/** + * @brief Matrix-multiplication function for convolution + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + */ + +q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut); +/** + * @brief Matrix-multiplication function for convolution with per-channel requantization. + * @param[in] input_a pointer to operand A + * @param[in] input_b pointer to operand B, always consists of 2 vectors. + * @param[in] output_ch number of rows of A + * @param[in] out_shift pointer to per output channel requantization shift parameter. + * @param[in] out_mult pointer to per output channel requantization multiplier parameter. + * @param[in] out_offset output tensor offset. + * @param[in] activation_min minimum value to clamp the output to. Range : int8 + * @param[in] activation_max maximum value to clamp the output to. Range : int8 + * @param[in] num_col_a number of columns of A + * @param[in] output_bias per output channel bias. Range : int32 + * @param[in,out] out_0 pointer to output + * @return The function returns one of the two + * 1. The incremented output pointer for a successful operation or + * 2. NULL if implementation is not available. + * + * @details This function does the matrix multiplication of weight matrix for all output channels + * with 2 columns from im2col and produces two elements/output_channel.
The outputs are + * clamped in the range provided by activation min and max. + * Supported framework: TensorFlow Lite micro. + */ +q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0); + +/** + * @brief Matrix-multiplication of re-ordered input B with A. + * + * @details For arguments, refer to arm_nn_mat_mult_kernel_s8_s16. The re-ordering is a consequence + * of the sign extension done by the SXTB16 instruction on input_b. The outputs are clamped in the range + * provided by activation min and max. + * - Supported framework : TensorFlow Lite Micro + * - The following constraints on the arguments apply + * -# num_col_a is a multiple of 4 + * -# output_ch is a multiple of 2 + * + */ +q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0); + +/** + * @brief Matrix-multiplication function for convolution with reordered columns + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + * + * @details This function assumes that data in pInBuffer are reordered + */ +q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut); + +#ifdef __cplusplus +} +#endif + +/* + * Other functions + * These layers are typically not timing critical + * Basic implementation is supported here + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @defgroup BasicMath Basic math functions + * + * Element-wise add and multiplication functions. + * + */ + +/** + * @brief s8 element-wise add of two vectors + * @param[in] input_1_vect pointer to input vector 1 + * @param[in] input_2_vect pointer to input vector 2 + * @param[in] input_1_offset offset for input 1. Range: -127 to 128 + * @param[in] input_1_mult multiplier for input 1 + * @param[in] input_1_shift shift for input 1 + * @param[in] input_2_offset offset for input 2.
Range: -127 to 128 + * @param[in] input_2_mult multiplier for input 2 + * @param[in] input_2_shift shift for input 2 + * @param[in] left_shift input left shift + * @param[in,out] output pointer to output vector + * @param[in] out_offset output offset + * @param[in] out_mult output multiplier + * @param[in] out_shift output shift + * @param[in] out_activation_min minimum value to clamp output to + * @param[in] out_activation_max maximum value to clamp output to + * @param[in] block_size number of samples + * @return The function returns ARM_MATH_SUCCESS + */ +arm_status arm_elementwise_add_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_1_mult, + const int32_t input_1_shift, + const int32_t input_2_offset, + const int32_t input_2_mult, + const int32_t input_2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const uint32_t block_size); + +/** + * @brief s8 element-wise multiplication + * @param[in] input_1_vect pointer to input vector 1 + * @param[in] input_2_vect pointer to input vector 2 + * @param[in] input_1_offset offset for input 1. Range: -127 to 128 + * @param[in] input_2_offset offset for input 2. Range: -127 to 128 + * @param[in,out] output pointer to output vector + * @param[in] out_offset output offset + * @param[in] out_mult output multiplier + * @param[in] out_shift output shift + * @param[in] out_activation_min minimum value to clamp output to + * @param[in] out_activation_max maximum value to clamp output to + * @param[in] block_size number of samples + * @return The function returns ARM_MATH_SUCCESS + * + * @details Supported framework: TensorFlow Lite micro + */ +arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const uint32_t block_size); +/** + * @defgroup Acti Activation Functions + * + * Perform activation layers, including ReLU (Rectified Linear Unit), + * sigmoid and tanh + * + */ + +/** + * @brief Q7 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + +void arm_relu_q7(q7_t *data, uint16_t size); + +/** + * @brief s8 ReLU6 function + * @param[in,out] data pointer to input + * @param[in] size number of elements + */ + +void arm_relu6_s8(q7_t *data, uint16_t size); + +/** + * @brief Q15 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + +void arm_relu_q15(q15_t *data, uint16_t size); + +/** + * @brief Q7 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assumed to be smaller than 3 + * @param[in] type type of activation functions + * @return none.
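+ *
+ * @par
+ * A minimal in-place call (an editor's sketch; the buffer size and contents are
+ * arbitrary, and ARM_SIGMOID is the arm_nn_activation_type enumerator that
+ * selects the sigmoid table):
+ * @code
+ *  q7_t act[32] = {0}; // filled by the previous layer in practice; q2.5 format
+ *  arm_nn_activations_direct_q7(act, 32, 2, ARM_SIGMOID); // modified in place
+ * @endcode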
+ */ + +void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type); + +/** + * @brief Q15 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assumed to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + * + * @details + * + * This is the direct table look-up approach. + * + * The integer part of the fixed-point input is assumed to be <= 3. A wider + * integer part makes little difference here: with saturation followed by any of + * these activation functions, the result is the same. + */ + +void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type); + +/** + * @defgroup Pooling Pooling Functions + * + * Perform pooling functions, including max pooling and average pooling + * + */ + +/** + * @brief Q7 max pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + */ + +void arm_maxpool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out); + +/** + * @brief Q7 average pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + */ + +void arm_avepool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out); + +/** + * @brief s8 average pooling function. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * @param[in] pool_params Pooling parameters + * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] + * Argument 'N' is not used. + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] + * Argument N and C are not used. + * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] + * Argument N is not used. + * C_OUT equals C_IN. + * @param[in, out] output_data Output data pointer.
Data type: int8 + * @return The function returns + * ARM_MATH_SUCCESS - Successful operation + * + * @details + * - Supported Framework: TensorFlow Lite + * + */ +arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required buffer size for S8 average pooling function + * @param[in] dim_dst_width output tensor dimension + * @param[in] ch_src number of input tensor channels + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src); + +/** + * @brief s8 max pooling function. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * @param[in] pool_params Pooling parameters + * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] + * Argument 'N' is not used. + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] + * Argument N and C are not used. + * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] + * Argument N is not used. + * C_OUT equals C_IN. + * @param[in, out] output_data Output data pointer. Data type: int8 + * @return The function returns + * ARM_MATH_SUCCESS - Successful operation + * + * @details + * - Supported Framework: TensorFlow Lite + * + */ +arm_status arm_max_pool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *output_data); +/** + * @defgroup Softmax Softmax Functions + * + * EXP(2) based softmax functions. + * + */ + +/** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * + * @note This function is an optimized version which is not bit-accurate with + * TensorFlow Lite's kernel + * + */ + +void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); + +/** + * @brief Q7 softmax function with batch parameter + * @param[in] vec_in pointer to input vector + * @param[in] nb_batches number of batches + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * @return none. + * + * @note This function is an optimized version which is not bit-accurate with + * TensorFlow Lite's kernel + * + */ + +void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out); +/** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * @return none. 
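+ *
+ * @par
+ * A minimal call (an editor's illustration; the vector length and contents are
+ * arbitrary):
+ * @code
+ *  q15_t logits[10] = {0}; // raw scores from the previous layer in practice
+ *  q15_t probs[10];
+ *  arm_softmax_q15(logits, 10, probs);
+ * @endcode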
+ * + * @note This function is an optimized version which is not bit-accurate with + * TensorFlow Lite's kernel + * + */ + +void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); + +/** + * @brief S8 softmax function + * @param[in] input Pointer to the input tensor + * @param[in] num_rows Number of rows in the input tensor + * @param[in] row_size Number of elements in each input row + * @param[in] mult Input quantization multiplier + * @param[in] shift Input quantization shift within the range [0, 31] + * @param[in] diff_min Minimum difference with max in row. Used to check if + * the quantized exponential operation can be performed + * @param[out] output Pointer to the output tensor + * + * @note Supported framework: TensorFlow Lite micro (bit-accurate) + * + */ + +void arm_softmax_s8(const int8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output); + +/** + * @brief U8 softmax function + * @param[in] input Pointer to the input tensor + * @param[in] num_rows Number of rows in the input tensor + * @param[in] row_size Number of elements in each input row + * @param[in] mult Input quantization multiplier + * @param[in] shift Input quantization shift within the range [0, 31] + * @param[in] diff_min Minimum difference with max in row. Used to check if + * the quantized exponential operation can be performed + * @param[out] output Pointer to the output tensor + * + * @note Supported framework: TensorFlow Lite micro (bit-accurate) + * + */ + +void arm_softmax_u8(const uint8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + uint8_t *output); + +/** + * @brief uint8 depthwise convolution function with asymmetric quantization + * Unless specified otherwise, arguments are mandatory. + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_ch Channels in input tensor + * @param[in] kernel Pointer to kernel weights + * @param[in] kernel_x Width of kernel + * @param[in] kernel_y Height of kernel + * @param[in] ch_mult Channel multiplier + * @param[in] pad_x Padding sizes x + * @param[in] pad_y Padding sizes y + * @param[in] stride_x stride along the width + * @param[in] stride_y stride along the height + * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement. + * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement. + * @param[in] bias Pointer to optional bias values. If no bias is + * available, NULL is expected + * @param[in] input_offset Input tensor zero offset + * @param[in] filter_offset Kernel tensor zero offset + * @param[in] output_offset Output tensor zero offset + * @param[in,out] output Pointer to output tensor + * @param[in] output_x Width of output tensor + * @param[in] output_y Height of output tensor + * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255} + * @param[in] output_activation_max Maximum value to clamp the output to.
Range : {0, 255} + * @param[in] out_shift Amount of right-shift for output + * @param[in] out_mult Output multiplier for requantization + * @return The function returns the following + * ARM_MATH_SUCCESS - Successful operation + * + */ +arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const uint8_t *kernel, + const uint16_t kernel_x, + const uint16_t kernel_y, + const int16_t ch_mult, + const int16_t pad_x, + const int16_t pad_y, + const int16_t stride_x, + const int16_t stride_y, + const int16_t dilation_x, + const int16_t dilation_y, + const int32_t *bias, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_offset, + uint8_t *output, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t out_shift, + const int32_t out_mult); + +/** + * @defgroup Reshape Reshape Functions + * + */ + +/** + * @brief Reshape an s8 vector into another with a different shape + * @param[in] input points to the s8 input vector + * @param[out] output points to the s8 output vector + * @param[in] total_size total size of the input and output vectors in bytes + * + * @note The output is expected to be in a memory area that does not overlap with the input's + * + */ +void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size); + +/** + * @defgroup Concatenation Concatenation Functions + * + */ + +/** + * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis + * This function should be called for each input tensor to concatenate. The argument offset_x + * will be used to store the input tensor in the correct position in the output tensor + * + * i.e. offset_x = 0 + * for (i = 0; i < num_input_tensors; ++i) + * { + * arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x) + * offset_x += input_x[i] + * } + * + * This function assumes that the output tensor has: + * -# The same height as the input tensor + * -# The same number of channels as the input tensor + * -# The same batch size as the input tensor + * + * Unless specified otherwise, arguments are mandatory. + * + * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors, + * since it does not involve any arithmetic operation + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_z Channels in input tensor + * @param[in] input_w Batch size in input tensor + * @param[out] output Pointer to output tensor + * @param[in] output_x Width of output tensor + * @param[in] offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor + * It is the user's responsibility to provide the correct value + * + * Input constraints: + * offset_x is less than output_x + * + */ +void arm_concatenation_s8_x(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_x, + const uint32_t offset_x); + +/** + * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis + * This function should be called for each input tensor to concatenate. The argument offset_y + * will be used to store the input tensor in the correct position in the output tensor + * + * i.e.
offset_y = 0 + * for (i = 0; i < num_input_tensors; ++i) + * { + * arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y) + * offset_y += input_y[i] + * } + * + * This function assumes that the output tensor has: + * -# The same width as the input tensor + * -# The same number of channels as the input tensor + * -# The same batch size as the input tensor + * + * Unless specified otherwise, arguments are mandatory. + * + * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors, + * since it does not involve any arithmetic operation + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_z Channels in input tensor + * @param[in] input_w Batch size in input tensor + * @param[out] output Pointer to output tensor + * @param[in] output_y Height of output tensor + * @param[in] offset_y The offset on the Y axis to start concatenating the input tensor + * It is the user's responsibility to provide the correct value + * + * Input constraints: + * offset_y is less than output_y + * + */ +void arm_concatenation_s8_y(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_y, + const uint32_t offset_y); + +/** + * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis + * This function should be called for each input tensor to concatenate. The argument offset_z + * will be used to store the input tensor in the correct position in the output tensor + * + * i.e. offset_z = 0 + * for (i = 0; i < num_input_tensors; ++i) + * { + * arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z) + * offset_z += input_z[i] + * } + * + * This function assumes that the output tensor has: + * -# The same width as the input tensor + * -# The same height as the input tensor + * -# The same batch size as the input tensor + * + * Unless specified otherwise, arguments are mandatory. + * + * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors, + * since it does not involve any arithmetic operation + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_z Channels in input tensor + * @param[in] input_w Batch size in input tensor + * @param[out] output Pointer to output tensor + * @param[in] output_z Channels in output tensor + * @param[in] offset_z The offset on the Z axis to start concatenating the input tensor + * It is the user's responsibility to provide the correct value + * + * Input constraints: + * offset_z is less than output_z + * + */ +void arm_concatenation_s8_z(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_z, + const uint32_t offset_z); + +/** + * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size) + * This function should be called for each input tensor to concatenate. The argument offset_w + * will be used to store the input tensor in the correct position in the output tensor + * + * i.e.
offset_w = 0 + * for (i = 0; i < num_input_tensors; ++i) + * { + * arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w) + * offset_w += input_w[i] + * } + * + * This function assumes that the output tensor has: + * -# The same width as the input tensor + * -# The same height as the input tensor + * -# The same number of channels as the input tensor + * + * Unless specified otherwise, arguments are mandatory. + * + * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors, + * since it does not involve any arithmetic operation + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_z Channels in input tensor + * @param[in] input_w Batch size in input tensor + * @param[out] output Pointer to output tensor + * @param[in] offset_w The offset on the W axis to start concatenating the input tensor + * It is the user's responsibility to provide the correct value + * + */ +void arm_concatenation_s8_w(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint32_t offset_w); +/** + * @defgroup SVDF SVDF Layer Functions + * + */ + +/** + * @brief s8 SVDF function + * + * @param[in] input_ctx Temporary scratch buffer + * @param[in] output_ctx Temporary output scratch buffer + * @param[in] svdf_params SVDF Parameters + * Range of svdf_params->input_offset : [-128, 127] + * Range of svdf_params->output_offset : [-128, 127] + * @param[in] input_quant_params Input quantization parameters + * @param[in] output_quant_params Output quantization parameters + * @param[in] input_dims Input tensor dimensions + * @param[in] input_data Pointer to input tensor + * @param[in] state_dims State tensor dimensions + * @param[in] state_data Pointer to state tensor + * @param[in] weights_feature_dims Weights (feature) tensor dimensions + * @param[in] weights_feature_data Pointer to the weights (feature) tensor + * @param[in] weights_time_dims Weights (time) tensor dimensions + * @param[in] weights_time_data Pointer to the weights (time) tensor + * @param[in] bias_dims Bias tensor dimensions + * @param[in] bias_data Pointer to bias tensor + * @param[in] output_dims Output tensor dimensions + * @param[out] output_data Pointer to the output tensor + * + * @return The function returns ARM_MATH_SUCCESS + * + * @details + * 1. Supported framework: TensorFlow Lite micro + * 2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
+ * + */ +arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx, + const cmsis_nn_context *output_ctx, + const cmsis_nn_svdf_params *svdf_params, + const cmsis_nn_per_tensor_quant_params *input_quant_params, + const cmsis_nn_per_tensor_quant_params *output_quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *state_dims, + q15_t *state_data, + const cmsis_nn_dims *weights_feature_dims, + const q7_t *weights_feature_data, + const cmsis_nn_dims *weights_time_dims, + const q15_t *weights_time_data, + const cmsis_nn_dims *bias_dims, + const q31_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nnsupportfunctions.h b/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nnsupportfunctions.h new file mode 100644 index 000000000..74888b533 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Include/arm_nnsupportfunctions.h @@ -0,0 +1,973 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nnsupportfunctions.h + * Description: Public header file of support functions for CMSIS NN Library + * + * $Date: 15. April 2021 + * $Revision: V.5.5.0 + * + * Target Processor: Cortex-M CPUs + * -------------------------------------------------------------------- */ + +#ifndef _ARM_NNSUPPORTFUNCTIONS_H_ +#define _ARM_NNSUPPORTFUNCTIONS_H_ + +#include "arm_common_tables.h" +#include "arm_math_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0) +#define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift) +#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0 +#define MASK_IF_NON_ZERO(x) (x) != 0 ? ~0 : 0 +#define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b)) + +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MIN(A, B) ((A) < (B) ? 
(A) : (B)) +#define CLAMP(x, h, l) MAX(MIN((x), (h)), (l)) + +/** + * @brief Union for SIMD access of q31/q15/q7 types + */ +union arm_nnword +{ + q31_t word; + /**< q31 type */ + q15_t half_words[2]; + /**< q15 type */ + q7_t bytes[4]; + /**< q7 type */ +}; + +/** + * @brief Union for data type long long + */ +struct arm_nn_double +{ + uint32_t low; + int32_t high; +}; + +union arm_nn_long_long +{ + int64_t long_long; + struct arm_nn_double word; +}; + +/** + * @defgroup nndata_convert Neural Network Data Conversion Functions + * + * Perform data type conversion in-between neural network operations + * + */ + +/** + * @brief Converts the elements of the q7 vector to q15 vector without left-shift + * @param[in] *pSrc points to the q7 input vector + * @param[out] *pDst points to the q15 output vector + * @param[in] blockSize length of the input vector + * + */ +void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize); + +/** + * @brief Non-saturating addition of elements of a q7 vector + * @param[in] *input Pointer to the q7 input vector + * @param[out] *output Pointer to the q31 output variable. + * @param[in] block_size length of the input vector + * \par Description: + * + * 2^24 samples can be added without saturating the result. + * + * The equation used for the conversion process is: + * + *
+ * <pre>
+ *  sum = input[0] + input[1] + .. + input[block_size - 1]
+ * </pre>
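+ *
+ * For example (an editor's illustration; values are arbitrary):
+ * @code
+ *  q7_t in[4] = {1, 2, 3, 4};
+ *  q31_t sum;
+ *  arm_nn_add_q7(in, &sum, 4); // sum holds 10 per the equation above
+ * @endcode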
+ * + * */ +void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size); + +/** + * @brief Converts the elements of the q7 vector to reordered q15 vector without left-shift + * @param[in] *pSrc points to the q7 input vector + * @param[out] *pDst points to the q15 output vector + * @param[in] blockSize length of the input vector + * @return none. + * + */ +void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize); + +/** + * @brief Converts the elements from a q7 vector to a q15 vector with an added offset + * @param[in] src pointer to the q7 input vector + * @param[out] dst pointer to the q15 output vector + * @param[in] block_size length of the input vector + * @param[in] offset q7 offset to be added to each input vector element. + * + * \par Description: + * + * The equation used for the conversion process is: + * + *
+ * <pre>
+ *  dst[n] = (q15_t) src[n] + offset;   0 <= n < block_size.
+ * </pre>
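+ *
+ * For example, with offset = 128 (an editor's illustration; values are arbitrary):
+ * @code
+ *  q7_t in[4] = {-128, -1, 0, 127};
+ *  q15_t out[4];
+ *  arm_q7_to_q15_with_offset(in, out, 4, 128); // out = {0, 127, 128, 255}
+ * @endcode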
+ * + */ +void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset); + +/** + * @brief Converts the elements of the q7 vector to reordered q15 vector with an added offset + * @param[in] src pointer to the q7 input vector + * @param[out] dst pointer to the q15 output vector + * @param[in] block_size length of the input vector + * @param[in] offset offset to be added to each input vector element. + * @return none. + * + * @details This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of + * the sign extension intrinsic (DSP extension). The tail (i.e., last (N % 4) elements) retains its + * original order. + * + */ +void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset); + +/** + * @brief Converts the elements of a q7 vector and accumulates them into a q15 vector + * @param[in] *src points to the q7 input vector + * @param[out] *dst points to the q15 output vector + * @param[in] block_size length of the input vector + * + * \par Description: + * + * The equation used for the conversion process is: + * + *
+ * <pre>
+ *  dst[n] += (q15_t) src[n];   0 <= n < block_size.
+ * </pre>
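+ *
+ * For example (an editor's illustration; values are arbitrary):
+ * @code
+ *  q7_t src[4] = {1, 2, 3, 4};
+ *  q15_t acc[4] = {100, 100, 100, 100};
+ *  arm_nn_accumulate_q7_to_q15(acc, src, 4); // acc becomes {101, 102, 103, 104}
+ * @endcode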
+ * + */ +void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size); + +/** + * @brief Depthwise conv on an im2col buffer where the input channel equals output channel. + * @param[in] row pointer to row + * @param[in] col pointer to im2col buffer, always consists of 2 columns. + * @param[in] num_ch number of channels + * @param[in] out_shift pointer to per output channel requantization shift parameter. + * @param[in] out_mult pointer to per output channel requantization multiplier parameter. + * @param[in] out_offset output tensor offset. + * @param[in] activation_min minimum value to clamp the output to. Range : int8 + * @param[in] activation_max maximum value to clamp the output to. Range : int8 + * @param[in] kernel_size number of elements in one column. + * @param[in] output_bias per output channel bias. Range : int32 + * @param[out] out pointer to output + * @return The function returns one of the two + * 1. The incremented output pointer for a successful operation or + * 2. NULL if implementation is not available. + * + * @details Supported framework: TensorFlow Lite micro. + */ +q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, + const q15_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + q7_t *out); + +/** + * @brief General Matrix-multiplication function with per-channel requantization. + * @param[in] input_row pointer to row operand + * @param[in] input_col pointer to col operand + * @param[in] output_ch number of rows of input_row + * @param[in] col_batches number of column batches. Range: 1 to 4 + * @param[in] output_shift pointer to per output channel requantization shift parameter. + * @param[in] output_mult pointer to per output channel requantization multiplier parameter. + * @param[in] out_offset output tensor offset. + * @param[in] col_offset input tensor (col) offset. + * @param[in] row_offset kernel offset (row). Not used. + * @param[in] out_activation_min minimum value to clamp the output to. Range : int8 + * @param[in] out_activation_max maximum value to clamp the output to. Range : int8 + * @param[in] row_len number of elements in each row + * @param[in] bias per output channel bias. Range : int32 + * @param[in,out] out pointer to output + * @return The function returns one of the two + * 1. The incremented output pointer for a successful operation or + * 2. NULL if implementation is not available. + * + * @details Supported framework: TensorFlow Lite + */ +q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, + const q7_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t out_activation_min, + const int16_t out_activation_max, + const uint16_t row_len, + const int32_t *const bias, + q7_t *out); + +/** + * @brief General Matrix-multiplication without requantization for one row & one column + * @param[in] row_elements number of row elements + * @param[in] row_base pointer to row operand + * @param[in] col_base pointer to col operand + * @param[out] sum_col pointer to store sum of column elements + * @param[out] output pointer to store result of multiply-accumulate + * @return The function returns ARM_MATH_SUCCESS; the multiply-accumulated result of the row by column is written to 'output'.
+ * + * @details Pseudo-code + * *output = 0 + * sum_col = 0 + * for (i = 0; i < row_elements; i++) + * *output += row_base[i] * col_base[i] + * sum_col += col_base[i] + * + */ +arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, + const int8_t *row_base, + const int8_t *col_base, + int32_t *const sum_col, + int32_t *const output); + +/** + * @brief General Matrix-multiplication without requantization for four rows and one column + * @param[in] row_elements number of row elements + * @param[in] offset offset between rows. Can be the same as row_elements. + * E.g., in a 1x1 convolution scenario with a stride of 1. + * @param[in] row_base pointer to row operand + * @param[in] col_base pointer to col operand + * @param[out] sum_col pointer to store sum of column elements + * @param[out] output pointer to store result (4 int32's) of multiply-accumulate + * @return The function returns ARM_MATH_SUCCESS; the multiply-accumulated results are written to 'output' + * + * @details Pseudo-code + * output[0] = 0 + * .. + * output[3] = 0 + * sum_col = 0 + * for (i = 0; i < row_elements; i++) + * output[0] += row_base[i] * col_base[i] + * .. + * output[3] += row_base[i + (row_elements * 3)] * col_base[i] + * sum_col += col_base[i] + */ +arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, + const int32_t offset, + const int8_t *row_base, + const int8_t *col_base, + int32_t *const sum_col, + int32_t *const output); + +/** + * @brief General Matrix-multiplication function with per-channel requantization. + * This function assumes: + * - LHS input matrix NOT transposed (nt) + * - RHS input matrix transposed (t) + * + * @note This operation also performs the broadcast bias addition before the requantization + * + * @param[in] lhs Pointer to the LHS input matrix + * @param[in] rhs Pointer to the RHS input matrix + * @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of + * output columns (or RHS input rows) + * @param[out] dst Pointer to the output matrix with "m" rows and "n" columns + * @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization. + * The length of this vector is equal to the number of output columns (or RHS input + * rows) + * @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length + * of this vector is equal to the number of output columns (or RHS input rows) + * @param[in] lhs_rows Number of LHS input rows + * @param[in] rhs_rows Number of RHS input rows + * @param[in] rhs_cols Number of LHS/RHS input columns + * @param[in] lhs_offset Offset to be applied to the LHS input value + * @param[in] dst_offset Offset to be applied to the output result + * @param[in] activation_min Minimum value to clamp the output to. Range : int8 + * @param[in] activation_max Maximum value to clamp the output to.
Range : int8 + * + * @return The function returns ARM_MATH_SUCCESS + * + */ +arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, + const q7_t *rhs, + const q31_t *bias, + q7_t *dst, + const int32_t *dst_multipliers, + const int32_t *dst_shifts, + const int32_t lhs_rows, + const int32_t rhs_rows, + const int32_t rhs_cols, + const int32_t lhs_offset, + const int32_t dst_offset, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief s8 Vector by Matrix (transposed) multiplication + * + * @param[in] lhs Input left-hand side vector + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[in] bias Input bias + * @param[out] dst Output vector + * @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector. + * Range: -127 to 128 + * @param[in] rhs_offset Not used + * @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128 + * @param[in] dst_multiplier Output multiplier + * @param[in] dst_shift Output shift + * @param[in] rhs_cols Number of columns in the right-hand side input matrix + * @param[in] rhs_rows Number of rows in the right-hand side input matrix + * @param[in] activation_min Minimum value to clamp the output to. Range: int8 + * @param[in] activation_max Maximum value to clamp the output to. Range: int8 + * + * @return The function returns ARM_MATH_SUCCESS + * + */ +arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, + const q7_t *rhs, + const q31_t *bias, + q7_t *dst, + const int32_t lhs_offset, + const int32_t rhs_offset, + const int32_t dst_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief s8 Vector by Matrix (transposed) multiplication with s16 output + * + * @param[in] lhs Input left-hand side vector + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[out] dst Output vector + * @param[in] lhs_offset Offset to be added to the input values of the left-hand side + * vector. Range: -127 to 128 + * @param[in] rhs_offset Not used + * @param[in] scatter_offset Address offset for dst. First output is stored at 'dst', the + * second at 'dst + scatter_offset' and so on. + * @param[in] dst_multiplier Output multiplier + * @param[in] dst_shift Output shift + * @param[in] rhs_cols Number of columns in the right-hand side input matrix + * @param[in] rhs_rows Number of rows in the right-hand side input matrix + * @param[in] activation_min Minimum value to clamp the output to. Range: int16 + * @param[in] activation_max Maximum value to clamp the output to. Range: int16 + * + * @return The function returns ARM_MATH_SUCCESS + * + */ +arm_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, + const q7_t *rhs, + q15_t *dst, + const int32_t lhs_offset, + const int32_t rhs_offset, + const int32_t scatter_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where + * the padding is -lhs_offset (Range: int8). Dimensions are the same for lhs and rhs. + * + * @param[in] lhs Input left-hand side matrix + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[in] lhs_offset LHS matrix offset (input offset).
Range: -127 to 128 + * @param[in] num_ch Number of channels in LHS/RHS + * @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels + * @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels + * @param[in] out_offset Offset to be added to the output values. Range: -127 to 128 + * @param[in] activation_min Minimum value to clamp the output to. Range: int8 + * @param[in] activation_max Maximum value to clamp the output to. Range: int8 + * @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix + * @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels + * @param[in] out Output pointer + * + * @return The function returns one of the two + * - Updated output pointer if an implementation is available + * - NULL if no implementation is available. + * + * @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read + * out for the following. + * - Output shift + * - Output multiplier + * - Output bias + * - rhs + */ +q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, + const q7_t *rhs, + const int32_t lhs_offset, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + q7_t *out); + +/** + * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. + * Dimensions are the same for lhs and rhs. + * + * @param[in] lhs Input left-hand side matrix + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[in] lhs_offset LHS matrix offset (input offset). Range: -127 to 128 + * @param[in] num_ch Number of channels in LHS/RHS + * @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels. + * @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels. + * @param[in] out_offset Offset to be added to the output values. Range: -127 to 128 + * @param[in] activation_min Minimum value to clamp the output to. Range: int8 + * @param[in] activation_max Maximum value to clamp the output to. Range: int8 + * @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix + * @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels. + * @param[in] out Output pointer + * + * @return The function returns one of the two + * - Updated output pointer if an implementation is available + * - NULL if no implementation is available. + * + * @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read + * out for the following. + * - Output shift + * - Output multiplier + * - Output bias + * - rhs + */ +q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, + const q7_t *rhs, + const int32_t lhs_offset, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + q7_t *out); + +/** + @brief Read 2 q15 elements and post increment pointer. + @param[in] in_q15 Pointer to pointer that holds address of input.
+ @return q31 value + */ +__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15) +{ + q31_t val; + + memcpy(&val, *in_q15, 4); + *in_q15 += 2; + + return (val); +} + +/** + @brief Read 4 q7 from q7 pointer and post increment pointer. + @param[in] in_q7 Pointer to pointer that holds address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7) +{ + q31_t val; + memcpy(&val, *in_q7, 4); + *in_q7 += 4; + + return (val); +} + +/** + @brief Read 2 q15 from q15 pointer. + @param[in] in_q15 pointer to address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15) +{ + q31_t val; + memcpy(&val, in_q15, 4); + + return (val); +} + +/** + @brief Read 4 q7 values. + @param[in] in_q7 pointer to address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7) +{ + q31_t val; + memcpy(&val, in_q7, 4); + + return (val); +} + +/** + * @brief memset optimized for MVE + * @param[in, out] dst Destination pointer + * @param[in] val Value to set + * @param[in] block_size Number of bytes to set. + * + */ +__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t block_size) +{ +#if defined(ARM_MATH_MVEI) + __asm volatile(" vdup.8 q0, %[set_val] \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vstrb.8 q0, [%[in]], 16 \n" + " letp lr, 2b \n" + "1: \n" + : [ in ] "+r"(dst) + : [ cnt ] "r"(block_size), [ set_val ] "r"(val) + : "q0", "memory", "r14"); +#else + memset(dst, val, block_size); +#endif +} + +#if defined(ARM_MATH_DSP) + +/** + * @brief read and expand one q7 word into two q15 words + */ + +__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t *out1, q31_t *out2) +{ + q31_t inA = arm_nn_read_q7x4_ia(&source); + q31_t inAbuf1 = __SXTB16(__ROR((uint32_t)inA, 8)); + q31_t inAbuf2 = __SXTB16(inA); + +#ifndef ARM_MATH_BIG_ENDIAN + *out2 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16)); + *out1 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16)); +#else + *out1 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16)); + *out2 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16)); +#endif + + return source; +} + +/** + * @brief read and expand one q7 word into two q15 words with reordering + */ + +__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t *out1, q31_t *out2) +{ + q31_t inA = arm_nn_read_q7x4_ia(&source); +#ifndef ARM_MATH_BIG_ENDIAN + *out2 = __SXTB16(__ROR((uint32_t)inA, 8)); + *out1 = __SXTB16(inA); +#else + *out1 = __SXTB16(__ROR((uint32_t)inA, 8)); + *out2 = __SXTB16(inA); +#endif + + return source; +} + +/** + * @brief read and expand one q7 word into two q15 words with reordering and add an offset + */ +__STATIC_FORCEINLINE const q7_t * +read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, q31_t offset) +{ + q31_t inA = arm_nn_read_q7x4_ia(&source); + +#ifndef ARM_MATH_BIG_ENDIAN + *out2 = __SXTB16(__ROR((uint32_t)inA, 8)); + *out1 = __SXTB16(inA); +#else + *out1 = __SXTB16(__ROR((uint32_t)inA, 8)); + *out2 = __SXTB16(inA); +#endif + *out1 = __QADD16(*out1, offset); + *out2 = __QADD16(*out2, offset); + + return source; +} + +#endif + +/** + * @defgroup NNBasicMath Basic Math Functions for Neural Network Computation + * + * Basic Math Functions for Neural Network Computation + * + */ + +/** + * @brief q15 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer
to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable q15 range [0x8000, 0x7FFF] will be saturated. + */ + +void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize); + +/** + * @brief q7 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable q7 range [0x80, 0x7F] will be saturated. + */ + +void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize); + +/** + * @brief macro for adding rounding offset + */ +#ifndef ARM_NN_TRUNCATE +#define NN_ROUND(out_shift) ((0x1u << out_shift) >> 1) +#else +#define NN_ROUND(out_shift) 0 +#endif + +// Macros for shortening quantization functions' names and avoiding long lines +#define MUL_SAT(a, b) arm_nn_doubling_high_mult((a), (b)) +#define MUL_SAT_MVE(a, b) arm_doubling_high_mult_mve_32x4((a), (b)) +#define MUL_POW2(a, b) arm_nn_mult_by_power_of_two((a), (b)) + +#define DIV_POW2(a, b) arm_nn_divide_by_power_of_two((a), (b)) +#define DIV_POW2_MVE(a, b) arm_divide_by_power_of_two_mve((a), (b)) + +#define EXP_ON_NEG(x) arm_nn_exp_on_negative_values((x)) +#define ONE_OVER1(x) arm_nn_one_over_one_plus_x_for_x_in_0_1((x)) + +/** + * @brief Saturating doubling high multiply. Result matches + * NEON instruction VQRDMULH. + * @param[in] m1 Multiplicand. Range: {Q31_MIN, Q31_MAX} + * @param[in] m2 Multiplier. Range: {Q31_MIN, Q31_MAX} + * @return Result of multiplication. + * + */ +__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t m2) +{ + q31_t result = 0; + // Rounding offset to add for a right shift of 31 + q63_t mult = 1 << 30; + + if ((m1 < 0) ^ (m2 < 0)) + { + mult = 1 - mult; + } + // Gets resolved as a SMLAL instruction + mult = mult + (q63_t)m1 * m2; + + // Utilize all of the upper 32 bits. This is the doubling step + // as well. + result = (int32_t)(mult / (1ll << 31)); + + if ((m1 == m2) && (m1 == (int32_t)Q31_MIN)) + { + result = Q31_MAX; + } + return result; +} + +/** + * @brief Doubling high multiply without saturation. This is intended + * for requantization where the scale is a positive integer + * + * @param[in] m1 Multiplicand. Range: {Q31_MIN, Q31_MAX} + * @param[in] m2 Multiplier Range: {Q31_MIN, Q31_MAX} + * @return Result of multiplication. + * @note The result of this matches that of neon instruction + * VQRDMULH for m1 in range {Q31_MIN, Q31_MAX} and m2 in + * range {Q31_MIN + 1, Q31_MAX}. Saturation occurs when + * m1 equals m2 equals Q31_MIN and that is not handled by + * this function. + * + */ +__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, const q31_t m2) +{ + q31_t result = 0; + union arm_nn_long_long mult; + + // Rounding offset to add for a right shift of 31 + mult.word.low = 1 << 30; + mult.word.high = 0; + + // Gets resolved as a SMLAL instruction + mult.long_long = mult.long_long + (q63_t)m1 * m2; + + // Utilize all of the upper 32 bits.
This is the doubling step
+    // as well.
+    result = (int32_t)(mult.long_long >> 31);
+
+    return result;
+}
+
+/**
+ * @brief           Rounding divide by power of two.
+ * @param[in]       dividend - Dividend
+ * @param[in]       exponent - Divisor = power(2, exponent)
+ *                             Range: [0, 31]
+ * @return          Rounded result of division. Midpoint is rounded away from zero.
+ *
+ */
+__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
+{
+    q31_t result = 0;
+    const q31_t remainder_mask = (1 << exponent) - 1;
+    int32_t remainder = remainder_mask & dividend;
+
+    // Basic division
+    result = dividend >> exponent;
+
+    // Adjust 'result' for rounding (mid point away from zero)
+    q31_t threshold = remainder_mask >> 1;
+    if (result < 0)
+    {
+        threshold++;
+    }
+    if (remainder > threshold)
+    {
+        result++;
+    }
+
+    return result;
+}
+
+/**
+ * @brief           Requantize a given value.
+ * @param[in]       val         Value to be requantized
+ * @param[in]       multiplier  Multiplier. Range {Q31_MIN + 1, Q31_MAX}
+ * @param[in]       shift       left or right shift for 'val * multiplier'
+ *
+ * @return          Returns (val * multiplier)/(2 ^ shift)
+ *
+ */
+__STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift)
+{
+    return arm_nn_divide_by_power_of_two(arm_nn_doubling_high_mult_no_sat(val * (1 << LEFT_SHIFT(shift)), multiplier),
+                                         RIGHT_SHIFT(shift));
+}
+
+/**
+ * @brief           memcpy optimized for MVE
+ * @param[in, out]  dst         Destination pointer
+ * @param[in]       src         Source pointer.
+ * @param[in]       block_size  Number of bytes to copy.
+ *
+ */
+__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size)
+{
+#if defined(ARM_MATH_MVEI)
+    __asm volatile("   wlstp.8     lr, %[cnt], 1f     \n"
+                   "2:                                \n"
+                   "   vldrb.8     q0, [%[in]], 16    \n"
+                   "   vstrb.8     q0, [%[out]], 16   \n"
+                   "   letp        lr, 2b             \n"
+                   "1:                                \n"
+                   : [ in ] "+r"(src), [ out ] "+r"(dst)
+                   : [ cnt ] "r"(block_size)
+                   : "q0", "memory", "r14");
+#else
+    memcpy(dst, src, block_size);
+#endif
+}
+
+#if defined(ARM_MATH_MVEI)
+/**
+ * @brief           Vector saturating doubling high multiply returning high half.
+ * @param[in]       m1        Multiplicand
+ * @param[in]       m2        Multiplier
+ * @return          Result of multiplication.
+ *
+ */
+__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const q31_t m2)
+{
+    return vqrdmulhq_n_s32(m1, m2);
+}
+
+/**
+ * @brief           Vector rounding divide by power of two.
+ * @param[in]       dividend - Dividend vector
+ * @param[in]       exponent - Divisor = power(2, exponent)
+ *                             Range: [0, 31]
+ * @return          Rounded result of division. Midpoint is rounded away from zero.
+ *
+ */
+__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent)
+{
+    const int32x4_t shift = vdupq_n_s32(-exponent);
+    const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
+    const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
+    return vrshlq_s32(fixed_up_dividend, shift);
+}
+
+/**
+ * @brief           Requantize a given vector.
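+ *                  As a rough sketch of the intent (illustrative numbers,
+ *                  not taken from this patch): with multiplier = 1717986918
+ *                  (about 0.8 in Q31) and shift = -1, an input lane of 100
+ *                  comes out as roughly 100 * 0.8 / 2 = 40. This is the
+ *                  vector counterpart of arm_nn_requantize() above.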
+ * @param[in] val Vector to be requantized + * @param[in] multiplier multiplier + * @param[in] shift shift + * + * @return Returns (val * multiplier)/(2 ^ shift) + * + */ +__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift) +{ + return arm_divide_by_power_of_two_mve( + arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier), RIGHT_SHIFT(shift)); +} + +__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2) +{ + return vqrdmulhq_s32(m1, m2); +} + +__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve_32x4(const int32x4_t dividend, const int32x4_t exponent) +{ + const int32x4_t shift = -exponent; + const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31); + const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup); + return vrshlq_s32(fixed_up_dividend, shift); +} + +__STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, + const int32x4_t multiplier, + const int32x4_t shift) +{ + const int32x4_t zz = vdupq_n_s32(0); + const mve_pred16_t p = vcmpgtq_n_s32(shift, 0); + + const int32x4_t left_shift = vpselq_s32(shift, zz, p); + const int32x4_t right_shift = -vpselq_s32(zz, shift, p); + + return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier), + right_shift); +} +#endif + +// @note The following functions are used only for softmax layer, scaled bits = 5 assumed + +__STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val) +{ + int32_t mask = 0; + int32_t shift = 24; + + const int32_t val_mod_minus_quarter = (val & ((1 << shift) - 1)) - (1 << shift); + const int32_t remainder = val_mod_minus_quarter - val; + const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28); + const int32_t x2 = MUL_SAT(x, x); + + int32_t result = 1895147668 + + MUL_SAT(1895147668, x + DIV_POW2(MUL_SAT(DIV_POW2(MUL_SAT(x2, x2), 2) + MUL_SAT(x2, x), 715827883) + x2, 1)); + +#define SELECT_IF_NON_ZERO(x) \ + { \ + mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \ + result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result); \ + } + + SELECT_IF_NON_ZERO(1672461947) + SELECT_IF_NON_ZERO(1302514674) + SELECT_IF_NON_ZERO(790015084) + SELECT_IF_NON_ZERO(290630308) + SELECT_IF_NON_ZERO(39332535) + SELECT_IF_NON_ZERO(720401) + SELECT_IF_NON_ZERO(242) + +#undef SELECT_IF_NON_ZERO + + mask = MASK_IF_ZERO(val); + return SELECT_USING_MASK(mask, Q31_MAX, result); +} + +__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp) +{ + const int32_t thresh = ((1 << (31 - exp)) - 1); + int32_t result = val << exp; + result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), Q31_MAX, result); + result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), Q31_MIN, result); + return result; +} + +__STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val) +{ + const int64_t sum = (int64_t)val + (int64_t)Q31_MAX; + const int32_t half_denominator = (int32_t)((sum + (sum >= 0 ? 1 : -1)) / 2L); + int32_t x = 1515870810 + MUL_SAT(half_denominator, -1010580540); + + const int32_t shift = (1 << 29); + x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2); + x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2); + x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2); + + return MUL_POW2(x, 1); +} + +/** + @brief Write 2 q15 elements and post increment pointer. 
+ @param[in] dest_q15 Pointer to pointer that holds address of destination. + @param[in] src_q31 Input value to be written. + @return none + */ +__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31) +{ + q31_t val = src_q31; + + memcpy(*dest_q15, &val, 4); + *dest_q15 += 2; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_nn_activations_q15.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_nn_activations_q15.c new file mode 100644 index 000000000..cb8a08fe0 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_nn_activations_q15.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_activations_q15.c + * Description: Q15 neural network activation function using direct table look-up + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nn_tables.h" +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief neural network activation function using direct table look-up + * + * @note Refer header file for details. 
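+ *
+ * A short sketch of the indexing used below (illustrative, assuming
+ * int_width = 0): shift_size = 8 + 3 - 0 = 11 and bit_mask = 0x7FF, so an
+ * input splits into a table index (in >> 11) and an 11-bit fraction
+ * (in & 0x7FF). Because full_frac == 1 << shift_size, the blend
+ * ((full_frac - frac) * value + frac * value2) >> shift_size is a plain
+ * linear interpolation between two adjacent table entries.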
+ *
+ */
+
+void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
+{
+    uint16_t i = size;
+    q15_t *pIn = data;
+    q15_t *pOut = data;
+    uint16_t shift_size = 8 + 3 - int_width;
+    uint32_t bit_mask = 0x7FF >> int_width;
+    uint32_t full_frac = bit_mask + 1;
+    const q15_t *lookup_table;
+
+    switch (type)
+    {
+    case ARM_SIGMOID:
+        lookup_table = sigmoidTable_q15;
+        break;
+    case ARM_TANH:
+    default:
+        lookup_table = tanhTable_q15;
+        break;
+    }
+
+    while (i)
+    {
+        q15_t out;
+        q15_t in = *pIn++;
+        q15_t frac = (uint32_t)in & bit_mask;
+        q15_t value = lookup_table[(uint8_t)(in >> shift_size)];
+        if ((in >> shift_size) != 0x7f)
+        {
+            q15_t value2 = lookup_table[(uint8_t)(1 + ((uint8_t)(in >> shift_size)))];
+            /* doing the interpolation here for better accuracy */
+            out = ((q31_t)(full_frac - frac) * value + (q31_t)value2 * frac) >> shift_size;
+        }
+        else
+        {
+            /* the largest positive value does not have a right side for linear interpolation */
+            out = value;
+        }
+
+        *pOut++ = out;
+        i--;
+    }
+}
+
+/**
+ * @} end of Acti group
+ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_nn_activations_q7.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_nn_activations_q7.c
new file mode 100644
index 000000000..72a0b1560
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_nn_activations_q7.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_activations_q7.c
+ * Description:  Q7 neural network activation function using direct table look-up
+ *
+ * $Date:        09. October 2020
+ * $Revision:    V.1.0.1
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nn_tables.h"
+#include "arm_nnfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Acti
+ * @{
+ */
+
+/**
+ * @brief Q7 neural network activation function using direct table look-up
+ * @param[in,out]   data        pointer to input
+ * @param[in]       size        number of elements
+ * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
+ * @param[in]       type        type of activation functions
+ *
+ * @details
+ *
+ * This is the direct table look-up approach.
+ *
+ * The integer part of the fixed-point input is assumed to be <= 3.
+ * Larger values add little: the result then becomes indistinguishable from
+ * saturation followed by any of these activation functions.
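+ *
+ * As a worked example (illustrative values only): with int_width = 0 the
+ * shift below is 3 - 0 = 3, so an input of 0x40 (0.5 in q0.7) selects
+ * lookup_table[8], while a negative input such as -0x40 shifts to -8 and
+ * the (uint8_t) cast maps it to index 248 in the upper half of the table.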
+ */ + +void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type) +{ + uint16_t i = size; + q7_t *pIn = data; + q7_t *pOut = data; + q7_t in; + q7_t out; + uint16_t shift_size = 3 - int_width; + const q7_t *lookup_table; + switch (type) + { + case ARM_SIGMOID: + lookup_table = sigmoidTable_q7; + break; + case ARM_TANH: + default: + lookup_table = tanhTable_q7; + break; + } + while (i) + { + in = *pIn++; + out = lookup_table[(uint8_t)(in >> shift_size)]; + *pOut++ = out; + i--; + } +} + +/** + * @} end of Acti group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu6_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu6_s8.c new file mode 100644 index 000000000..a460b3055 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu6_s8.c @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_relu6_s8.c + * Description: Basic s8 version of ReLU6 + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/* + * Basic ReLU6 function + * + * Refer to header file for details. + * + */ + +void arm_relu6_s8(q7_t *data, uint16_t size) +{ + int32_t i; + + for (i = 0; i < size; i++) + { + int32_t ip = data[i]; + + ip = MAX(ip, 0); + data[i] = MIN(ip, 6); + } +} + +/** + * @} end of Acti group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu_q15.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu_q15.c new file mode 100644 index 000000000..d62117c78 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu_q15.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_relu_q15.c
+ * Description:  Q15 version of ReLU
+ *
+ * $Date:        09. October 2020
+ * $Revision:    V.1.0.2
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Acti
+ * @{
+ */
+
+/**
+ * @brief Q15 RELU function
+ * @param[in,out]   data        pointer to input
+ * @param[in]       size        number of elements
+ *
+ * @details
+ *
+ * Optimized relu with QSUB instructions.
+ *
+ */
+
+void arm_relu_q15(q15_t *data, uint16_t size)
+{
+
+#if defined(ARM_MATH_DSP)
+    /* Run the following code for M cores with DSP extension */
+
+    uint16_t i = size >> 1;
+    q15_t *input = data;
+    q15_t *output = data;
+    q31_t in;
+    q31_t buf;
+    q31_t mask;
+
+    while (i)
+    {
+        in = read_q15x2_ia(&input);
+
+        /* extract the sign bit of each halfword */
+        buf = __ROR(in & 0x80008000, 15);
+
+        /* if MSB=1, mask will be 0xFFFF, 0x0 otherwise */
+        mask = __QSUB16(0x00000000, buf);
+
+        arm_nn_write_q15x2_ia(&output, in & (~mask));
+        i--;
+    }
+
+    if (size & 0x1)
+    {
+        if (*input < 0)
+        {
+            *input = 0;
+        }
+        input++;
+    }
+#else
+    /* Run the following code as reference implementation for M cores without DSP extension */
+    uint16_t i;
+
+    for (i = 0; i < size; i++)
+    {
+        if (data[i] < 0)
+            data[i] = 0;
+    }
+
+#endif /* ARM_MATH_DSP */
+}
+
+/**
+ * @} end of Acti group
+ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu_q7.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu_q7.c
new file mode 100644
index 000000000..75be35d99
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ActivationFunctions/arm_relu_q7.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_relu_q7.c
+ * Description:  Q7 version of ReLU
+ *
+ * $Date:        09. October 2020
+ * $Revision:    V.1.0.3
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Acti
+ * @{
+ */
+
+/**
+ * @brief Q7 RELU function
+ * @param[in,out]   data        pointer to input
+ * @param[in]       size        number of elements
+ *
+ * @details
+ *
+ * Optimized relu with QSUB instructions.
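+ *
+ * A sketch of the SIMD trick in the DSP path below: in & 0x80808080 keeps
+ * only the sign bits, __ROR(..., 7) rotates each sign bit to bit 0 of its
+ * own byte, and __QSUB8(0, ...) turns each resulting 1 into 0xFF. The
+ * final in & (~mask) therefore zeroes exactly the negative bytes and
+ * passes non-negative ones through unchanged.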
+ *
+ */
+
+void arm_relu_q7(q7_t *data, uint16_t size)
+{
+
+#if defined(ARM_MATH_DSP)
+    /* Run the following code for M cores with DSP extension */
+
+    uint16_t i = size >> 2;
+    q7_t *input = data;
+    q7_t *output = data;
+    q31_t in;
+    q31_t buf;
+    q31_t mask;
+
+    while (i)
+    {
+        in = read_q7x4_ia(&input);
+
+        /* extract the sign bit of each byte */
+        buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7);
+
+        /* if MSB=1, mask will be 0xFF, 0x0 otherwise */
+        mask = __QSUB8(0x00000000, buf);
+
+        write_q7x4_ia(&output, in & (~mask));
+
+        i--;
+    }
+
+    i = size & 0x3;
+    while (i)
+    {
+        if (*input < 0)
+        {
+            *input = 0;
+        }
+        input++;
+        i--;
+    }
+
+#else
+    /* Run the following code as reference implementation for cores without DSP extension */
+
+    uint16_t i;
+
+    for (i = 0; i < size; i++)
+    {
+        if (data[i] < 0)
+            data[i] = 0;
+    }
+
+#endif
+}
+
+/**
+ * @} end of Acti group
+ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
new file mode 100644
index 000000000..85740edb8
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_elementwise_add_s8
+ * Description:  Element wise add
+ *
+ * $Date:        01. March 2021
+ * $Revision:    V.2.5.3
+ *
+ * Target Processor:  Cortex-M CPUs
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+#if defined(ARM_MATH_MVEI)
+#include "arm_helium_utils.h"
+#endif
+
+#if defined(ARM_MATH_MVEI)
+#define SAT_INPUT_VECT(__INPUT_V, __MULT, __SHIFT)                                                                     \
+    __INPUT_V = arm_doubling_high_mult_mve(__INPUT_V, __MULT);                                                         \
+    __INPUT_V = arm_divide_by_power_of_two_mve(__INPUT_V, -__SHIFT);
+#endif
+
+/**
+ * @note The *_no_sat name does not mean that the result is left
+ *       unsaturated. Since __MULT is a positive integer, the result is
+ *       still saturated correctly; the API definition has more detail.
+ */
+#define SAT_INPUT(__INPUT, __MULT, __SHIFT)                                                                            \
+    __INPUT = arm_nn_doubling_high_mult_no_sat(__INPUT, __MULT);                                                       \
+    __INPUT = arm_nn_divide_by_power_of_two(__INPUT, -__SHIFT);
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup BasicMath
+ * @{
+ */
+
+/*
+ * s8 element wise add
+ *
+ * Refer header file for details.
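+ *
+ * Per element, the code below computes (a sketch of the control flow, not
+ * an additional API):
+ *
+ *   t1  = requantize((input_1 + input_1_offset) << left_shift, input_1_mult, input_1_shift)
+ *   t2  = requantize((input_2 + input_2_offset) << left_shift, input_2_mult, input_2_shift)
+ *   out = clamp(requantize(t1 + t2, out_mult, out_shift) + out_offset,
+ *               out_activation_min, out_activation_max)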
+ * + */ + +/* Note: __SHIFT is expected to be <=0 */ + +arm_status arm_elementwise_add_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_1_mult, + const int32_t input_1_shift, + const int32_t input_2_offset, + const int32_t input_2_mult, + const int32_t input_2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const uint32_t block_size) +{ +#if defined(ARM_MATH_MVEI) + int32_t count = (int32_t)block_size; + + while (count > 0) + { + int32x4_t vect_1; + int32x4_t vect_2; + + mve_pred16_t p = vctp32q((uint32_t)count); + + vect_1 = vldrbq_z_s32(input_1_vect, p); + vect_2 = vldrbq_z_s32(input_2_vect, p); + + vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset)); + vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset)); + + vect_1 = vshlq_r_s32(vect_1, left_shift); + vect_2 = vshlq_r_s32(vect_2, left_shift); + + SAT_INPUT_VECT(vect_1, input_1_mult, input_1_shift); + SAT_INPUT_VECT(vect_2, input_2_mult, input_2_shift); + + vect_1 = vaddq_s32(vect_1, vect_2); + SAT_INPUT_VECT(vect_1, out_mult, out_shift); + + vect_1 = vaddq_n_s32(vect_1, out_offset); + + vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min)); + vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max)); + + input_1_vect += 4; + input_2_vect += 4; + vstrbq_p_s32(output, vect_1, p); + + output += 4; + count -= 4; + } +#else + uint32_t loop_count; + int32_t input_1; + int32_t input_2; + int32_t sum; + +#if defined(ARM_MATH_DSP) + int32_t a_1, b_1, a_2, b_2; + + int32_t offset_1_packed, offset_2_packed; + + int8_t r1, r2, r3, r4; + + offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL); + offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL); + + loop_count = block_size >> 2; + + while (loop_count > 0U) + { + /* 4 outputs are calculated in one loop. 
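+       Each group of four q7 inputs is first widened into two packed q15 pairs with
+       read_and_pad_reordered(), so a single __SADD16 applies the packed offset to two lanes at once.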
The order of calculation follows the order of the output
+           sign extension intrinsic */
+        input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
+        input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
+
+        a_1 = __SADD16(a_1, offset_1_packed);
+        b_1 = __SADD16(b_1, offset_1_packed);
+
+        a_2 = __SADD16(a_2, offset_2_packed);
+        b_2 = __SADD16(b_2, offset_2_packed);
+
+        /* Sum 1 */
+        input_1 = (b_1 & 0x0FFFF) << left_shift;
+
+        SAT_INPUT(input_1, input_1_mult, input_1_shift);
+
+        input_2 = (b_2 & 0x0FFFF) << left_shift;
+        SAT_INPUT(input_2, input_2_mult, input_2_shift);
+
+        sum = input_1 + input_2;
+        SAT_INPUT(sum, out_mult, out_shift);
+        sum += out_offset;
+        sum = MAX(sum, out_activation_min);
+        sum = MIN(sum, out_activation_max);
+        r1 = (q7_t)sum;
+
+        /* Sum 3 */
+        input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift;
+        SAT_INPUT(input_1, input_1_mult, input_1_shift);
+
+        input_2 = ((b_2 >> 16) & 0x0FFFF) << left_shift;
+        SAT_INPUT(input_2, input_2_mult, input_2_shift);
+
+        sum = input_1 + input_2;
+        SAT_INPUT(sum, out_mult, out_shift);
+        sum += out_offset;
+        sum = MAX(sum, out_activation_min);
+        sum = MIN(sum, out_activation_max);
+        r3 = (q7_t)sum;
+
+        /* Sum 2 */
+        input_1 = (a_1 & 0x0FFFF) << left_shift;
+        SAT_INPUT(input_1, input_1_mult, input_1_shift);
+
+        input_2 = (a_2 & 0x0FFFF) << left_shift;
+        SAT_INPUT(input_2, input_2_mult, input_2_shift);
+
+        sum = input_1 + input_2;
+        SAT_INPUT(sum, out_mult, out_shift);
+        sum += out_offset;
+        sum = MAX(sum, out_activation_min);
+        sum = MIN(sum, out_activation_max);
+        r2 = (q7_t)sum;
+
+        /* Sum 4 */
+        input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift;
+        SAT_INPUT(input_1, input_1_mult, input_1_shift);
+
+        input_2 = ((a_2 >> 16) & 0x0FFFF) << left_shift;
+        SAT_INPUT(input_2, input_2_mult, input_2_shift);
+
+        sum = input_1 + input_2;
+        SAT_INPUT(sum, out_mult, out_shift);
+        sum += out_offset;
+        sum = MAX(sum, out_activation_min);
+        sum = MIN(sum, out_activation_max);
+        r4 = (q7_t)sum;
+
+        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
+
+        loop_count--;
+    }
+
+    loop_count = block_size & 0x3;
+#else
+    loop_count = block_size;
+#endif
+
+    while (loop_count > 0U)
+    {
+        /* C = A + B */
+
+        input_1 = (*input_1_vect++ + input_1_offset) << left_shift;
+        input_2 = (*input_2_vect++ + input_2_offset) << left_shift;
+
+        input_1 = arm_nn_doubling_high_mult(input_1, input_1_mult);
+        input_1 = arm_nn_divide_by_power_of_two(input_1, -input_1_shift);
+
+        input_2 = arm_nn_doubling_high_mult(input_2, input_2_mult);
+        input_2 = arm_nn_divide_by_power_of_two(input_2, -input_2_shift);
+
+        sum = input_1 + input_2;
+        SAT_INPUT(sum, out_mult, out_shift);
+        sum += out_offset;
+
+        sum = MAX(sum, out_activation_min);
+        sum = MIN(sum, out_activation_max);
+
+        *output++ = (q7_t)sum;
+
+        /* Decrement loop counter */
+        loop_count--;
+    }
+
+#endif /* ARM_MATH_MVEI */
+
+    return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * @} end of BasicMath group
+ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
new file mode 100644
index 000000000..7c560fe5c
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_elementwise_mul_s8 + * Description: Element wise multiplication + * + * $Date: January 26, 2021 + * $Revision: V.1.0.5 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup BasicMath + * @{ + */ + +/** + * @brief s8 element wise multiplication of two vectors + * + * @note Refer header file for details. + * + */ + +arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const uint32_t block_size) +{ + + int32_t loop_count; +#if defined(ARM_MATH_MVEI) + + loop_count = (block_size + 3) / 4; + uint32_t num_elements = block_size; + + for (int i = 0; i < loop_count; i++) + { + mve_pred16_t p = vctp32q(num_elements); + + int32x4_t input_1 = vldrbq_z_s32(input_1_vect, p); + input_1 = vaddq_n_s32(input_1, input_1_offset); + + int32x4_t input_2 = vldrbq_z_s32(input_2_vect, p); + input_2 = vaddq_n_s32(input_2, input_2_offset); + + int32x4_t res_0 = vmulq_s32(input_1, input_2); + + res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift)); + + res_0 += vdupq_n_s32(out_offset); + + res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min)); + res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max)); + + vstrbq_p_s32(output, res_0, p); + input_1_vect += 4; + input_2_vect += 4; + output += 4; + num_elements -= 4; + } + +#else + int32_t input_1; + int32_t input_2; + int32_t mul_res; + +#if defined(ARM_MATH_DSP) + int32_t a_1, b_1, a_2, b_2; + + int32_t offset_1_packed, offset_2_packed; + + int8_t r1, r2, r3, r4; + + offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL); + offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL); + + loop_count = block_size >> 2; + + while (loop_count > 0) + { + /* 4 outputs are calculated in one loop. 
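+       Once the offsets are applied, each lane is a small q15 value, so the q15 x q15
+       products below fit in q31 and can be requantized without intermediate saturation.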
The order of calculation follows the order of the output
+           sign extension intrinsic */
+        input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
+        input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
+
+        a_1 = __SADD16(a_1, offset_1_packed);
+        b_1 = __SADD16(b_1, offset_1_packed);
+
+        a_2 = __SADD16(a_2, offset_2_packed);
+        b_2 = __SADD16(b_2, offset_2_packed);
+
+        /* Mul 1 */
+        input_1 = (int16_t)(b_1 & 0x0FFFFL);
+        input_2 = (int16_t)(b_2 & 0x0FFFFL);
+
+        mul_res = input_1 * input_2;
+        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
+
+        mul_res = MAX(mul_res, out_activation_min);
+        mul_res = MIN(mul_res, out_activation_max);
+        r1 = (q7_t)mul_res;
+
+        /* Mul 3 */
+        input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL);
+        input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL);
+
+        mul_res = input_1 * input_2;
+        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
+        mul_res = MAX(mul_res, out_activation_min);
+        mul_res = MIN(mul_res, out_activation_max);
+        r3 = (q7_t)mul_res;
+
+        /* Mul 2 */
+        input_1 = (int16_t)(a_1 & 0x0FFFFL);
+        input_2 = (int16_t)(a_2 & 0x0FFFFL);
+
+        mul_res = input_1 * input_2;
+        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
+        mul_res = MAX(mul_res, out_activation_min);
+        mul_res = MIN(mul_res, out_activation_max);
+        r2 = (q7_t)mul_res;
+
+        /* Mul 4 */
+        input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL);
+        input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL);
+
+        mul_res = input_1 * input_2;
+        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
+        mul_res = MAX(mul_res, out_activation_min);
+        mul_res = MIN(mul_res, out_activation_max);
+        r4 = (q7_t)mul_res;
+
+        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
+
+        loop_count--;
+    }
+
+    loop_count = block_size & 0x3;
+#else
+    loop_count = block_size;
+#endif
+
+    while (loop_count > 0)
+    {
+        /* C = A * B */
+
+        input_1 = *input_1_vect++ + input_1_offset;
+        input_2 = *input_2_vect++ + input_2_offset;
+
+        mul_res = input_1 * input_2;
+        mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
+
+        mul_res = MAX(mul_res, out_activation_min);
+        mul_res = MIN(mul_res, out_activation_max);
+
+        *output++ = (q7_t)mul_res;
+
+        /* Decrement loop counter */
+        loop_count--;
+    }
+#endif
+    return ARM_MATH_SUCCESS;
+}
+
+/**
+ * @} end of BasicMath group
+ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c
new file mode 100644
index 000000000..17ffe26bd
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_concatenation_s8_w.c + * Description: s8 version of concatenation along the W axis + * + * $Date: October 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Concatenation + * @{ + */ + +/* + * s8 version of concatenation along the W axis + * + * Refer to header file for details. + * + */ +void arm_concatenation_s8_w(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint32_t offset_w) +{ + const uint32_t input_copy_size = input_x * input_y * input_z * input_w; + + output += offset_w * (input_x * input_y * input_z); + + memcpy(output, input, input_copy_size); +} + +/** + * @} end of Concatenation group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c new file mode 100644 index 000000000..de89fc718 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_concatenation_s8_x.c + * Description: s8 version of concatenation along the X axis + * + * $Date: October 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Concatenation + * @{ + */ + +/* + * s8 version of concatenation along the X axis + * + * Refer to header file for details. 
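+ *
+ * A small worked example (illustrative sizes): concatenating tensors with
+ * input_x = 3 and input_x = 4 along X gives output_x = 7; the first call
+ * uses offset_x = 0 and the second offset_x = 3. Each of the
+ * input_y * input_z * input_w rows then copies input_x bytes and advances
+ * the output pointer by output_x.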
+ * + */ +void arm_concatenation_s8_x(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_x, + const uint32_t offset_x) +{ + const uint32_t num_iterations = input_y * input_z * input_w; + + output += offset_x; + + uint32_t i; + + // Copy per row + for (i = 0; i < num_iterations; ++i) + { + memcpy(output, input, input_x); + input += input_x; + output += output_x; + } +} + +/** + * @} end of Concatenation group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c new file mode 100644 index 000000000..c80d56acb --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_concatenation_s8_y.c + * Description: s8 version of concatenation along the Y axis + * + * $Date: October 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Concatenation + * @{ + */ + +/* + * s8 version of concatenation along the Y axis + * + * Refer to header file for details. + * + */ +void arm_concatenation_s8_y(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_y, + const uint32_t offset_y) +{ + const uint32_t num_iterations = input_z * input_w; + const uint32_t input_copy_size = input_x * input_y; + const uint32_t output_stride = input_x * output_y; + + output += offset_y * input_x; + uint32_t i; + + // Copy per tile + for (i = 0; i < num_iterations; ++i) + { + memcpy(output, input, input_copy_size); + input += input_copy_size; + output += output_stride; + } +} + +/** + * @} end of Concatenation group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c new file mode 100644 index 000000000..342e4d887 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_concatenation_s8_z.c + * Description: s8 version of concatenation along the Z axis + * + * $Date: October 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Concatenation + * @{ + */ + +/* + * s8 version of concatenation along the Z axis + * + * Refer to header file for details. + * + */ +void arm_concatenation_s8_z(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_z, + const uint32_t offset_z) +{ + const uint32_t input_copy_size = input_x * input_y * input_z; + const uint32_t output_stride = input_x * input_y * output_z; + + output += offset_z * (input_x * input_y); + + uint32_t i; + + for (i = 0; i < input_w; ++i) + { + memcpy(output, input, input_copy_size); + input += input_copy_size; + output += output_stride; + } +} + +/** + * @} end of Concatenation group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c new file mode 100644 index 000000000..75e9bb869 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c @@ -0,0 +1,203 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1_x_n_s8.c + * Description: s8 version of 1xN convolution using symmetric quantization. + * + * $Date: January 26, 2021 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * 1xN s8 convolution function. + * + * Refer header file for details. 
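+ *
+ * Two points recoverable from the code below: output_dims->w must be a
+ * multiple of 4 (otherwise ARM_MATH_SIZE_MISMATCH is returned), and the
+ * MVE path computes four adjacent output columns per iteration, clipping
+ * each column's kernel window individually when left/right padding makes
+ * the windows differ.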
+ * + */ + +arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + (void)bias_dims; + arm_status status = ARM_MATH_SUCCESS; + if (output_dims->w % 4 != 0) + { + status = ARM_MATH_SIZE_MISMATCH; + goto out; + } + +#if defined(ARM_MATH_MVEI) + (void)ctx; + + const uint16_t input_x = input_dims->w; + const uint16_t kernel_x = filter_dims->w; + const uint16_t output_x = output_dims->w; + const uint16_t output_ch = output_dims->c; + const uint16_t input_ch = input_dims->c; + const uint16_t pad_x = conv_params->padding.w; + const uint16_t stride_x = conv_params->stride.w; + + const int32_t input_offset = conv_params->input_offset; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4) + { + int32_t input_begin_idx[4]; + int32_t ker_begin_idx[4]; + int32_t ker_end_idx[4]; + + for (int i = 0; i < 4; i++) + { + const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x; + input_begin_idx[i] = MAX(0, est_input_x_idx); + ker_begin_idx[i] = MAX(0, -est_input_x_idx); + ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx); + } + + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32x4_t s_offset; + int32_t acc[4]; + if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x)) + { + int32_t sum_row[4]; + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch, + input_data + input_begin_idx[0] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[0] * input_ch), + &sum_row[0], + &acc[0]); + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch, + input_data + input_begin_idx[1] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[1] * input_ch), + &sum_row[1], + &acc[1]); + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch, + input_data + input_begin_idx[2] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[2] * input_ch), + &sum_row[2], + &acc[2]); + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch, + input_data + input_begin_idx[3] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[3] * input_ch), + &sum_row[3], + &acc[3]); + + s_offset = vldrwq_s32(sum_row); + } + else + { + int32_t sum_row; + (void)arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch, + stride_x * input_ch, + input_data + input_begin_idx[0] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch), + &sum_row, + acc); + + s_offset = vdupq_n_s32(sum_row); + } + int32x4_t res = vldrwq_s32(acc); + s_offset = vmulq_n_s32(s_offset, input_offset); + res = vaddq_s32(res, s_offset); + if (bias_data) + { + res = vaddq_n_s32(res, bias_data[i_out_ch]); + } + res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]); + res = vaddq_n_s32(res, out_offset); + + res = vmaxq_s32(res, vdupq_n_s32(out_activation_min)); + res = vminq_s32(res, 
vdupq_n_s32(out_activation_max)); + + const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3}; + vstrbq_scatter_offset_s32(output_data, scatter_offset, res); + output_data++; + } + output_data += (3 * output_ch); + } + +#else + status = arm_convolve_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); +#endif + +out: + /* Return to application */ + return status; +} + +int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c new file mode 100644 index 000000000..6418707f1 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_convolve_1x1_HWC_q7_fast_nonsquare.c
+ * Description:  Fast Q7 version of 1x1 convolution (non-square shape)
+ *
+ * $Date:        January 26, 2021
+ * $Revision:    V.1.0.2
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/**
+ * @brief Fast Q7 version of 1x1 convolution (non-square shape)
+ * @param[in]       Im_in         pointer to input tensor
+ * @param[in]       dim_im_in_x   input tensor dimension x
+ * @param[in]       dim_im_in_y   input tensor dimension y
+ * @param[in]       ch_im_in      number of input tensor channels
+ * @param[in]       wt            pointer to kernel weights
+ * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel_x  filter kernel size x
+ * @param[in]       dim_kernel_y  filter kernel size y
+ * @param[in]       padding_x     padding size x
+ * @param[in]       padding_y     padding size y
+ * @param[in]       stride_x      convolution stride x
+ * @param[in]       stride_y      convolution stride y
+ * @param[in]       bias          pointer to bias
+ * @param[in]       bias_shift    amount of left-shift for bias
+ * @param[in]       out_shift     amount of right-shift for output
+ * @param[in,out]   Im_out        pointer to output tensor
+ * @param[in]       dim_im_out_x  output tensor dimension x
+ * @param[in]       dim_im_out_y  output tensor dimension y
+ * @param[in,out]   bufferA       pointer to buffer space for input
+ * @param[in,out]   bufferB       pointer to buffer space for output
+ * @return     The function returns either
+ *             ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1
+ * and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise
+ * separable convolution.
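+ *
+ * Intuition for the 1x1 case: im2col reduces to a per-pixel channel copy,
+ * so each output pixel is a ch_im_out x ch_im_in matrix applied to a
+ * ch_im_in vector; the DSP path below buffers two pixels at a time before
+ * invoking the two-column matrix-multiplication kernel.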
+ *
+ * This function is the version with the full list of optimization tricks,
+ * subject to some constraints:
+ *   ch_im_in is multiple of 4
+ *   ch_im_out is multiple of 2
+ *
+ * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
+ *     https://arxiv.org/abs/1704.04861
+ */
+
+arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
+                                                  const uint16_t dim_im_in_x,
+                                                  const uint16_t dim_im_in_y,
+                                                  const uint16_t ch_im_in,
+                                                  const q7_t *wt,
+                                                  const uint16_t ch_im_out,
+                                                  const uint16_t dim_kernel_x,
+                                                  const uint16_t dim_kernel_y,
+                                                  const uint16_t padding_x,
+                                                  const uint16_t padding_y,
+                                                  const uint16_t stride_x,
+                                                  const uint16_t stride_y,
+                                                  const q7_t *bias,
+                                                  const uint16_t bias_shift,
+                                                  const uint16_t out_shift,
+                                                  q7_t *Im_out,
+                                                  const uint16_t dim_im_out_x,
+                                                  const uint16_t dim_im_out_y,
+                                                  q15_t *bufferA,
+                                                  q7_t *bufferB)
+{
+    (void)bufferB;
+#if defined(ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+    (void)dim_im_in_y;
+    int16_t i_out_y, i_out_x;
+    int16_t i_ch_out;
+
+    /* -----------------------
+     * Here we use bufferA as q15_t internally, as computations are done at the q15_t level;
+     * im2col converts the q7_t input into q15_t output in this buffer
+     */
+
+    q15_t *pBuffer = bufferA;
+    q7_t *pOut = Im_out;
+
+    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
+        padding_y != 0 || stride_x != 1 || stride_y != 1)
+    {
+        /* check if the input dimension meets the constraints */
+        return ARM_MATH_SIZE_MISMATCH;
+    }
+
+    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
+    {
+        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
+        {
+            /* This part implements the im2col function */
+            arm_q7_to_q15_reordered_no_shift(
+                (q7_t *)Im_in + (i_out_y * dim_im_in_x + i_out_x) * ch_im_in, pBuffer, ch_im_in);
+            pBuffer += ch_im_in;
+
+            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
+            {
+                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
+                    wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut);
+                /* counter reset */
+                pBuffer = bufferA;
+            }
+        }
+    }
+
+    /* check if there is left-over for compute */
+    if (pBuffer != bufferA)
+    {
+        const q7_t *pA = wt;
+        for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
+        {
+            q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
+            const q15_t *pB = bufferA;
+            /* basically each iteration processes 4 entries */
+            uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
+
+            while (colCnt)
+            {
+
+                q31_t inA1, inA2;
+                q31_t inB1, inB2;
+
+                pA = read_and_pad_reordered(pA, &inA1, &inA2);
+
+                inB1 = arm_nn_read_q15x2_ia(&pB);
+                sum = __SMLAD(inA1, inB1, sum);
+                inB2 = arm_nn_read_q15x2_ia(&pB);
+
+                sum = __SMLAD(inA2, inB2, sum);
+
+                colCnt--;
+            }
+            colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
+            while (colCnt)
+            {
+                q7_t inA1 = *pA++;
+                q15_t inB1 = *pB++;
+                sum += inA1 * inB1;
+                colCnt--;
+            }
+            *pOut = (q7_t)__SSAT((sum >> out_shift), 8);
+            pOut++;
+        }
+    }
+
+#else
+    (void)bufferA;
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+    int i, j, k, l, m, n;
+    int conv_out;
+    int in_row, in_col;
+
+    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
+        padding_y != 0 || stride_x != 1 || stride_y != 1)
+    {
+        /* check if the input dimension meets the constraints */
+        return ARM_MATH_SIZE_MISMATCH;
+    }
+
+    for (i = 0; i < ch_im_out; i++)
+    {
+        for (j = 0; j < dim_im_out_y; j++)
+        {
+            for (k = 0; k < dim_im_out_x; k++)
{ + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_y + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c new file mode 100644 index 000000000..933847f0f --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1x1_s8_fast.c + * Description: Fast q7 version of 1x1 convolution (non-square shape) + * + * $Date: 09. October 2020 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#define DIM_KER_X (1U) +#define DIM_KER_Y (1U) + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Fast s8 version for 1x1 convolution (non-square shape) + * + * Refer header file for details. 
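+ *
+ * On targets without MVE, the call below reduces to a single
+ * arm_nn_mat_mult_nt_t_s8() with lhs_rows = input w * h * n,
+ * rhs_rows = output channels and rhs_cols = input channels; the entry
+ * check enforces stride 1, zero padding and input channels divisible by 4
+ * before either path runs.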
+ * + */ + +arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + if (input_dims->c % 4 != 0 || conv_params->padding.w != 0 || conv_params->padding.h != 0 || + conv_params->stride.w != 1 || conv_params->stride.h != 1) + { + return ARM_MATH_SIZE_MISMATCH; + } + + (void)ctx; + (void)filter_dims; + (void)bias_dims; + +#if defined(ARM_MATH_MVEI) + + const int32_t col_len = input_dims->w * input_dims->h * input_dims->n; + const int32_t output_ch = output_dims->c; + const int32_t input_ch = input_dims->c; + const int32_t input_offset = conv_params->input_offset; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_items = 0; i_items <= (col_len - 4); i_items += 4) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t sum_row = 0; + int32_t temp_out[4]; + + (void)arm_nn_mat_mul_core_4x_s8(input_ch, + input_ch, + input_data + i_items * input_ch, + filter_data + i_out_ch * input_ch, + &sum_row, + temp_out); + int32x4_t res = vldrwq_s32(temp_out); + if (bias_data) + { + res = vaddq_n_s32(res, bias_data[i_out_ch]); + } + sum_row = sum_row * input_offset; + res = vaddq_n_s32(res, sum_row); + res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]); + res = vaddq_n_s32(res, out_offset); + + res = vmaxq_s32(res, vdupq_n_s32(out_activation_min)); + res = vminq_s32(res, vdupq_n_s32(out_activation_max)); + + const uint32x4_t scatter_offset = { + 0, (uint32_t)output_ch, (uint32_t)output_ch * 2, (uint32_t)output_ch * 3}; + vstrbq_scatter_offset_s32(output_data, scatter_offset, res); + output_data++; + } + output_data += (3 * output_ch); + } + + /* Handle left over elements */ + for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t sum_row = 0; + + int32_t acc; + (void)arm_nn_mat_mul_core_1x_s8( + input_ch, input_data + i_items * input_ch, filter_data + i_out_ch * input_ch, &sum_row, &acc); + if (bias_data) + { + acc += bias_data[i_out_ch]; + } + sum_row = (sum_row * input_offset); + acc += sum_row; + acc = arm_nn_requantize(acc, output_mult[i_out_ch], output_shift[i_out_ch]); + acc += out_offset; + + acc = MAX(acc, out_activation_min); + acc = MIN(acc, out_activation_max); + *output_data++ = acc; + } + } + +#else + /* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */ + + const int32_t lhs_rows = input_dims->w * input_dims->h * input_dims->n; + const int32_t rhs_rows = output_dims->c; + const int32_t rhs_cols = input_dims->c; + + arm_nn_mat_mult_nt_t_s8(input_data, + filter_data, + bias_data, + output_data, + quant_params->multiplier, + quant_params->shift, + lhs_rows, + rhs_rows, + rhs_cols, + conv_params->input_offset, + conv_params->output_offset, + conv_params->activation.min, + conv_params->activation.max); + +#endif + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t 
arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims) +{ + (void)input_dims; + return 0; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c new file mode 100644 index 000000000..e3502ebf4 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_basic.c + * Description: Q15 version of convolution + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns ARM_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * bufferA size: ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * This basic version is designed to work for any input tensor and weight + * dimension. 
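+ *
+ * A hypothetical invocation (all sizes are illustrative assumptions,
+ * not requirements of this function):
+ * @code
+ *     // 32x32 input, 8 channels, 16 filters of 5x5, padding 2,
+ *     // stride 1 -> 32x32 output
+ *     static q15_t bufferA[8 * 5 * 5]; // ch_im_in*dim_kernel*dim_kernel
+ *     arm_status st = arm_convolve_HWC_q15_basic(in, 32, 8, wt, 16, 5, 2, 1,
+ *                                                bias, 0, 9, out, 32,
+ *                                                bufferA, NULL);
+ * @endcode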
+ */ + +arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + uint16_t im2col_out_pixel_index = 0; + q15_t *pBuffer = bufferA; + q15_t *pOut = Im_out; + q15_t *im_buffer = bufferA; + const q15_t *pA; + int i; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + pA = wt; + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = im_buffer; + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA1, inB1, sum); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q15_t)__SSAT((sum >> out_shift), 16); + pOut++; + } + + /* counter reset */ + pBuffer = im_buffer; + im2col_out_pixel_index++; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c 
b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c new file mode 100644 index 000000000..ac007e4a9 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_fast.c + * Description: Fast Q15 version of convolution + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
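+ *
+ * A hypothetical call that satisfies the constraints listed under
+ * @details below (the sizes and shifts are illustrative assumptions only):
+ * @code
+ *     // 16x16 input, 4 channels, 8 filters of 3x3, padding 1,
+ *     // stride 1 -> 16x16 output; ch_im_in and ch_im_out are both even
+ *     static q15_t bufferA[2 * 4 * 3 * 3]; // 2*ch_im_in*dim_kernel*dim_kernel
+ *     arm_status st = arm_convolve_HWC_q15_fast(in, 16, 4, wt, 8, 3, 1, 1,
+ *                                               bias, 0, 10, out, 16,
+ *                                               bufferA, NULL);
+ * @endcode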
+ * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * Input dimension constraints: + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multiple of 2 + * + */ + +arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + q15_t *pBuffer = bufferA; + q15_t *im_buffer = bufferA; + q15_t *pOut = Im_out; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (i_out_x & 0x1) + { + int i; + /* initialize the matrix pointers for A */ + const q15_t *pA = wt; + + /* set up the second output pointers */ + q15_t *pOut2 = pOut + ch_im_out; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = im_buffer; + const q15_t *pB2 = pB + ch_im_in * dim_kernel * dim_kernel; + + /* aling the second pointer for A */ + const q15_t *pA2 = pA + ch_im_in * dim_kernel * dim_kernel; + + /* init the sum with bias */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 1; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA2); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA1, inB1, sum); + sum2 = __SMLAD(inA1, inB2, sum2); + sum3 = __SMLAD(inA2, inB1, sum3); + sum4 = __SMLAD(inA2, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x1; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q15_t)__SSAT(sum >> out_shift, 16); + *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16); + *pOut2++ = 
(q15_t)__SSAT(sum2 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16); + + /* skip the row computed with A2 */ + pA += ch_im_in * dim_kernel * dim_kernel; + } /* for over ch_im_out */ + + pOut += ch_im_out; + /* counter reset */ + pBuffer = im_buffer; + } + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c new file mode 100644 index 000000000..27947e848 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c @@ -0,0 +1,270 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_fast.c + * Description: Fast Q15 version of convolution + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q15 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * Input dimension constraints: + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multiple of 2 + * + */ + +arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + q15_t *pBuffer = bufferA; + q15_t *im_buffer = bufferA; + q15_t *pOut = Im_out; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* 
arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (i_out_x & 0x1) + { + int i; + /* initialize the matrix pointers for A */ + const q15_t *pA = wt; + + /* set up the second output pointers */ + q15_t *pOut2 = pOut + ch_im_out; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = im_buffer; + const q15_t *pB2 = pB + ch_im_in * dim_kernel_y * dim_kernel_x; + + /* aling the second pointer for A */ + const q15_t *pA2 = pA + ch_im_in * dim_kernel_y * dim_kernel_x; + + /* init the sum with bias */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 1; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA2); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA1, inB1, sum); + sum2 = __SMLAD(inA1, inB2, sum2); + sum3 = __SMLAD(inA2, inB1, sum3); + sum4 = __SMLAD(inA2, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x1; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q15_t)__SSAT(sum >> out_shift, 16); + *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16); + + /* skip the row computed with A2 */ + pA += ch_im_in * dim_kernel_y * dim_kernel_x; + } /* for over ch_im_out */ + + pOut += ch_im_out; + /* counter reset */ + pBuffer = im_buffer; + } + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git 
a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c new file mode 100644 index 000000000..46e9a7788 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c @@ -0,0 +1,279 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_RGB.c + * Description: Q7 version of convolution for RGB image + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 convolution function for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * Input dimension constraints: + * + * ch_im_in equals 3 + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. This applies on the first layer of CNNs which has input + * image with RGB format. 
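+ *
+ * A hypothetical first-layer call (CIFAR-10-like sizes, chosen purely
+ * for illustration):
+ * @code
+ *     // 32x32 RGB input, 32 filters of 5x5, padding 2, stride 1
+ *     static q15_t bufferA[2 * 3 * 5 * 5]; // 2*ch_im_in*dim_kernel*dim_kernel
+ *     arm_status st = arm_convolve_HWC_q7_RGB(img, 32, 3, wt, 32, 5, 2, 1,
+ *                                             bias, 0, 9, out, 32,
+ *                                             bufferA, NULL);
+ *     // any ch_im_in other than 3 returns ARM_MATH_SIZE_MISMATCH
+ * @endcode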
+ */ + +arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + // check if number of input channels is 3 + if (ch_im_in != 3) + { + return ARM_MATH_SIZE_MISMATCH; + } + // This part implements the im2col function + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */ + *__SIMD32(pBuffer) = 0x0; + *(pBuffer + 2) = 0; + pBuffer += 3; + } + else + { + /* + * Equivalent to: + * arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3); + */ + + const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3; + q31_t buf = arm_nn_read_q7x4(pPixel); + + union arm_nnword top; + union arm_nnword bottom; + + top.word = __SXTB16(buf); + bottom.word = __SXTB16(__ROR(buf, 8)); + +#ifndef ARM_MATH_BIG_ENDIAN + /* + * little-endian, | omit | 3rd | 2nd | 1st | + * MSB LSB + * top | 3rd | 1st |; bottom | omit | 2nd | + * + * version 1, need to swap 2nd and 3rd weight + * *__SIMD32(pBuffer) = top.word; + * *(pBuffer+2) = bottom.half_words[0]; + * + * version 2, no weight shuffling required + */ + *pBuffer++ = top.half_words[0]; + *__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0); +#else + /* + * big-endian, | 1st | 2nd | 3rd | omit | + * MSB LSB + * top | 2nd | omit |; bottom | 1st | 3rd | + * + * version 1, need to swap 2nd and 3rd weight + * *__SIMD32(pBuffer) = bottom.word; + * *(pBuffer+2) = top.half_words[1]; + * + * version 2, no weight shuffling required + */ + *pBuffer++ = bottom.half_words[0]; + *__SIMD32(pBuffer) = __PKHTB(top.word, bottom.word, 0); +#endif + pBuffer += 2; + } + } + } + + if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, 3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q15_t *pB = bufferA; + /* basically each time it process 4 entries */ + uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pB); + sum = 
__SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = 3 * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + // check if number of input channels is 3 + if (ch_im_in != 3) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + /* if-for implementation */ + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c new file mode 100644 index 000000000..942682e09 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_basic.c + * Description: Q7 version of convolution + * + * $Date: 09. 
October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns ARM_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * This basic version is designed to work for any input tensor and weight + * dimension. + */ + +arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* Copying the pixel data to column */ + arm_q7_to_q15_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* Computation is filed for every 2 columns */ + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + + /* Point to the 
beging of the im2col buffer */ + const q15_t *pB = bufferA; + + /* Each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + // if-for implementation + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c new file mode 100644 index 000000000..cd9f78fd0 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_basic.c + * Description: Q7 version of convolution + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns ARM_MATH_SUCCESS + */ + +arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* Copying the pixel data to column */ + arm_q7_to_q15_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* Computation is filed for every 2 columns */ + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_y * dim_kernel_x) + { 
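+                /* Two complete im2col columns are now staged in bufferA, so
+                 * the 2x2 GEMM kernel below consumes both and writes two
+                 * output pixels across all output channels in one pass. */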
+ pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_y * dim_kernel_x, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + + /* Point to the beging of the im2col buffer */ + const q15_t *pB = bufferA; + + /* Each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 2; + + while (colCnt) + { + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + (void)bufferA; + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c new file mode 100644 index 000000000..bd9959f2c --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c @@ -0,0 +1,380 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_convolve_HWC_q7_fast.c
+ * Description:  Fast Q7 version of convolution
+ *
+ * $Date:        January 26, 2021
+ * $Revision:    V.1.0.2
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/**
+ * @brief Fast Q7 convolution function
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       wt          pointer to kernel weights
+ * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       bias        pointer to bias
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   bufferB     pointer to buffer space for output
+ * @return     The function returns either ARM_MATH_SIZE_MISMATCH or
+ *             ARM_MATH_SUCCESS, based on the outcome of size checking.
+ *
+ * @details
+ *
+ * Buffer size:
+ *
+ * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+ *
+ * bufferB size: 0
+ *
+ * Input dimension constraints:
+ *
+ * ch_im_in is a multiple of 4 (because of the SIMD32 read and swap)
+ *
+ * ch_im_out is a multiple of 2 (because of the 2x2 mat_mult kernel)
+ *
+ * The im2col step converts the Q7 tensor input into Q15 columns, which are
+ * stored in bufferA. Reordering happens during this im2col process via
+ * arm_q7_to_q15_reordered_no_shift: for every four elements, the second and
+ * third elements are swapped.
+ *
+ * The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered performs
+ * the GEMM computation with the reordered columns.
+ *
+ * To speed up the evaluation of the padding condition, we split the
+ * computation into 3x3 parts, i.e., {top, mid, bottom} x {left, mid, right}.
+ * This reduces the total number of boundary condition checks and improves
+ * the data copying performance.
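+ *
+ * A minimal, hypothetical usage sketch (the tensor sizes and shift values
+ * below are illustrative assumptions chosen to satisfy the constraints
+ * above, not values prescribed by this file):
+ * @code
+ *     // 16x16 input, 8 input channels, 16 filters of 3x3,
+ *     // padding 1, stride 1 -> 16x16 output
+ *     static q15_t bufferA[2 * 8 * 3 * 3]; // 2*ch_im_in*dim_kernel*dim_kernel
+ *     arm_status st = arm_convolve_HWC_q7_fast(in, 16, 8, wt, 16, 3, 1, 1,
+ *                                              bias, 0, 10, out, 16,
+ *                                              bufferA, NULL);
+ *     // a ch_im_in that is not a multiple of 4 (or an odd ch_im_out)
+ *     // would instead return ARM_MATH_SIZE_MISMATCH
+ * @endcode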
+ */ + +arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* + * Here we split the entire matrix into three regions depending on the padding situation + * Top: i_out_y from 0 to padding - 1 + * Middle: i_out_y from padding to dim_im_out-padding-1 + * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 + */ + + /* top part */ + for (i_out_y = 0; i_out_y < padding; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* middle part, here we also divide the x into left, mid and right */ + for (; i_out_y < dim_im_out - padding; i_out_y++) + { + + /* left part */ + for (i_out_x = 0; i_out_x < padding; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* mid part */ + for (; i_out_x < dim_im_out - padding; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + arm_q7_to_q15_reordered_no_shift((q7_t *)Im_in + + (i_ker_y * 
dim_im_in + i_out_x * stride - padding) * ch_im_in, + pBuffer, + ch_im_in * dim_kernel); + pBuffer += ch_im_in * dim_kernel; + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* right part */ + for (; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + for (; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* check if there is left-over for compute */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = bufferA; + /* each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad_reordered(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q7_t)__SSAT((sum >> out_shift), 8); + pOut++; + } + } +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < 
dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + // if-for implementation + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c new file mode 100644 index 000000000..6ad061b10 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c @@ -0,0 +1,378 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_fast_nonsquare.c + * Description: Fast Q7 version of convolution (non-sqaure shape) + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + +arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* ----------------------- + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* + * Here we split the entire matrix into three regions depending on the padding situation + * Top: i_out_y from 0 to padding - 1 + * Middle: i_out_y from padding to dim_im_out-padding-1 + * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 + */ + + /* top part */ + for (i_out_y = 0; i_out_y < padding_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y 
- padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* middle part, here we also divide the x into left, mid and right */ + for (; i_out_y < dim_im_out_y - padding_y; i_out_y++) + { + + /* left part */ + for (i_out_x = 0; i_out_x < padding_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* mid part */ + for (; i_out_x < dim_im_out_x - padding_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in, + pBuffer, + ch_im_in * dim_kernel_x); + pBuffer += ch_im_in * dim_kernel_x; + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* right part */ + for (; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter 
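(i.e. the im2col write pointer) is checked against the two-column mark:
   each output pixel deposits ch_im_in * dim_kernel_x * dim_kernel_y q15 entries
   into bufferA, so under the illustrative assumption of ch_im_in = 4 and a
   3x3 kernel the equality above holds after 2 * 4 * 3 * 3 = 72 entries.
   At that point arm_nn_mat_mult_kernel_q7_q15_reordered() computes two output
   pixels across all ch_im_out channels in one pass, and pBuffer gets a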
reset */
+ pBuffer = bufferA;
+ }
+ }
+ }
+
+ /* bottom part */
+ for (; i_out_y < dim_im_out_y; i_out_y++)
+ {
+ for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
+ {
+ /* This part implements the im2col function */
+ for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
+ i_ker_y++)
+ {
+ for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
+ i_ker_x++)
+ {
+ if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
+ {
+ /* arm_fill_q15(0, pBuffer, ch_im_in); */
+ memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
+ }
+ else
+ {
+ arm_q7_to_q15_reordered_no_shift(
+ (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
+ }
+ pBuffer += ch_im_in;
+ }
+ }
+
+ if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
+ {
+ pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
+ wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
+ /* counter reset */
+ pBuffer = bufferA;
+ }
+ }
+ }
+
+ /* check if there is a left-over column to compute */
+ if (pBuffer != bufferA)
+ {
+ const q7_t *pA = wt;
+ int i;
+ for (i = 0; i < ch_im_out; i++)
+ {
+ q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
+ const q15_t *pB = bufferA;
+ /* each iteration processes 4 entries */
+ uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
+
+ while (colCnt)
+ {
+
+ q31_t inA1, inA2;
+ q31_t inB1, inB2;
+
+ pA = read_and_pad_reordered(pA, &inA1, &inA2);
+
+ inB1 = arm_nn_read_q15x2_ia(&pB);
+ sum = __SMLAD(inA1, inB1, sum);
+ inB2 = arm_nn_read_q15x2_ia(&pB);
+ sum = __SMLAD(inA2, inB2, sum);
+
+ colCnt--;
+ }
+ colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3;
+ while (colCnt)
+ {
+ q7_t inA1 = *pA++;
+ q15_t inB1 = *pB++;
+ sum += inA1 * inB1;
+ colCnt--;
+ }
+ *pOut = (q7_t)__SSAT((sum >> out_shift), 8);
+ pOut++;
+ }
+ }
+
+#else
+ (void)bufferA;
+ /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+ int i, j, k, l, m, n;
+ int conv_out;
+ int in_row, in_col;
+
+ if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
+ {
+ /* check if the input dimension meets the constraints */
+ return ARM_MATH_SIZE_MISMATCH;
+ }
+
+ for (i = 0; i < ch_im_out; i++)
+ {
+ for (j = 0; j < dim_im_out_y; j++)
+ {
+ for (k = 0; k < dim_im_out_x; k++)
+ {
+ conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
+ for (m = 0; m < dim_kernel_y; m++)
+ {
+ for (n = 0; n < dim_kernel_x; n++)
+ {
+ /* if-for implementation */
+ in_row = stride_y * j + m - padding_y;
+ in_col = stride_x * k + n - padding_x;
+ if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
+ {
+ for (l = 0; l < ch_im_in; l++)
+ {
+ conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
+ wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
+ l];
+ }
+ }
+ }
+ }
+ Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
+ }
+ }
+ }
+
+#endif /* ARM_MATH_DSP */
+
+ /* Return to application */
+ return ARM_MATH_SUCCESS;
+}
+
+/**
+ * @} end of NNConv group
+ */
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
new file mode 100644
index 000000000..32b31d07a
--- /dev/null
+++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_convolve_s8.c
+ * Description: s8 version of convolution using symmetric quantization.
+ *
+ * $Date: January 26, 2021
+ * $Revision: V.2.0.4
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/*
+ * Basic s8 convolution function.
+ *
+ * Refer header file for details. The optimal use case for the DSP/MVE implementation is when input and output channels
+ * are multiples of 4 or at least greater than 4.
+ *
+ */
+
+arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
+ const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_per_channel_quant_params *quant_params,
+ const cmsis_nn_dims *input_dims,
+ const q7_t *input_data,
+ const cmsis_nn_dims *filter_dims,
+ const q7_t *filter_data,
+ const cmsis_nn_dims *bias_dims,
+ const int32_t *bias_data,
+ const cmsis_nn_dims *output_dims,
+ q7_t *output_data)
+{
+ (void)bias_dims;
+ q15_t *buffer_a = (q15_t *)ctx->buf;
+
+ const uint16_t input_batches = input_dims->n;
+ const uint16_t input_x = input_dims->w;
+ const uint16_t input_y = input_dims->h;
+ const uint16_t input_ch = input_dims->c;
+ const uint16_t kernel_x = filter_dims->w;
+ const uint16_t kernel_y = filter_dims->h;
+ const uint16_t output_x = output_dims->w;
+ const uint16_t output_y = output_dims->h;
+ const uint16_t output_ch = output_dims->c;
+
+ const uint16_t pad_x = conv_params->padding.w;
+ const uint16_t pad_y = conv_params->padding.h;
+ const uint16_t stride_x = conv_params->stride.w;
+ const uint16_t stride_y = conv_params->stride.h;
+
+ const int32_t input_offset = conv_params->input_offset;
+ const int32_t out_offset = conv_params->output_offset;
+ const int32_t out_activation_min = conv_params->activation.min;
+ const int32_t out_activation_max = conv_params->activation.max;
+ int32_t *output_mult = quant_params->multiplier;
+ int32_t *output_shift = quant_params->shift;
+
+ int i_batch;
+ for (i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+#if defined(ARM_MATH_MVEI)
+ /* Generate up to four columns from the input tensor for a GEMM computation */
+ q7_t *im2col_buf = (q7_t *)buffer_a;
+ q7_t *out = output_data;
+ int32_t buffer_fill_cnt = 0;
+ int32_t padded = 0;
+ const int32_t num_elem = kernel_x * kernel_y * input_ch;
+
+ /* This part implements the im2col function */
+ for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+ {
+ for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+ {
+ for (int i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y;
+ i_ker_y++)
+ {
+ for (int i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x;
+ i_ker_x++)
+ {
+ if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
+ {
+ memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch);
+ padded = 1;
+ }
+ else
+ {
+ arm_memcpy_q7(im2col_buf, input_data + (i_ker_y * input_x + i_ker_x) * input_ch, input_ch);
+ }
+ im2col_buf += input_ch;
+ }
+ }
+
+ buffer_fill_cnt++;
+
+ /* Computation is performed for every 4 columns */
+ if (buffer_fill_cnt == 4 && (padded == 0))
+ {
+ buffer_fill_cnt = 0;
+ for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
+ {
+ int32_t sum_row;
+ int32_t acc[4];
+
+ (void)arm_nn_mat_mul_core_4x_s8(
+ num_elem, num_elem, (q7_t *)buffer_a, filter_data + num_elem * i_out_ch, &sum_row, acc);
+ int32x4_t s_offset = vdupq_n_s32(sum_row);
+
+ int32x4_t res = vldrwq_s32(acc);
+ s_offset = vmulq_n_s32(s_offset, input_offset);
+ if (bias_data)
+ {
+ res = vaddq_n_s32(res, bias_data[i_out_ch]);
+ }
+ res = vaddq_s32(res, s_offset);
+ res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
+ res = vaddq_n_s32(res, out_offset);
+
+ res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
+ res = vminq_s32(res, vdupq_n_s32(out_activation_max));
+
+ const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
+ vstrbq_scatter_offset_s32(out, scatter_offset, res);
+ out++;
+ }
+ out += (3 * output_ch);
+ im2col_buf = (q7_t *)buffer_a;
+ }
+ else if (buffer_fill_cnt == 4 && (padded != 0))
+ {
+ buffer_fill_cnt = 0;
+ out = arm_nn_mat_mult_s8(filter_data,
+ (q7_t *)buffer_a,
+ output_ch,
+ 4,
+ output_shift,
+ output_mult,
+ out_offset,
+ input_offset,
+ 0,
+ out_activation_min,
+ out_activation_max,
+ num_elem,
+ bias_data,
+ out);
+
+ im2col_buf = (q7_t *)buffer_a;
+ padded = 0;
+ }
+ }
+ }
+ /* Handle left-over columns */
+ if (buffer_fill_cnt != 0)
+ {
+ out = arm_nn_mat_mult_s8(filter_data,
+ (q7_t *)buffer_a,
+ output_ch,
+ buffer_fill_cnt,
+ output_shift,
+ output_mult,
+ out_offset,
+ input_offset,
+ 0,
+ out_activation_min,
+ out_activation_max,
+ num_elem,
+ bias_data,
+ out);
+ }
+
+#elif defined(ARM_MATH_DSP)
+ int32_t i_out_y, i_out_x, i_ker_y, i_ker_x;
+
+ /* Generate two columns from the input tensor for a GEMM computation */
+ q15_t *two_column_buf = buffer_a;
+ q7_t *out = output_data;
+
+ /* This part implements the im2col function */
+ for (i_out_y = 0; i_out_y < output_y; i_out_y++)
+ {
+ for (i_out_x = 0; i_out_x < output_x; i_out_x++)
+ {
+ for (i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y; i_ker_y++)
+ {
+ for (i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x;
+ i_ker_x++)
+ {
+ if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
+ {
+ /* Filling 0 for out-of-bound paddings */
+ memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
+ }
+ else
+ {
+ /* Copying the pixel data to column */
+ arm_q7_to_q15_with_offset(input_data + (i_ker_y * input_x + i_ker_x) * input_ch,
+ two_column_buf,
+ input_ch,
+ input_offset);
+ }
+ two_column_buf += input_ch;
+ }
+ }
+
+ /* Computation is performed for every 2 columns */
+ if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
+ {
+ out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
+ buffer_a,
+ output_ch,
+ output_shift,
+ output_mult,
+ out_offset,
+ out_activation_min,
+ out_activation_max,
+ input_ch * kernel_y * kernel_x,
+ bias_data,
+ out);
+
+ /* counter reset */
+ two_column_buf = buffer_a;
+ }
+ }
+ }
+
+ /* left-over because odd number of
output pixels */ + if (two_column_buf != buffer_a) + { + const q7_t *ker_a = filter_data; + int i; + + for (i = 0; i < output_ch; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = 0; + if (bias_data) + { + sum = bias_data[i]; + } + + /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ + const q15_t *ip_as_col = buffer_a; + + /* 4 multiply and accumulates are done in one loop. */ + uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; + + while (col_count) + { + q31_t ker_a1, ker_a2; + q31_t ip_b1, ip_b2; + + ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); + + ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a1, ip_b1, sum); + ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a2, ip_b2, sum); + + col_count--; + } + /* Handle left over mac */ + col_count = input_ch * kernel_y * kernel_x & 0x3; + while (col_count) + { + q7_t ker_a1 = *ker_a++; + q15_t ip_b1 = *ip_as_col++; + sum += ker_a1 * ip_b1; + col_count--; + } + + sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]); + sum += out_offset; + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + *out++ = (q7_t)sum; + } + } +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + (void)buffer_a; + int32_t i_out_ch, i_out_y, i_out_x, i_input_ch, i_ker_y, i_ker_x; + int32_t conv_out; + + for (i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + for (i_out_y = 0; i_out_y < output_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < output_x; i_out_x++) + { + conv_out = 0; + + const int32_t base_idx_y = stride_y * i_out_y - pad_y; + const int32_t base_idx_x = stride_x * i_out_x - pad_x; + + const int32_t ker_y_start = MAX(0, -base_idx_y); + const int32_t ker_x_start = MAX(0, -base_idx_x); + + const int32_t ker_y_end = MIN(kernel_y, input_y - base_idx_y); + const int32_t ker_x_end = MIN(kernel_x, input_x - base_idx_x); + + for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t in_row = base_idx_y + i_ker_y; + const int32_t in_col = base_idx_x + i_ker_x; + for (i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + conv_out += + (input_data[(in_row * input_x + in_col) * input_ch + i_input_ch] + input_offset) * + filter_data[i_out_ch * input_ch * kernel_y * kernel_x + + (i_ker_y * kernel_x + i_ker_x) * input_ch + i_input_ch]; + } + } + } + if (bias_data) + { + conv_out += bias_data[i_out_ch]; + } + conv_out = arm_nn_requantize(conv_out, output_mult[i_out_ch], output_shift[i_out_ch]); + conv_out += out_offset; + conv_out = MAX(conv_out, out_activation_min); + conv_out = MIN(conv_out, out_activation_max); + output_data[i_out_ch + (i_out_y * output_x + i_out_x) * output_ch] = (int8_t)conv_out; + } + } + } +#endif + /* Advance to the next batch */ + input_data += (input_x * input_y * input_ch); + output_data += (output_x * output_y * output_ch); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_DSP) + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c 
b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c new file mode 100644 index 000000000..55a65b583 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_wrapper_s8.c + * Description: s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in + * cmsis-nn to perform the convolution. + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Convolution layer + * + * Refer header file for details. + * + */ + +arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) && + (conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) && (filter_dims->h == 1)) + { + return arm_convolve_1x1_s8_fast(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) && + (input_dims->n == 1)) + { + return arm_convolve_1_x_n_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else + { + return arm_convolve_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } +} + +int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) && + (conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) && (filter_dims->h == 1)) + { + return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); + } + else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && 
(output_dims->w % 4 == 0) && + (input_dims->n == 1)) + { + return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims); + } + else + { + return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); + } +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c new file mode 100644 index 000000000..d5569b39b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_3x3_s8.c + * Description: Optimized s8 depthwise convolution function for channel + * multiplier of 1 and 3x3 kernel size. + * + * $Date: 09. October 2020 + * $Revision: V.2.0.1 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Optimized s8 depthwise convolution function with constraint that + * in_channel == out_channel and kernel_x == kernel_y == 3 with pads at most 1 + * + * Refer prototype header file for details. 
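 *
 * A minimal calling sketch follows; every dimension and offset below is an
 * illustrative assumption, not a value taken from this file, and ctx, quant,
 * bias_d and the data pointers are presumed to be set up elsewhere:
 *
 *     cmsis_nn_dims in_d  = {1, 16, 16, 8};   // {n, h, w, c}: in_ch == out_ch
 *     cmsis_nn_dims flt_d = {1, 3, 3, 8};     // kernel must be 3x3 here
 *     cmsis_nn_dims out_d = {1, 16, 16, 8};
 *     cmsis_nn_dw_conv_params p = { .padding = {1, 1}, .stride = {1, 1},
 *                                   .ch_mult = 1, .input_offset = 128,
 *                                   .output_offset = -128,
 *                                   .activation = {-128, 127} };
 *     // quant.multiplier / quant.shift point at per-channel arrays of 8 entries
 *     arm_status s = arm_depthwise_conv_3x3_s8(&ctx, &p, &quant, &in_d, in, &flt_d,
 *                                              wts, &bias_d, bias, &out_d, out);
 *     // violating the in_ch == out_ch, 3x3 or pad <= 1 constraints yields
 *     // ARM_MATH_SIZE_MISMATCH or ARM_MATH_ARGUMENT_ERROR instead of success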
+ * + */ + +arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + (void)ctx; + (void)bias_dims; + + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t input_ch = input_dims->c; + const int32_t output_ch = output_dims->c; + const int32_t pad_x = dw_conv_params->padding.w; + const int32_t pad_y = dw_conv_params->padding.h; + const int32_t stride_x = dw_conv_params->stride.w; + const int32_t stride_y = dw_conv_params->stride.h; + const int32_t *output_shift = quant_params->shift; + const int32_t *output_mult = quant_params->multiplier; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_offset = dw_conv_params->output_offset; + const int32_t input_offset = dw_conv_params->input_offset; + const int32_t output_activation_min = dw_conv_params->activation.min; + const int32_t output_activation_max = dw_conv_params->activation.max; + + /* Check input constraints input_ch == output_ch */ + if (input_ch != output_ch) + { + return ARM_MATH_SIZE_MISMATCH; + } + /* Check input constraints pad_x <= 1 */ + if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + int32_t in_ch = 0; + int32_t ker_w_start = MAX(0, -in_w); + + for (; in_ch <= (input_ch - 4); in_ch += 4) + { + int32_t out_buff0 = bias[in_ch + 0]; + int32_t out_buff1 = bias[in_ch + 1]; + int32_t out_buff2 = bias[in_ch + 2]; + int32_t out_buff3 = bias[in_ch + 3]; + + const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch; + const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) + { + int32_t in_val = 0; + int32_t ker_val = 0; + + if (ker_w_start == 0) + { + in_val = arm_nn_read_q7x4(input_ptr); + ker_val = arm_nn_read_q7x4(kernel_ptr); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); + } + + in_val = arm_nn_read_q7x4(input_ptr + input_ch); + ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); + + if ((input_x - in_w) >= 3) + { + in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1)); + ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1)); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + 
input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); + } + + input_ptr += (input_ch * input_x); + kernel_ptr += (input_ch * 3); + } + + out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]); + out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]); + out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]); + out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]); + + out_buff0 += output_offset; + out_buff1 += output_offset; + out_buff2 += output_offset; + out_buff3 += output_offset; + + out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max); + out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max); + out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max); + out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max); + + output[out_idx++] = (int8_t)out_buff0; + output[out_idx++] = (int8_t)out_buff1; + output[out_idx++] = (int8_t)out_buff2; + output[out_idx++] = (int8_t)out_buff3; + } + + // Leftover + for (; in_ch < input_ch; ++in_ch) + { + int32_t out_buff = bias[in_ch]; + + const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch; + const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) + { + if (ker_w_start == 0) + { + out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr); + } + + out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch); + + if ((input_x - in_w) >= 3) + { + out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1)); + } + + input_ptr += (input_ch * input_x); + kernel_ptr += (input_ch * 3); + } + + out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]); + out_buff += output_offset; + out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max); + output[out_idx++] = (int8_t)out_buff; + } + } + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c new file mode 100644 index 000000000..980812798 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c @@ -0,0 +1,305 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_s8.c + * Description: s8 version of depthwise convolution. + * + * $Date: 11. May 2021 + * $Revision: V.2.5.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +static void depthwise_conv_s8_mult_4(const int8_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const int8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + int8_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; + ++in_ch, out_ch += ch_mult) + { + for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4) + { + int32_t out_buff[4]; + + out_buff[0] = bias[out_ch + 0 + mult_tile]; + out_buff[1] = bias[out_ch + 1 + mult_tile]; + out_buff[2] = bias[out_ch + 2 + mult_tile]; + out_buff[3] = bias[out_ch + 3 + mult_tile]; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h) + { + int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch; + int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch; + + for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); + ++ker_w, ker_idx += output_ch) + { + int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset; + out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile]; + out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile]; + out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile]; + out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile]; + } + } +#if defined(ARM_MATH_MVEI) + (void)out_idx; + int32x4_t res = vldrwq_s32(out_buff); + res = arm_requantize_mve_32x4(res, + vldrwq_s32(&output_mult[out_ch + mult_tile]), + vldrwq_s32(&output_shift[out_ch + mult_tile])); + res = vaddq_n_s32(res, output_offset); + + res = vmaxq_s32(res, vdupq_n_s32(output_activation_min)); + res = vminq_s32(res, vdupq_n_s32(output_activation_max)); + vstrbq_s32(output, res); + output += 4; +#else + out_buff[0] = arm_nn_requantize( + out_buff[0], output_mult[out_ch + 0 + mult_tile], output_shift[out_ch + 0 + mult_tile]); + out_buff[1] = arm_nn_requantize( + out_buff[1], output_mult[out_ch + 1 + mult_tile], output_shift[out_ch + 1 + mult_tile]); + out_buff[2] = arm_nn_requantize( + out_buff[2], output_mult[out_ch + 2 + mult_tile], output_shift[out_ch + 2 + mult_tile]); + out_buff[3] = arm_nn_requantize( + out_buff[3], output_mult[out_ch + 3 + mult_tile], output_shift[out_ch + 3 + mult_tile]); + + out_buff[0] += output_offset; + out_buff[1] += output_offset; + out_buff[2] += output_offset; + out_buff[3] += output_offset; + + 
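/* At this point the four accumulators have been scaled from s32 back to
+ the s8 output scale with per-channel multiplier/shift pairs and moved
+ into the output zero-point frame; the MIN/MAX chain below clamps them
+ to the fused activation range (typically the full {-128, 127} span when
+ no narrower activation is fused - an illustration, not from this file). */
+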
out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max); + out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max); + out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max); + out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max); + + output[out_idx++] = (int8_t)out_buff[0]; + output[out_idx++] = (int8_t)out_buff[1]; + output[out_idx++] = (int8_t)out_buff[2]; + output[out_idx++] = (int8_t)out_buff[3]; + +#endif + } + } + } + } +} + +static void depthwise_conv_s8_generic(const q7_t *input, + const uint16_t input_batches, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const q7_t *kernel, + const uint16_t output_ch, + const uint16_t ch_mult, + const uint16_t kernel_x, + const uint16_t kernel_y, + const uint16_t pad_x, + const uint16_t pad_y, + const uint16_t stride_x, + const uint16_t stride_y, + const int32_t *bias, + q7_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + (void)output_ch; + int i_out = 0; + int i_batch; + + for (i_batch = 0; i_batch < input_batches; i_batch++) + { + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; + int32_t acc_0; + /* Condition for kernel start dimension: (base_idx_ + ker__start) >= 0 */ + const int ker_y_start = MAX(0, -base_idx_y); + const int ker_x_start = MAX(0, -base_idx_x); + /* Condition for kernel end dimension: (base_idx_ + ker__end) < input_ */ + const int ker_y_end = MIN(kernel_y, input_y - base_idx_y); + const int ker_x_end = MIN(kernel_x, input_x - base_idx_x); + acc_0 = bias[idx_out_ch]; + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y; + for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x; + int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch; + int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch; + + acc_0 += (input[idx_0] + input_offset) * kernel[ker_idx_0]; + } + } + + /* Requantize and clamp output to provided range */ + acc_0 = arm_nn_requantize(acc_0, output_mult[idx_out_ch], output_shift[idx_out_ch]); + acc_0 += output_offset; + acc_0 = MAX(acc_0, output_activation_min); + acc_0 = MIN(acc_0, output_activation_max); + + output[i_out++] = acc_0; + } + } + } + } + /* Advance to the next batch */ + input += (input_x * input_y * input_ch); + } +} + +/* + * Basic s8 depthwise convolution function. + * + * Refer header file for details. + * Optimization using DSP extension is not available for the generic case where channel multiplier is > 1. 
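 *
 * Dispatch below is driven purely by the channel multiplier: when
 * dw_conv_params->ch_mult is a multiple of 4 and there is a single batch,
 * the unrolled depthwise_conv_s8_mult_4() above is used, otherwise the
 * scalar depthwise_conv_s8_generic() runs. For example (values assumed for
 * illustration), a 1x8x8x4 input with ch_mult = 4 yields 16 output channels
 * via the unrolled path, while ch_mult = 3 on the same input falls back to
 * the generic loop.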
+ * + */ +arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + (void)dw_conv_params->dilation; + (void)bias_dims; + (void)ctx; + + if (dw_conv_params->ch_mult % 4 == 0 && input_dims->n == 1) + { + depthwise_conv_s8_mult_4(input, + input_dims->w, + input_dims->h, + input_dims->c, + kernel, + output_dims->c, + dw_conv_params->ch_mult, + filter_dims->w, + filter_dims->h, + dw_conv_params->padding.w, + dw_conv_params->padding.h, + dw_conv_params->stride.w, + dw_conv_params->stride.h, + bias, + output, + quant_params->shift, + quant_params->multiplier, + output_dims->w, + output_dims->h, + dw_conv_params->output_offset, + dw_conv_params->input_offset, + dw_conv_params->activation.min, + dw_conv_params->activation.max); + } + else + { + depthwise_conv_s8_generic(input, + input_dims->n, + input_dims->w, + input_dims->h, + input_dims->c, + kernel, + output_dims->c, + dw_conv_params->ch_mult, + filter_dims->w, + filter_dims->h, + dw_conv_params->padding.w, + dw_conv_params->padding.h, + dw_conv_params->stride.w, + dw_conv_params->stride.h, + bias, + output, + quant_params->shift, + quant_params->multiplier, + output_dims->w, + output_dims->h, + dw_conv_params->output_offset, + dw_conv_params->input_offset, + dw_conv_params->activation.min, + dw_conv_params->activation.max); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c new file mode 100644 index 000000000..d72b0996e --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c @@ -0,0 +1,428 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_s8_opt.c + * Description: Optimized s8 depthwise separable convolution function for + * channel multiplier of 1. + * + * $Date: January 26, 2021 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel + * + * Refer prototype header file for details. 
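 *
 * On DSP/MVE builds ctx->buf must point at scratch memory sized by the
 * matching query at the end of this file; a minimal calling sketch (the
 * allocator choice is an assumption - any sufficiently large buffer works,
 * and the query returns 0 on cores without either extension):
 *
 *     cmsis_nn_context ctx;
 *     ctx.size = arm_depthwise_conv_s8_opt_get_buffer_size(&input_dims, &filter_dims);
 *     ctx.buf  = malloc(ctx.size);
 *     arm_status s = arm_depthwise_conv_s8_opt(&ctx, &dw_params, &quant, &input_dims, in,
 *                                              &filter_dims, wts, &bias_dims, bias,
 *                                              &output_dims, out);
 *     free(ctx.buf);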
+ * + */ + +arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + + const int32_t input_ch = input_dims->c; + const int32_t output_ch = output_dims->c; + + /* Check input constraints input_ch == output_ch */ + if (input_ch != output_ch) + { + return ARM_MATH_SIZE_MISMATCH; + } +#ifdef ARM_MATH_DSP + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t kernel_y = filter_dims->h; + const int32_t pad_x = dw_conv_params->padding.w; + const int32_t pad_y = dw_conv_params->padding.h; + const int32_t stride_x = dw_conv_params->stride.w; + const int32_t stride_y = dw_conv_params->stride.h; + const int32_t *output_shift = quant_params->shift; + const int32_t *output_mult = quant_params->multiplier; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_offset = dw_conv_params->output_offset; + const int32_t input_offset = dw_conv_params->input_offset; + const int32_t output_activation_min = dw_conv_params->activation.min; + const int32_t output_activation_max = dw_conv_params->activation.max; + q15_t *buffer_a = (q15_t *)ctx->buf; + +#ifdef ARM_MATH_MVEI + (void)bias_dims; + /* Generate two columns from the input tensor */ + q7_t *lhs_buffer = (q7_t *)buffer_a; + q7_t *out = output; + int padded = 0; + int buffer_count = 0; + const int32_t kernel_size = kernel_x * kernel_y; + + /* This part implements the im2col function */ + for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++) + { + for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++) + { + for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++) + { + for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) + { + arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)input_ch); + padded = 1; + } + else + { + arm_memcpy_q7(lhs_buffer, input + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)input_ch); + } + lhs_buffer += input_ch; + } + } + buffer_count++; + + if (buffer_count == 4) + { + lhs_buffer = (q7_t *)buffer_a; + if (padded == 0) + { + out = arm_nn_depthwise_conv_nt_t_s8(lhs_buffer, + kernel, + input_offset, + input_ch, + output_shift, + output_mult, + output_offset, + output_activation_min, + output_activation_max, + kernel_size, + bias, + out); + } + else + { + out = arm_nn_depthwise_conv_nt_t_padded_s8(lhs_buffer, + kernel, + input_offset, + input_ch, + output_shift, + output_mult, + output_offset, + output_activation_min, + output_activation_max, + kernel_size, + bias, + out); + padded = 0; + } + buffer_count = 0; + } + } + } + + /* Handle left over buffers */ + lhs_buffer = (q7_t *)buffer_a; + + for (int i_buf = 0; i_buf < buffer_count; i_buf++) + { + int32_t loop_count = (input_ch + 3) / 4; + + int32_t num_ch_to_process = input_ch; + for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++) + { + const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset; + const int8_t 
*row_0 = kernel + offset;
+ int32x4_t out_0 = vldrwq_s32(&bias[offset]);
+
+ for (int i_ker = 0; i_ker < kernel_size; i_ker++)
+ {
+ const int32x4_t ker_0 = vldrbq_s32(row_0);
+
+ int32x4_t ip_0 = vldrbq_s32(col_0);
+ ip_0 = vaddq_n_s32(ip_0, input_offset);
+ out_0 += vmulq_s32(ip_0, ker_0);
+
+ col_0 += input_ch;
+ row_0 += input_ch;
+ }
+
+ const int32x4_t mult = vldrwq_s32(&output_mult[offset]);
+ const int32x4_t shift = vldrwq_s32(&output_shift[offset]);
+
+ out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
+ out_0 = vaddq_n_s32(out_0, output_offset);
+ out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min));
+ out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max));
+ mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process);
+ vstrbq_p_s32(out, out_0, p);
+
+ out += 4;
+ }
+
+ const int tail_ch = input_ch & 0x3;
+ if (tail_ch != 0)
+ {
+ out -= (4 - tail_ch);
+ }
+ }
+
+#else // ARM_MATH_DSP
+ (void)bias_dims;
+ /* Run the following code in cores using DSP extension */
+ q15_t *const col_buffer_start = buffer_a;
+ q15_t *col_buffer = col_buffer_start;
+ const int32_t *const bias_start_pos = bias;
+ const q31_t *const out_mult_start_pos = output_mult;
+ const q31_t *const out_shift_start_pos = output_shift;
+ uint16_t row_count;
+ uint16_t row_shift;
+
+ for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+ {
+ const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
+ for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+ {
+ const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
+
+ /* Out of bounds is only considered for the y axis as it provides a more contiguous zero'ing
+ opportunity than along the x axis */
+ const int ker_y_start = MAX(0, -base_idx_y);
+ /* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */
+ const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
+
+ int32_t index = 0;
+ if (ker_y_start != 0)
+ {
+ memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t));
+ index += (kernel_x * input_ch) * ker_y_start;
+ }
+
+ for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
+ {
+ const int32_t idx_y = base_idx_y + i_ker_y;
+
+ for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
+ {
+ const int32_t idx_x = base_idx_x + i_ker_x;
+ if (idx_x < 0 || idx_x >= input_x)
+ {
+ memset(&col_buffer[index], 0, input_ch * sizeof(q15_t));
+ }
+ else
+ {
+ arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch,
+ &col_buffer[index],
+ input_ch,
+ input_offset);
+ }
+ index += input_ch;
+ }
+ }
+
+ const int diff = kernel_y - ker_y_end;
+ if (diff != 0)
+ {
+ memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t));
+ }
+
+ row_count = output_ch / 4;
+ row_shift = 0;
+ bias = bias_start_pos;
+ output_mult = out_mult_start_pos;
+ output_shift = out_shift_start_pos;
+
+ while (row_count)
+ {
+ q31_t sum = *bias++;
+ q31_t sum_2 = *bias++;
+ q31_t sum_3 = *bias++;
+ q31_t sum_4 = *bias++;
+
+ uint16_t col_count = (kernel_x * kernel_y) / 2;
+ q15_t *col_pos = col_buffer_start + row_shift;
+ const q7_t *row_pos = kernel + row_shift;
+ row_shift += 4;
+
+ while (col_count)
+ {
+ /* General idea is to read 4 + 4 (input, kernel) pairs and re-arrange them in the right order to
+ use in a SMLAD instruction. One run of this loop produces 4 partial outputs with 8 MACs. */
+ /* Note: variable names can be improved here to align with rows and columns.
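The lane gymnastics work as follows (a sketch derived from the documented
+ __SXTB16/__PKHBT semantics, not spelled out in this file): __SXTB16
+ sign-extends bytes 0 and 2 of a q7x4 word into two q15 lanes, while
+ __SXTB16(__ROR(x, 8)) picks up bytes 1 and 3, so four q7 weights become
+ two q31 words each packing two q15 values; __PKHBT/__PKHTB then pair the
+ matching q15 input lanes so that each of the four __SMLAD calls that
+ follow performs two MACs - 8 MACs per iteration in total, as stated above.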
*/ + q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c; + /* Read 4 weights */ + ip_b1 = arm_nn_read_q7x4(row_pos); + ip_a1 = arm_nn_read_q7x4(row_pos + input_ch); + op_a = arm_nn_read_q15x2(col_pos); + op_b = arm_nn_read_q15x2(col_pos + input_ch); + + ip_a2 = __SXTB16(ip_b1); + ip_b1 = __SXTB16(__ROR(ip_b1, 8)); + + ip_b2 = __SXTB16(ip_a1); + ip_a1 = __SXTB16(__ROR(ip_a1, 8)); + + op_c = __PKHBT(op_b, op_a, 16); + op_a = __PKHTB(op_b, op_a, 16); + op_b = __PKHBT(ip_b2, ip_a2, 16); + sum = __SMLAD(op_c, op_b, sum); + + op_b = __PKHBT(ip_b1, ip_a1, 16); + sum_2 = __SMLAD(op_a, op_b, sum_2); + + op_a = arm_nn_read_q15x2(col_pos + 2); + op_b = arm_nn_read_q15x2(col_pos + input_ch + 2); + + op_c = __PKHBT(op_b, op_a, 16); + op_a = __PKHTB(op_b, op_a, 16); + op_b = __PKHTB(ip_a2, ip_b2, 16); + sum_3 = __SMLAD(op_c, op_b, sum_3); + + op_b = __PKHTB(ip_a1, ip_b1, 16); + sum_4 = __SMLAD(op_a, op_b, sum_4); + + row_pos += input_ch << 1; + col_pos += input_ch << 1; + col_count--; + } + + col_count = (kernel_x * kernel_y) & 0x1; + while (col_count) + { + sum += row_pos[0] * col_pos[0]; + sum_2 += row_pos[1] * col_pos[1]; + sum_3 += row_pos[2] * col_pos[2]; + sum_4 += row_pos[3] * col_pos[3]; + + row_pos += input_ch; + col_pos += input_ch; + + col_count--; + } + sum = arm_nn_requantize(sum, *output_mult++, *output_shift++); + sum += output_offset; + sum = MAX(sum, output_activation_min); + sum = MIN(sum, output_activation_max); + *output++ = (q7_t)sum; + + sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++); + sum_2 += output_offset; + sum_2 = MAX(sum_2, output_activation_min); + sum_2 = MIN(sum_2, output_activation_max); + *output++ = (q7_t)sum_2; + sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++); + sum_3 += output_offset; + sum_3 = MAX(sum_3, output_activation_min); + sum_3 = MIN(sum_3, output_activation_max); + *output++ = (q7_t)sum_3; + + sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++); + sum_4 += output_offset; + sum_4 = MAX(sum_4, output_activation_min); + sum_4 = MIN(sum_4, output_activation_max); + *output++ = (q7_t)sum_4; + + row_count--; + } + + row_count = output_ch & 0x3; + while (row_count) + { + q15_t *col_pos = col_buffer_start + row_shift; + const q7_t *row_pos = kernel + row_shift; + q31_t sum = *bias++; + const uint16_t col_count = (kernel_x * kernel_y); + row_shift += 1; + + for (int i = 0; i < col_count; i++) + { + sum += row_pos[i * input_ch] * col_pos[i * input_ch]; + } + sum = arm_nn_requantize(sum, *output_mult++, *output_shift++); + sum += output_offset; + sum = MAX(sum, output_activation_min); + sum = MIN(sum, output_activation_max); + *output++ = (q7_t)sum; + + row_count--; + } + + // clear counter and pointers + col_buffer = col_buffer_start; + } + } +#endif +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + return arm_depthwise_conv_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + kernel, + bias_dims, + bias, + output_dims, + output); +#endif /* ARM_MATH_MVEI | ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_MVEI) + /* The + 4 accounts for out of bounds read of the lhs buffers in the *_nt_t_* functions. 
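As a worked illustration (sizes assumed, not taken from this file): 8
+ channels with a 3x3 kernel need 2 * 8 * 3 * 3 * sizeof(int16_t) + 4 = 292
+ bytes of scratch under MVE, a single column of 8 * 3 * 3 * sizeof(int16_t)
+ = 144 bytes under the DSP extension, and no scratch at all otherwise.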
*/ + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t) + 4; +#elif defined(ARM_MATH_DSP) + return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c new file mode 100644 index 000000000..c9d0afc22 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_u8_basic_ver1.c + * Description: u8 depthwise convolution function + * + * $Date: 09. October 2020 + * $Revision: V.1.1.1 + * + * Target : Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +static void depthwise_conv_u8_mult_4(const uint8_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const uint8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + uint8_t *output, + const int32_t output_shift, + const int32_t output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; + ++in_ch, out_ch += ch_mult) + { + for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4) + { + int32_t out_buff[4]; + + out_buff[0] = 0; + out_buff[1] = 0; + out_buff[2] = 0; + out_buff[3] = 0; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h) + { + int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch; + int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch; + + for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); + ++ker_w, ker_idx += output_ch) + { + int32_t in_val = input[in_idx + ker_w * 
input_ch] + input_offset; + out_buff[0] += in_val * (kernel[ker_idx + 0 + mult_tile] + filter_offset); + out_buff[1] += in_val * (kernel[ker_idx + 1 + mult_tile] + filter_offset); + out_buff[2] += in_val * (kernel[ker_idx + 2 + mult_tile] + filter_offset); + out_buff[3] += in_val * (kernel[ker_idx + 3 + mult_tile] + filter_offset); + } + } + + if (bias != NULL) + { + out_buff[0] += bias[out_ch + 0 + mult_tile]; + out_buff[1] += bias[out_ch + 1 + mult_tile]; + out_buff[2] += bias[out_ch + 2 + mult_tile]; + out_buff[3] += bias[out_ch + 3 + mult_tile]; + } + out_buff[0] = arm_nn_requantize(out_buff[0], output_mult, output_shift); + out_buff[1] = arm_nn_requantize(out_buff[1], output_mult, output_shift); + out_buff[2] = arm_nn_requantize(out_buff[2], output_mult, output_shift); + out_buff[3] = arm_nn_requantize(out_buff[3], output_mult, output_shift); + + out_buff[0] += output_offset; + out_buff[1] += output_offset; + out_buff[2] += output_offset; + out_buff[3] += output_offset; + + out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max); + out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max); + out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max); + out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max); + + output[out_idx++] = (uint8_t)out_buff[0]; + output[out_idx++] = (uint8_t)out_buff[1]; + output[out_idx++] = (uint8_t)out_buff[2]; + output[out_idx++] = (uint8_t)out_buff[3]; + } + } + } + } +} + +static void depthwise_conv_u8_generic(const uint8_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const uint8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + uint8_t *output, + const int32_t output_shift, + const int32_t output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + (void)output_ch; + int i_out = 0; + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; + int32_t acc_0; + /* Condition for kernel start dimension: (base_idx_ + ker__start) >= 0 */ + const int ker_y_start = MAX(0, -base_idx_y); + const int ker_x_start = MAX(0, -base_idx_x); + /* Condition for kernel end dimension: (base_idx_ + ker__end) < input_ */ + const int ker_y_end = MIN(kernel_y, input_y - base_idx_y); + const int ker_x_end = MIN(kernel_x, input_x - base_idx_x); + acc_0 = 0; + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y; + for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x; + int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch; + int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch; + + acc_0 += (input[idx_0] + input_offset) * (kernel[ker_idx_0] + 
filter_offset); + } + } + if (bias != NULL) + { + acc_0 += bias[idx_out_ch]; + } + + /* Requantize and clamp output to provided range */ + acc_0 = arm_nn_requantize(acc_0, output_mult, output_shift); + acc_0 += output_offset; + acc_0 = MAX(acc_0, output_activation_min); + acc_0 = MIN(acc_0, output_activation_max); + + output[i_out++] = acc_0; + } + } + } + } +} + +/** + * @brief uint8 depthwise convolution function with asymmetric quantization + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_ch Channels in input tensor + * @param[in] kernel Pointer to kernel weights + * @param[in] kernel_x Width of kernel + * @param[in] kernel_y Height of kernel + * @param[in] ch_mult Channel multiplier, i.e., number of output channels per input channel + * @param[in] pad_x Padding sizes x + * @param[in] pad_y Padding sizes y + * @param[in] stride_x Convolution stride along the width + * @param[in] stride_y Convolution stride along the height + * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement. + * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement. + * @param[in] bias Pointer to optional bias values. If no bias is + * available, NULL is expected + * @param[in] input_offset Input tensor zero offset + * @param[in] filter_offset Kernel tensor zero offset + * @param[in] output_offset Output tensor zero offset + * @param[in,out] output Pointer to output tensor + * @param[in] output_x Width of output tensor + * @param[in] output_y Height of output tensor + * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255} + * @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255} + * @param[in] output_shift Amount of right-shift for output + * @param[in] output_mult Output multiplier for requantization + * @return The function returns one of the following + * ARM_MATH_SIZE_MISMATCH - Not supported dimension of tensors + * ARM_MATH_SUCCESS - Successful operation + * ARM_MATH_ARGUMENT_ERROR - Implementation not available + * + * + */ + +arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const uint8_t *kernel, + const uint16_t kernel_x, + const uint16_t kernel_y, + const int16_t ch_mult, + const int16_t pad_x, + const int16_t pad_y, + const int16_t stride_x, + const int16_t stride_y, + const int16_t dilation_x, + const int16_t dilation_y, + const int32_t *bias, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_offset, + uint8_t *output, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t output_shift, + const int32_t output_mult) +{ + (void)dilation_x; + (void)dilation_y; + + if (ch_mult % 4 == 0) + { + depthwise_conv_u8_mult_4(input, + input_x, + input_y, + input_ch, + kernel, + ch_mult * input_ch, + ch_mult, + kernel_x, + kernel_y, + pad_x, + pad_y, + stride_x, + stride_y, + bias, + output, + output_shift, + output_mult, + output_x, + output_y, + output_offset, + input_offset, + filter_offset, + output_activation_min, + output_activation_max); + } + else + { + depthwise_conv_u8_generic(input, + input_x, + input_y, + input_ch, + kernel, + ch_mult * input_ch, + ch_mult, + kernel_x, + kernel_y, + pad_x, + pad_y, + stride_x, + stride_y, + bias, + output, + output_shift, + output_mult, +
output_x, + output_y, + output_offset, + input_offset, + filter_offset, + output_activation_min, + output_activation_max); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c new file mode 100644 index 000000000..684dc1ef9 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_wrapper_s8.c + * Description: Wrapper API to select appropriate depthwise conv API based + * on dimensions. + * + * $Date: 11. May 2021 + * $Revision: V.1.0.3 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * s8 Depthwise conv wrapper function + * + * Refer header file for details. 
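+ *
+ * Note (a sketch inferred from the code below, not upstream documentation):
+ * for batch size 1 the dispatch logic reduces to
+ *
+ *   if (dw_conv_params->ch_mult == 1 && input_dims->n == 1)
+ *       3x3 kernel && padding.h <= 1 && !MVE -> arm_depthwise_conv_3x3_s8()
+ *       otherwise                            -> arm_depthwise_conv_s8_opt()
+ *   else
+ *                                            -> arm_depthwise_conv_s8()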
+ * + */ +arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *filter, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + arm_status status = ARM_MATH_SUCCESS; + if (1 == dw_conv_params->ch_mult && input_dims->n == 1) + { +#if !defined(ARM_MATH_MVEI) + if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1)) + { + status = arm_depthwise_conv_3x3_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + else +#endif + { + status = arm_depthwise_conv_s8_opt(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + } + else + { + status = arm_depthwise_conv_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + + /* Return to application */ + return status; +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)dw_conv_params; + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1) + { + size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims); + } + + return size; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c new file mode 100644 index 000000000..de0ef8fec --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c @@ -0,0 +1,422 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_separable_conv_HWC_q7.c + * Description: Q7 depthwise separable convolution function + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * Input dimension constraints: + * + * ch_im_in equals ch_im_out + * + * Implementation: + * There are 3 nested loops here: + * Inner loop: calculate each output value with MAC instruction over an accumulator + * Mid loop: loop over different output channels + * Outer loop: loop over different output (x, y) + */ + +arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x; + int16_t i_ker_y, i_ker_x; + q7_t *colBuffer = (q7_t *)bufferA; + q7_t *pBuffer = colBuffer; + const q7_t *pBias = bias; + q7_t *pOut = Im_out; + uint16_t rowCnt; + uint16_t row_shift; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* we first do im2col here */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q7(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, ch_im_in); + } + else + { + /* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + */ + memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, ch_im_in); + } + pBuffer +=
ch_im_in; + } + } + + /* we will do the computation here for each channel */ + rowCnt = ch_im_out >> 2; + row_shift = 0; + pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = (dim_kernel * dim_kernel) >> 1; + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + row_shift += 4; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHTB(opB, inB1, 16); + inB1 = __PKHBT(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHTB(opB, inA1, 16); + inA1 = __PKHBT(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum3 = __SMLAD(opA, opB, sum3); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum4 = __SMLAD(opA, opB, sum4); + colCnt--; + } +#else + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHBT(opB, inB1, 16); + inB1 = __PKHTB(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHBT(opB, inA1, 16); + inA1 = __PKHTB(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum4 = __SMLAD(opA, opB, sum4); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum3 = __SMLAD(opA, opB, sum3); + colCnt--; + } + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + +#ifndef ARM_MATH_BIG_ENDIAN + /* + * r0 r1 r2 r3 r4 r5 + * inA1, inA2, inB1, inB2, opA, opB + */ + + asm volatile("COL_LOOP_%=:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhtb r3, r5, r2, ASR #16\n" + "pkhbt r2, r2, r5, LSL #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhtb r1, r5, r0, ASR #16\n" + "pkhbt r0, r0, r5, LSL #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum], r4, r5, %[sum]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#else + /* + * r0 r1 r2 r3 r4 r5 + * inA1, inA2, inB1, inB2, opA, opB + */ + asm 
volatile("COL_LOOP_%=:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhbt r3, r5, r2, LSL #16\n" + "pkhtb r2, r2, r5, ASR #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhbt r1, r5, r0, LSL #16\n" + "pkhtb r0, r0, r5, ASR #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum], r4, r5, %[sum]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = (dim_kernel * dim_kernel) & 0x1; + while (colCnt) + { + union arm_nnword inA, inB; + inA.word = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inB.word = arm_nn_read_q7x4(pB); + pB += ch_im_in; + sum += inA.bytes[0] * inB.bytes[0]; + sum2 += inA.bytes[1] * inB.bytes[1]; + sum3 += inA.bytes[2] * inB.bytes[2]; + sum4 += inA.bytes[3] * inB.bytes[3]; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = ch_im_out & 0x3; + while (rowCnt) + { + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = (dim_kernel * dim_kernel); + + row_shift += 1; + + while (colCnt) + { + q7_t A1 = *pA; + q7_t B1 = *pB; + pA += ch_im_in; + pB += ch_im_in; + sum += A1 * B1; + + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + rowCnt--; + } + + /* clear counter and pointers */ + pBuffer = colBuffer; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y; + int conv_out; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) + { + // for each output + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++) + { + for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++) + { + int in_row = stride * i_out_y + i_ker_y - padding; + int in_col = stride * i_out_x + i_ker_x - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} 
end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c new file mode 100644 index 000000000..9cf89b303 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_separable_conv_HWC_q7_nonsquare.c + * Description: Q7 depthwise separable convolution function (non-square shape) + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
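+ *
+ * Note (an inference from the square variant documented earlier in this
+ * patch, so treat it as an assumption): bufferA should provide
+ * 2*ch_im_in*dim_kernel_x*dim_kernel_y bytes for the im2col staging area,
+ * and bufferB is unused.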
+ * + * This function is the version with the full list of optimization tricks, but with + * some constraints: + * ch_im_in is equal to ch_im_out + * + */ + +arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + + (void)bufferB; + +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* + * Implementation: + * There are 3 nested loops here: + * Inner loop: calculate each output value with MAC instruction over an accumulator + * Mid loop: loop over different output channels + * Outer loop: loop over different output (x, y) + * + */ + + int16_t i_out_y, i_out_x; + int16_t i_ker_y, i_ker_x; + q7_t *colBuffer = (q7_t *)bufferA; + q7_t *pBuffer = colBuffer; + const q7_t *pBias = bias; + q7_t *pOut = Im_out; + uint16_t rowCnt; + uint16_t row_shift; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* we first do im2col here */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q7(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, ch_im_in); + } + else + { + /* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* we will do the computation here for each channel */ + rowCnt = ch_im_out >> 2; + row_shift = 0; + pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1; + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + row_shift += 4; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHTB(opB, inB1, 16); + inB1 = __PKHBT(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHTB(opB, inA1, 16); + inA1 = __PKHBT(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum3 = __SMLAD(opA, opB, sum3); + opA =
__SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum4 = __SMLAD(opA, opB, sum4); + colCnt--; + } +#else + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHBT(opB, inB1, 16); + inB1 = __PKHTB(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHBT(opB, inA1, 16); + inA1 = __PKHTB(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum4 = __SMLAD(opA, opB, sum4); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum3 = __SMLAD(opA, opB, sum3); + colCnt--; + } + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + +#ifndef ARM_MATH_BIG_ENDIAN + // r0 r1 r2 r3 r4 r5 + // inA1, inA2, inB1, inB2, opA, opB + asm volatile("COL_LOOP:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhtb r3, r5, r2, ASR #16\n" + "pkhbt r2, r2, r5, LSL #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhtb r1, r5, r0, ASR #16\n" + "pkhbt r0, r0, r5, LSL #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum], r4, r5, %[sum]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#else + // r0 r1 r2 r3 r4 r5 + // inA1, inA2, inB1, inB2, opA, opB + asm volatile("COL_LOOP:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhbt r3, r5, r2, LSL #16\n" + "pkhtb r2, r2, r5, ASR #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhbt r1, r5, r0, LSL #16\n" + "pkhtb r0, r0, r5, ASR #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum], r4, r5, %[sum]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#endif /*ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = (dim_kernel_x * dim_kernel_y) & 0x1; + while (colCnt) + { + union arm_nnword inA, inB; + inA.word = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inB.word = 
arm_nn_read_q7x4(pB); + pB += ch_im_in; + sum += inA.bytes[0] * inB.bytes[0]; + sum2 += inA.bytes[1] * inB.bytes[1]; + sum3 += inA.bytes[2] * inB.bytes[2]; + sum4 += inA.bytes[3] * inB.bytes[3]; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = ch_im_out & 0x3; + while (rowCnt) + { + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = (dim_kernel_x * dim_kernel_y); + + row_shift += 1; + + while (colCnt) + { + q7_t A1 = *pA; + q7_t B1 = *pB; + pA += ch_im_in; + pB += ch_im_in; + sum += A1 * B1; + + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + rowCnt--; + } + + // clear counter and pointers + pBuffer = colBuffer; + } + } + +#else + (void)bufferA; + + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i_out_y, i_out_x, i_ch_out; + int i_ker_y, i_ker_x; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) + { + // for each output + int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++) + { + for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++) + { + int in_row = stride_y * i_out_y + i_ker_y - padding_y; + int in_col = stride_x * i_out_x + i_ker_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c new file mode 100644 index 000000000..481eeba6e --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_depthwise_conv_s8_core.c + * Description: Depthwise convolution on im2col buffers. + * + * $Date: 09. October 2020 + * $Revision: V.1.0.4 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/* + * Depthwise conv on an im2col buffer where the input channel equals + * output channel. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, + const q15_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + int32_t ch_per_loop = num_ch / 4; + + const int32_t *bias = output_bias; + int8_t *out_tmp = out; + + int32_t idx = 0; + + while (ch_per_loop > 0) + { + int32x4_t ip_0; + int32x4_t ip_1; + int32_t ker_loop = kernel_size / 3; + int32x4_t out_0 = vldrwq_s32(bias); + int32x4_t out_1 = out_0; + bias += 4; + + const int32_t offset = idx * 4; + const int8_t *row_0 = row + offset; + const int16_t *col_0 = col + offset; + const int16_t *col_1 = col + kernel_size * num_ch + offset; + + int32x4_t ker_0 = vldrbq_s32(row_0); + + while (ker_loop > 0) + { + const int8_t *row_1 = row_0 + num_ch; + const int8_t *row_2 = row_0 + 2 * num_ch; + const int32x4_t ker_1 = vldrbq_s32(row_1); + const int32x4_t ker_2 = vldrbq_s32(row_2); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_0); + out_1 += vmulq_s32(ip_1, ker_0); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_1); + out_1 += vmulq_s32(ip_1, ker_1); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_2); + out_1 += vmulq_s32(ip_1, ker_2); + row_0 += 3 * num_ch; + + ker_0 = vldrbq_s32(row_0); + ker_loop--; + } + + idx++; + /* Handle tail kernel elements */ + ker_loop = kernel_size - ((kernel_size / 3) * 3); + while (ker_loop > 0) + { + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + + out_0 += vmulq_s32(ip_0, ker_0); + out_1 += vmulq_s32(ip_1, ker_0); + + col_0 += num_ch; + col_1 += num_ch; + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + + row_0 += num_ch; + ker_0 = vldrbq_s32(row_0); + ker_loop--; + } + const int32x4_t mult = vldrwq_s32(out_mult); + const int32x4_t shift = vldrwq_s32(out_shift); + out_mult += 4; + out_shift += 4; + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_1 = arm_requantize_mve_32x4(out_1, mult, shift); + + out_0 = vaddq_n_s32(out_0, out_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max)); + vstrbq_s32(out_tmp, out_0); + + out_1 = vaddq_n_s32(out_1, out_offset); + out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min)); + out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max)); + vstrbq_s32(out_tmp + num_ch, out_1); + + out_tmp += 4; + ch_per_loop--; + } + + int32_t tail_ch = num_ch & 3; + if (tail_ch != 0) + { + int32_t ch_idx = (num_ch & ~3); + int32x4_t col_0_sum; + int32x4_t col_1_sum; + + const int32_t single_buffer_size = kernel_size * num_ch; + for (int i = 0; i < tail_ch; i++) + { + 
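/* Note: the tail channels (num_ch % 4) are accumulated with plain scalar MACs; the partial sums are then written into vector lanes so the predicated requantize/clamp/store below can reuse the vector path. */ +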
const int16_t *col_pos_0 = col + ch_idx; + const int16_t *col_pos_1 = col_pos_0 + single_buffer_size; + + const int8_t *row_pos = row + ch_idx; + int32_t sum_0 = bias[i]; + int32_t sum_1 = bias[i]; + + for (int j = 0; j < kernel_size; j++) + { + const int8_t row_val = row_pos[j * num_ch]; + sum_0 += row_val * col_pos_0[j * num_ch]; + sum_1 += row_val * col_pos_1[j * num_ch]; + } + col_0_sum[i] = sum_0; + col_1_sum[i] = sum_1; + + ch_idx++; + } + const mve_pred16_t p = vctp32q((uint32_t)tail_ch); + const int32x4_t mult = vldrwq_z_s32(out_mult, p); + const int32x4_t shift = vldrwq_z_s32(out_shift, p); + + col_0_sum = arm_requantize_mve_32x4(col_0_sum, mult, shift); + col_1_sum = arm_requantize_mve_32x4(col_1_sum, mult, shift); + + col_0_sum = vaddq_n_s32(col_0_sum, out_offset); + col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min)); + col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out_tmp, col_0_sum, p); + + col_1_sum = vaddq_n_s32(col_1_sum, out_offset); + col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min)); + col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p); + + out_tmp += tail_ch; + } + + return out_tmp + num_ch; +#else + (void)row; + (void)col; + (void)num_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)kernel_size; + (void)output_bias; + (void)out; + return NULL; +#endif +} diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c new file mode 100644 index 000000000..05c95b669 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_q7_q15.c + * Description: Matrix-multiplication function for convolution + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @brief Matrix-multiplication function for convolution. + * + * @details Refer to header file for details. 
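+ *
+ * Note (a reading of the loop structure below, not upstream documentation):
+ * each iteration produces a 2x2 output tile from two filter rows (pA and
+ * pA2 = pA + numCol_A) and two im2col columns (pB and pB2 = pB + numCol_A),
+ * so sum, sum2, sum3 and sum4 hold out[r][c], out[r][c+1], out[r+1][c] and
+ * out[r+1][c+1] before the shift and saturation.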
+ * + */ + +q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut) +{ +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *pOut2 = pOut + ch_im_out; + const q7_t *pBias = bias; + + uint16_t rowCnt = ch_im_out >> 1; + /* this loop over rows in A */ + while (rowCnt) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* align the second pointer for A */ + const q7_t *pA2 = pA + numCol_A; + + /* init the sum with bias */ + q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA11, inA12, inA21, inA22; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad(pA, &inA11, &inA12); + pA2 = read_and_pad(pA2, &inA21, &inA22); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + sum3 = __SMLAD(inA21, inB1, sum3); + sum4 = __SMLAD(inA21, inB2, sum4); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + sum3 = __SMLAD(inA22, inB1, sum3); + sum4 = __SMLAD(inA22, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q7_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + /* skip the row computed with A2 */ + pA += numCol_A; + rowCnt--; + } /* for over ch_im_out */ + + /* compute left-over row if any */ + if (ch_im_out & 0x1) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* load the bias */ + q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + while (colCnt) + { + q31_t inA11, inA12; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad(pA, &inA11, &inA12); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + + colCnt--; + } + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + } + + pOut += ch_im_out; + + /* return the new output pointer with offset */ + return pOut; +#else + (void)pA; + (void)pInBuffer; + (void)ch_im_out; + (void)numCol_A; + (void)bias_shift; + (void)out_shift; + (void)bias; + (void)pOut; + /* To be completed */ + 
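/* Note: no plain C fallback is provided here; the NULL return signals that this kernel is unavailable without ARM_MATH_DSP, and callers are expected to use the basic convolution path instead. */ +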
return NULL; +#endif /* ARM_MATH_DSP */ +} diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c new file mode 100644 index 000000000..0870ac320 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_q7_q15_reordered.c + * Description: Matrix-multiplication function for convolution with reordered columns + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @brief Matrix-multiplication function for convolution with re-ordered input. + * + * @details Refer to header file for details. 
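+ *
+ * Note (an assumption based on the use of read_and_pad_reordered() below):
+ * the q7 weights are expected in a pre-shuffled byte order so that the two
+ * SXTB16 extractions -- bytes {0,2} and bytes {1,3} of each 32-bit word --
+ * already pair up with the sequential q15 activations, avoiding the
+ * PKHTB/PKHBT fix-up that read_and_pad() has to perform.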
+ * + */ + +q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut) +{ + +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *pOut2 = pOut + ch_im_out; + int i; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* align the second pointer for A */ + const q7_t *pA2 = pA + numCol_A; + + /* init the sum with bias */ + q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA11, inA12, inA21, inA22; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad_reordered(pA, &inA11, &inA12); + pA2 = read_and_pad_reordered(pA2, &inA21, &inA22); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + sum3 = __SMLAD(inA21, inB1, sum3); + sum4 = __SMLAD(inA21, inB2, sum4); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + sum3 = __SMLAD(inA22, inB1, sum3); + sum4 = __SMLAD(inA22, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q7_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + /* skip the row computed with A2 */ + pA += numCol_A; + } /* for over ch_im_out */ + + pOut += ch_im_out; + + /* return the new output pointer with offset */ + return pOut; +#else + (void)pA; + (void)pInBuffer; + (void)ch_im_out; + (void)numCol_A; + (void)bias_shift; + (void)out_shift; + (void)bias; + (void)pOut; + /* To be completed */ + return NULL; +#endif /* ARM_MATH_DSP */ +} diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c new file mode 100644 index 000000000..cc54ca90b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c @@ -0,0 +1,391 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_s8_s16.c + * Description: Matrix-multiplication function for convolution + * + * $Date: 09. October 2020 + * $Revision: V.1.0.3 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/* + * Matrix-multiplication function for convolution with per-channel requantization. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0) +{ +#if defined(ARM_MATH_MVEI) +#define ROW_PER_LOOP (4) +#define COL_PER_LOOP (8) + + const q7_t *ip_a0_s8 = input_a; + q7_t *out_1 = out_0 + output_ch; + + const int32_t *bias = output_bias; + + int32_t row_count = output_ch / ROW_PER_LOOP; + + while (row_count) + { + const q15_t *ip_b0_s16 = input_b; + const q15_t *ip_b1_s16 = input_b + num_col_a; + + const q7_t *ip_a1_s8 = ip_a0_s8 + num_col_a; + const q7_t *ip_a2_s8 = ip_a0_s8 + num_col_a * 2; + const q7_t *ip_a3_s8 = ip_a0_s8 + num_col_a * 3; + + q31_t ch_0_out_n = bias[0]; + q31_t ch_1_out_n = bias[1]; + q31_t ch_2_out_n = bias[2]; + q31_t ch_3_out_n = bias[3]; + + q31_t ch_0_out_n1 = ch_0_out_n; + q31_t ch_1_out_n1 = ch_1_out_n; + q31_t ch_2_out_n1 = ch_2_out_n; + q31_t ch_3_out_n1 = ch_3_out_n; + bias += 4; + + int32_t col_count = num_col_a / COL_PER_LOOP; + + while (col_count) + { + // Load inputs + const int16x8_t ip_b0 = vld1q_s16(ip_b0_s16); + ip_b0_s16 += COL_PER_LOOP; + const int16x8_t ip_b1 = vld1q_s16(ip_b1_s16); + ip_b1_s16 += COL_PER_LOOP; + + // Load filters + const int16x8_t ip_a0 = vldrbq_s16(ip_a0_s8); + ip_a0_s8 += COL_PER_LOOP; + const int16x8_t ip_a1 = vldrbq_s16(ip_a1_s8); + ip_a1_s8 += COL_PER_LOOP; + const int16x8_t ip_a2 = vldrbq_s16(ip_a2_s8); + ip_a2_s8 += COL_PER_LOOP; + const int16x8_t ip_a3 = vldrbq_s16(ip_a3_s8); + ip_a3_s8 += COL_PER_LOOP; + + // MAC + ch_0_out_n += vmladavq_s16(ip_b0, ip_a0); + ch_1_out_n += vmladavq_s16(ip_b0, ip_a1); + ch_2_out_n += vmladavq_s16(ip_b0, ip_a2); + ch_3_out_n += vmladavq_s16(ip_b0, ip_a3); + ch_0_out_n1 += vmladavq_s16(ip_b1, ip_a0); + ch_1_out_n1 += vmladavq_s16(ip_b1, ip_a1); + ch_2_out_n1 += vmladavq_s16(ip_b1, ip_a2); + ch_3_out_n1 += vmladavq_s16(ip_b1, ip_a3); + + col_count--; + } + + /* Handle tail */ + col_count = (num_col_a & (COL_PER_LOOP - 1)) - 1; + while (col_count >= 0) + { + const int32_t b0 = ip_b0_s16[col_count]; + const int32_t b1 = ip_b1_s16[col_count]; + + ch_0_out_n += b0 * ip_a0_s8[col_count]; + ch_1_out_n += b0 * ip_a1_s8[col_count]; + ch_2_out_n += b0 * ip_a2_s8[col_count]; + ch_3_out_n += b0 * ip_a3_s8[col_count]; + + ch_0_out_n1 += b1 * ip_a0_s8[col_count]; + ch_1_out_n1 += b1 * ip_a1_s8[col_count]; + ch_2_out_n1 += b1 * ip_a2_s8[col_count]; + ch_3_out_n1 += b1 * ip_a3_s8[col_count]; + col_count--; + } + ip_a0_s8 += (num_col_a & (COL_PER_LOOP - 1)); + + int32x4_t out_vec_0; + int32x4_t out_vec_1; + out_vec_0[0] = ch_0_out_n; + out_vec_0[1] = ch_1_out_n; + out_vec_0[2] = ch_2_out_n; + out_vec_0[3] = ch_3_out_n; + + 
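/* Note: the eight scalar accumulators are gathered into two vector registers so that requantize, offset, clamp and store can each run as one MVE operation per output row. */ +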
out_vec_1[0] = ch_0_out_n1; + out_vec_1[1] = ch_1_out_n1; + out_vec_1[2] = ch_2_out_n1; + out_vec_1[3] = ch_3_out_n1; + + int32x4_t mult = vldrwq_s32(out_mult); + int32x4_t shift = vldrwq_s32(out_shift); + out_mult += ROW_PER_LOOP; + out_shift += ROW_PER_LOOP; + + out_vec_0 = arm_requantize_mve_32x4(out_vec_0, mult, shift); + out_vec_1 = arm_requantize_mve_32x4(out_vec_1, mult, shift); + + out_vec_0 = vaddq_n_s32(out_vec_0, out_offset); + out_vec_0 = vmaxq_s32(out_vec_0, vdupq_n_s32(activation_min)); + out_vec_0 = vminq_s32(out_vec_0, vdupq_n_s32(activation_max)); + vstrbq_s32(out_0, out_vec_0); + out_0 += ROW_PER_LOOP; + + out_vec_1 = vaddq_n_s32(out_vec_1, out_offset); + out_vec_1 = vmaxq_s32(out_vec_1, vdupq_n_s32(activation_min)); + out_vec_1 = vminq_s32(out_vec_1, vdupq_n_s32(activation_max)); + vstrbq_s32(out_1, out_vec_1); + out_1 += ROW_PER_LOOP; + row_count--; + ip_a0_s8 += (num_col_a * 3); + } + + row_count = output_ch & (ROW_PER_LOOP - 1); + + if (row_count) + { + ip_a0_s8 = input_a + num_col_a * (output_ch & ~3); + const mve_pred16_t p = vctp32q((uint32_t)row_count); + int32x4_t out_vec_0 = vdupq_n_s32(0); + int32x4_t out_vec_1 = vdupq_n_s32(0); + int32x4_t mult_tail; + int32x4_t shift_tail; + + for (int i_ch = 0; i_ch < row_count; i_ch++) + { + int32_t output_0 = bias[i_ch]; + int32_t output_1 = bias[i_ch]; + const q15_t *ip_b0_s16 = input_b; + const q15_t *ip_b1_s16 = input_b + num_col_a; + + for (int i_idx = 0; i_idx < num_col_a; i_idx++) + { + output_0 += ip_b0_s16[i_idx] * ip_a0_s8[i_idx]; + output_1 += ip_b1_s16[i_idx] * ip_a0_s8[i_idx]; + } + + ip_a0_s8 += num_col_a; + out_vec_0[i_ch] = output_0; + out_vec_1[i_ch] = output_1; + mult_tail[i_ch] = out_mult[i_ch]; + shift_tail[i_ch] = out_shift[i_ch]; + } + out_vec_0 = arm_requantize_mve_32x4(out_vec_0, mult_tail, shift_tail); + out_vec_1 = arm_requantize_mve_32x4(out_vec_1, mult_tail, shift_tail); + + out_vec_0 = vaddq_n_s32(out_vec_0, out_offset); + out_vec_0 = vmaxq_s32(out_vec_0, vdupq_n_s32(activation_min)); + out_vec_0 = vminq_s32(out_vec_0, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out_0, out_vec_0, p); + + out_vec_1 = vaddq_n_s32(out_vec_1, out_offset); + out_vec_1 = vmaxq_s32(out_vec_1, vdupq_n_s32(activation_min)); + out_vec_1 = vminq_s32(out_vec_1, vdupq_n_s32(activation_max)); + + vstrbq_p_s32(out_1, out_vec_1, p); + out_1 += row_count; + } + + return out_1; + +#elif defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *out_1 = out_0 + output_ch; + const int32_t *bias = output_bias; + + uint16_t row_count = output_ch / 2; + const q7_t *ip_a0 = input_a; + /* this loop over rows in A */ + while (row_count) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* align the second pointer for A */ + const q7_t *ip_a1 = ip_a0 + num_col_a; + + /* Init accumulator with bias for channel N and N + 1 */ + q31_t ch_0_out_0 = *bias; + q31_t ch_0_out_1 = *bias++; + q31_t ch_1_out_0 = *bias; + q31_t ch_1_out_1 = *bias++; + + uint16_t col_count = num_col_a / 4; + /* accumulate over the vector */ + while (col_count) + { + q31_t a01, a02, a11, a12; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad(ip_a0, &a01, &a02); + ip_a1 = read_and_pad(ip_a1, &a11, &a12); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = 
arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + + col_count--; + } /* while over col_count */ + col_count = num_col_a & 0x3; + while (col_count) + { + q7_t a0 = *ip_a0++; + q15_t b0 = *ip_b0++; + q7_t a1 = *ip_a1++; + q15_t b1 = *ip_b1++; + + ch_0_out_0 += a0 * b0; + ch_0_out_1 += a0 * b1; + ch_1_out_0 += a1 * b0; + ch_1_out_1 += a1 * b1; + col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + + ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift); + ch_1_out_0 += out_offset; + ch_1_out_0 = MAX(ch_1_out_0, activation_min); + ch_1_out_0 = MIN(ch_1_out_0, activation_max); + *out_0++ = (q7_t)ch_1_out_0; + + ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); + ch_1_out_1 += out_offset; + ch_1_out_1 = MAX(ch_1_out_1, activation_min); + ch_1_out_1 = MIN(ch_1_out_1, activation_max); + *out_1++ = (q7_t)ch_1_out_1; + out_mult++; + out_shift++; + + /* skip row */ + ip_a0 += num_col_a; + row_count--; + } + + /* compute the last odd numbered row if any */ + if (output_ch & 0x1) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* load the bias */ + q31_t ch_0_out_0 = *bias; + q31_t ch_0_out_1 = *bias++; + + uint16_t col_count = num_col_a >> 2; + while (col_count) + { + q31_t a01, a02; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad(ip_a0, &a01, &a02); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + + col_count--; + } + col_count = num_col_a & 0x3; + while (col_count) + { + q7_t a0 = *ip_a0++; + q15_t b0 = *ip_b0++; + q15_t b1 = *ip_b1++; + + ch_0_out_0 += a0 * b0; + ch_0_out_1 += a0 * b1; + col_count--; + } + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + } + + out_0 += output_ch; + + /* return the new output pointer with offset */ + return out_0; +#else + (void)input_a; + (void)input_b; + (void)output_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)num_col_a; + (void)output_bias; + (void)out_0; + /* To be completed */ + return NULL; +#endif +} diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c 
b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c new file mode 100644 index 000000000..842a1803f --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_s8_s16_reordered.c + * Description: Matrix-multiplication function for convolution with reordered columns + * + * $Date: 09. October 2020 + * $Revision: V.1.0.3 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/* + * Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel + * requantization. The re-ordering is a consequence of the sign extension performed by the SXTB16 instruction. + * + * Refer header file for details. This function differs from arm_nn_mat_mult_kernel_s8_s16() in that it uses + * read_and_pad_reordered() instead of read_and_pad(). Investigating the cycles impact and + * unifying these two functions is a potential future improvement.
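+ *
+ * As an illustrative sketch (hypothetical pointer name, not part of this
+ * patch), the SXTB16-based expansion of four packed q7 weights w3|w2|w1|w0
+ * performed by read_and_pad_reordered() behaves roughly like:
+ *
+ *     int32_t packed = arm_nn_read_q7x4_ia(&pW);    // pW: q7_t weight pointer
+ *     int32_t even   = __SXTB16(packed);            // q15 pair (w2, w0)
+ *     int32_t odd    = __SXTB16(__ROR(packed, 8));  // q15 pair (w3, w1)
+ *
+ * Consecutive weights therefore land in alternating halfword lanes, which is
+ * why the q15 input buffer must be re-ordered the same way before __SMLAD
+ * can pair matching operands.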
+ * + */ + +q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0) +{ +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *out_1 = out_0 + output_ch; + const int32_t *bias = output_bias; + + uint16_t row_count = output_ch / 2; + const q7_t *ip_a0 = input_a; + /* this loop over rows in A */ + while (row_count) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* align the second pointer for A */ + const q7_t *ip_a1 = ip_a0 + num_col_a; + + /* Init accumulator with bias for channel N and N + 1 */ + q31_t ch_0_out_0 = *bias; + q31_t ch_0_out_1 = *bias++; + q31_t ch_1_out_0 = *bias; + q31_t ch_1_out_1 = *bias++; + + uint16_t col_count = num_col_a / 4; + /* accumulate over the vector */ + while (col_count) + { + q31_t a01, a02, a11, a12; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02); + ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + + col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + + ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift); + ch_1_out_0 += out_offset; + ch_1_out_0 = MAX(ch_1_out_0, activation_min); + ch_1_out_0 = MIN(ch_1_out_0, activation_max); + *out_0++ = (q7_t)ch_1_out_0; + + ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); + ch_1_out_1 += out_offset; + ch_1_out_1 = MAX(ch_1_out_1, activation_min); + ch_1_out_1 = MIN(ch_1_out_1, activation_max); + *out_1++ = (q7_t)ch_1_out_1; + out_mult++; + out_shift++; + + /* skip row */ + ip_a0 += num_col_a; + row_count--; + } + + if (output_ch & 1) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* Init accumulator with bias for channel N + 1 */ + q31_t ch_0_out_0 = *bias; + q31_t ch_0_out_1 = ch_0_out_0; + + int32_t col_count = num_col_a / 4; + while (col_count) + { + q31_t a01, a02; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + + 
col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + } + + out_0 += output_ch; + + /* return the new output pointer with offset */ + return out_0; +#else + (void)input_a; + (void)input_b; + (void)output_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)num_col_a; + (void)output_bias; + (void)out_0; + /* To be completed */ + return NULL; +#endif +} diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c new file mode 100644 index 000000000..d9f404a35 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_s8.c + * Description: General Matrix-multiplication function + * + * $Date: 09. October 2020 + * $Revision: V.2.0.5 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/* + * s8 General matrix multiplication function with per-channel requantization for up to 4 column batches. + * + * Refer header file for details.
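+ *
+ * A minimal call sketch (all names and sizes below are illustrative
+ * assumptions, not taken from this patch):
+ *
+ *     // 8 output channels, one batch of 4 columns, rows of length 16
+ *     q7_t *next_out = arm_nn_mat_mult_s8(rows, cols, 8, 4,
+ *                                         shifts, mults, out_offset,
+ *                                         col_offset, row_offset, act_min,
+ *                                         act_max, 16, bias, out_buf);
+ *
+ * On MVE targets the function returns the advanced output pointer; in other
+ * builds it returns NULL.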
+ * + */ + +q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, + const q7_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t row_len, + const int32_t *const bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + (void)row_offset; + if (col_batches == 4) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t row_len_tmp = row_len; + const int8_t *ip_r0 = input_row + (i_out_ch * row_len); + const int8_t *ip_c0 = input_col; + const int8_t *ip_c1 = input_col + row_len; + const int8_t *ip_c2 = input_col + (2 * row_len); + const int8_t *ip_c3 = input_col + (3 * row_len); + + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + int32_t acc_3 = 0; + const int32_t row_loop_cnt = (row_len + 7) / 8; + + for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++) + { + mve_pred16_t p = vctp16q((uint32_t)row_len_tmp); + const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p); + row_len_tmp -= 8; + + int16x8_t r0 = vldrbq_z_s16(ip_r0, p); + ip_r0 += 8; + + int16x8_t c0 = vldrbq_z_s16(ip_c0, p); + ip_c0 += 8; + c0 = vaddq_m_s16(vuninitializedq_s16(), c0, offset, p); + + int16x8_t c1 = vldrbq_z_s16(ip_c1, p); + ip_c1 += 8; + c1 = vaddq_m_s16(vuninitializedq_s16(), c1, offset, p); + + int16x8_t c2 = vldrbq_z_s16(ip_c2, p); + ip_c2 += 8; + c2 = vaddq_m_s16(vuninitializedq_s16(), c2, offset, p); + + int16x8_t c3 = vldrbq_z_s16(ip_c3, p); + ip_c3 += 8; + c3 = vaddq_m_s16(vuninitializedq_s16(), c3, offset, p); + + acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p); + acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p); + acc_2 = vmladavaq_p_s16(acc_2, r0, c2, p); + acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p); + } + + int32x4_t res = {acc_0, acc_1, acc_2, acc_3}; + if (bias) + { + res = vaddq_n_s32(res, bias[i_out_ch]); + } + res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]); + res = vaddq_n_s32(res, out_offset); + + res = vmaxq_s32(res, vdupq_n_s32(activation_min)); + res = vminq_s32(res, vdupq_n_s32(activation_max)); + + const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3}; + vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res); + } + out += 4 * output_ch; + } + else + { + for (int i_col_batch = (col_batches & ~0x3); i_col_batch < (col_batches & 0x3); i_col_batch++) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t row_len_tmp = row_len; + + const int8_t *ip_r0 = input_row + (i_out_ch * row_len); + const int8_t *ip_c0 = input_col + (i_col_batch * row_len); + int32_t acc_0 = 0; + const int32_t row_loop_cnt = (row_len + 7) / 8; + + for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++) + { + const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp); + const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p); + row_len_tmp -= 8; + + int16x8_t r0 = vldrbq_z_s16(ip_r0, p); + ip_r0 += 8; + int16x8_t c0 = vldrbq_z_s16(ip_c0, p); + ip_c0 += 8; + + c0 = vaddq_m_s16(vuninitializedq_s16(), c0, offset, p); + acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p); + } + + if (bias) + { + acc_0 += bias[i_out_ch]; + } + acc_0 = arm_nn_requantize(acc_0, output_mult[i_out_ch], output_shift[i_out_ch]); + acc_0 += out_offset; + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + out[i_out_ch] = (q7_t)acc_0; + } + out += 
output_ch; + } + } + return out; + +#else + (void)input_row; + (void)input_col; + (void)output_ch; + (void)col_batches; + (void)output_shift; + (void)output_mult; + (void)out_offset; + (void)col_offset; + (void)row_offset; + (void)activation_min; + (void)activation_max; + (void)row_len; + (void)bias; + (void)out; + return NULL; +#endif +} diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c new file mode 100644 index 000000000..fa9f775b2 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c @@ -0,0 +1,197 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_mat_q7_vec_q15.c + * Description: Mixed Q15-Q7 fully-connected layer function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * vec_buffer size: 0 + * + * Q7_Q15 version of the fully connected layer + * + * Weights are in q7_t and Activations are in q15_t + * + */ + +arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q15_t *pOut, + q15_t *vec_buffer) +{ + (void)vec_buffer; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q7_t *pB = pM; + const q7_t *pB2; + q15_t *pO = pOut; + const q7_t *pBias = bias; + const q15_t *pA = pV; + + uint16_t rowCnt = num_of_rows >> 1; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + pB2 = pB + dim_vec; + + while (colCnt) + { + q31_t inV, inM11, inM12, inM21, 
inM22; + pB = read_and_pad(pB, &inM11, &inM12); + pB2 = read_and_pad(pB2, &inM21, &inM22); + + inV = arm_nn_read_q15x2_ia(&pA); + + sum = __SMLAD(inV, inM11, sum); + sum2 = __SMLAD(inV, inM21, sum2); + + inV = arm_nn_read_q15x2_ia(&pA); + + sum = __SMLAD(inV, inM12, sum); + sum2 = __SMLAD(inV, inM22, sum2); + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + q7_t inM2 = *pB2++; + + sum += inV * inM; + sum2 += inV * inM2; + colCnt--; + } /* while over colCnt */ + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16)); + + /*adjust the pointers and counters */ + pB += dim_vec; + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x1; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q31_t inV1, inV2, inM11, inM12; + + pB = read_and_pad(pB, &inM11, &inM12); + + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM11, sum); + + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM12, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + + rowCnt--; + } + +#else + int i, j; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + for (i = 0; i < num_of_rows; i++) + { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16); + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c new file mode 100644 index 000000000..2826ac5f6 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c @@ -0,0 +1,417 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_mat_q7_vec_q15_opt.c + * Description: Mixed Q15-Q7 opt fully-connected layer function + * + * $Date: 09. 
October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * vec_buffer size: 0 + * + * Q7_Q15 version of the fully connected layer + * + * Weights are in q7_t and Activations are in q15_t + * + * Limitation: x4 version requires weight reordering to work + * + * Here we use only one pointer to read 4 rows in the weight + * matrix. So if the original q7_t matrix looks like this: + * + * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | + * + * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | + * + * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | + * + * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | + * + * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | + * + * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | + * + * We operate on multiples of 4 rows, so the first four rows become + * + * | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 | + * + * | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 | + * + * | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 | + * + * The column left over will be in order, + * which is: + * | a17 | a27 | a37 | a47 | + * + * For the left-over rows, we do 1x1 computation, so the data remains + * in its original order.
+ * + * So the stored weight matrix looks like this: + * + * | a11 | a21 | a12 | a22 | a31 | a41 | + * + * | a32 | a42 | a13 | a23 | a14 | a24 | + * + * | a33 | a43 | a34 | a44 | a15 | a25 | + * + * | a16 | a26 | a35 | a45 | a36 | a46 | + * + * | a17 | a27 | a37 | a47 | a51 | a52 | + * + * | a53 | a54 | a55 | a56 | a57 | a61 | + * + * | a62 | a63 | a64 | a65 | a66 | a67 | + * + */ + +arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q15_t *pOut, + q15_t *vec_buffer) +{ + + (void)vec_buffer; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q7_t *pB = pM; + q15_t *pO = pOut; + const q7_t *pBias = bias; + const q15_t *pA = pV; + + uint16_t rowCnt = num_of_rows >> 2; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM11, inV, sum); + sum2 = __SMLAD(inM12, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM13, inV, sum3); + sum4 = __SMLAD(inM14, inV, sum4); + colCnt--; + } + +#else + + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = *__SIMD32(pA)++; + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM12, inV, sum); + sum2 = __SMLAD(inM11, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM14, inV, sum3); + sum4 = __SMLAD(inM13, inV, sum4); + colCnt--; + } + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + + /* + * register needed: + * loop counter: colCnt + * accumulators: sum, sum2, sum3, sum4 + * pointers: pB, pA + * weight data: inM11, inM12, inM13, inM14 + * activation data: inV + */ + +#ifndef ARM_MATH_BIG_ENDIAN + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #4\n" + "ldr.w r1, [%[pB]], #8\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r1, %[sum]\n" + "smlad %[sum2], r4, r0, %[sum2]\n" + "ldr.w r3, [%[pB], #-4]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r3, %[sum3]\n" + "smlad %[sum4], r4, r2, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); +#else + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #4\n" + "ldr.w r1, [%[pB]], #8\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r0, %[sum]\n" + "smlad %[sum2], r4, r1, %[sum2]\n" + "ldr.w r3, [%[pB], #-4]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r2, %[sum3]\n" + "smlad %[sum4], r4, r3, %[sum4]\n" + "subs 
%[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); +#endif /* ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + q7_t inM2 = *pB++; + q7_t inM3 = *pB++; + q7_t inM4 = *pB++; + + sum += inV * inM; + sum2 += inV * inM2; + sum3 += inV * inM3; + sum4 += inV * inM4; + colCnt--; + } /* while over colCnt */ + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum3 >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum4 >> out_shift), 16)); + + /* adjust the pointers and counters */ + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q31_t inV1, inV2, inM11, inM12; + + pB = read_and_pad(pB, &inM11, &inM12); + + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM11, sum); + + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM12, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + + rowCnt--; + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + colCnt--; + } + + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q15_t)__SSAT((sum >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q15_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t)__SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c 
b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c new file mode 100644 index 000000000..67d70ec12 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_q15.c + * Description: Q15 basic fully-connected layer function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Q15 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + * + * @details + * + * Buffer size: + * + * vec_buffer size: 0 + * + */ + +arm_status arm_fully_connected_q15(const q15_t *pV, + const q15_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t *bias, + q15_t *pOut, + q15_t *vec_buffer) +{ + (void)vec_buffer; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q15_t *pB = pM; + const q15_t *pB2 = pB + dim_vec; + q15_t *pO = pOut; + const q15_t *pA; + const q15_t *pBias = bias; + uint16_t rowCnt = num_of_rows >> 1; + + /* this loop loops over different outputs */ + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + pB2 = pB + dim_vec; + + while (colCnt) + { + q31_t inV1, inM1, inM2; + inV1 = arm_nn_read_q15x2_ia(&pA); + inM1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV1, inM1, sum); + inM2 = arm_nn_read_q15x2_ia(&pB2); + sum2 = __SMLAD(inV1, inM2, sum2); + + inV1 = arm_nn_read_q15x2_ia(&pA); + inM1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV1, inM1, sum); + inM2 = arm_nn_read_q15x2_ia(&pB2); + sum2 = __SMLAD(inV1, inM2, sum2); + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q15_t inM = *pB++; + q15_t inM2 = *pB2++; + + sum += inV * inM; + sum2 += inV * inM2; + colCnt--; + } /* while over colCnt
*/ + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16)); + + /* adjust the pointers and counters */ + pB = pB + dim_vec; + rowCnt--; + } + + rowCnt = num_of_rows & 0x1; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q31_t inV1, inM1; + inV1 = arm_nn_read_q15x2_ia(&pA); + inM1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV1, inM1, sum); + + inV1 = arm_nn_read_q15x2_ia(&pA); + inM1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV1, inM1, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q15_t inM = *pB++; + + sum += inV * inM; + + colCnt--; + } + + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + + rowCnt--; + } + +#else + int i, j; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + for (i = 0; i < num_of_rows; i++) + { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16); + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c new file mode 100644 index 000000000..9de861825 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_q15_opt.c + * Description: Q15 opt fully-connected layer function + * + * $Date: 09. 
October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + * + * @details + * + * Buffer size: + * + * vec_buffer size: 0 + * + * Here we use only one pointer to read 4 rows in the weight + * matrix. So if the original matrix looks like this: + * + * | a11 | a12 | a13 | + * + * | a21 | a22 | a23 | + * + * | a31 | a32 | a33 | + * + * | a41 | a42 | a43 | + * + * | a51 | a52 | a53 | + * + * | a61 | a62 | a63 | + * + * We operate on multiples of 4 rows, so the first four rows become + * + * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | + * + * | a13 | a23 | a33 | a43 | + * + * Remaining rows are kept in their original order. + * + * So the stored weight matrix looks like this: + * + * + * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | + * + * | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 | + * + * | a62 | a63 | + */ + +arm_status arm_fully_connected_q15_opt(const q15_t *pV, + const q15_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t *bias, + q15_t *pOut, + q15_t *vec_buffer) +{ + (void)vec_buffer; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q15_t *pB = pM; + q15_t *pO = pOut; + const q15_t *pBias = bias; + const q15_t *pA = pV; + + uint16_t rowCnt = num_of_rows >> 2; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + +#ifdef USE_INTRINSIC + + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV, inM11, sum); + inM12 = arm_nn_read_q15x2_ia(&pB); + sum2 = __SMLAD(inV, inM12, sum2); + inM13 = arm_nn_read_q15x2_ia(&pB); + sum3 = __SMLAD(inV, inM13, sum3); + inM14 = arm_nn_read_q15x2_ia(&pB); + sum4 = __SMLAD(inV, inM14, sum4); + colCnt--; + } + +#else + + /* + * register needed: + * loop counter: colCnt + * accumulators: sum, sum2, sum3, sum4 + * pointers: pB, pA + * weight data: inM11, inM12, inM13, inM14 + * activation data: inV + */ + + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #4\n" + "ldr.w r0, [%[pB]], #16\n" + "smlad %[sum], r4, r0, %[sum]\n" + "ldr.w r1, [%[pB] , #-12]\n" + "smlad %[sum2], r4, r1, %[sum2]\n" + "ldr.w r2, [%[pB] , #-8]\n" + "smlad %[sum3], r4, r2, %[sum3]\n" + "ldr.w r3, [%[pB] , #-4]\n" + "smlad %[sum4], r4, r3, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [
sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); + +#endif /* USE_INTRINSIC */ + + colCnt = dim_vec & 0x1; + while (colCnt) + { + + q15_t inV = *pA++; + q15_t inM = *pB++; + q15_t inM2 = *pB++; + q15_t inM3 = *pB++; + q15_t inM4 = *pB++; + + sum += inV * inM; + sum2 += inV * inM2; + sum3 += inV * inM3; + sum4 += inV * inM4; + colCnt--; + } /* while over colCnt */ + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum3 >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum4 >> out_shift), 16)); + + /* adjust the pointers and counters */ + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q31_t inV1, inV2, inM1, inM2; + + inM1 = arm_nn_read_q15x2_ia(&pB); + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM1, sum); + + inM2 = arm_nn_read_q15x2_ia(&pB); + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM2, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q15_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + + rowCnt--; + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q15_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q15_t *pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q15_t inB1 = *pB++; + q15_t inB2 = *pB++; + sum += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum2 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum3 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum4 += inA1 * inB1 + inA2 * inB2; + + colCnt--; + } + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inA = *pA++; + q15_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + colCnt--; + } + *pO++ = (q15_t)__SSAT((sum >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q15_t inA = *pA++; + q15_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t)__SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c new file mode 100644 
index 000000000..178102dac --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_q7.c + * Description: Q7 basic fully-connected layer function + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Q7 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * vec_buffer size: dim_vec + * + * This basic function is designed to work with a regular weight + * matrix without interleaving.
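+ *
+ * A minimal usage sketch (dimensions and names are assumptions chosen for
+ * illustration only):
+ *
+ *     #define DIM_VEC     64
+ *     #define NUM_OF_ROWS 10
+ *     q15_t scratch[DIM_VEC];  // vec_buffer must hold dim_vec q15 entries
+ *     q7_t  fc_out[NUM_OF_ROWS];
+ *     arm_fully_connected_q7(in_vec, weights, DIM_VEC, NUM_OF_ROWS,
+ *                            bias_shift, out_shift, bias, fc_out, scratch);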
+ * + */ + +arm_status arm_fully_connected_q7(const q7_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut, + q15_t *vec_buffer) +{ + +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q7_t *pB = pM; + const q7_t *pB2; + q7_t *pO = pOut; + const q7_t *pBias = bias; + const q15_t *pA; + uint16_t rowCnt = num_of_rows >> 1; + + /* expand the vector into the buffer */ + arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec); + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 2; + + pA = vec_buffer; + pB2 = pB + dim_vec; + + while (colCnt) + { + q31_t inV, inM11, inM12, inM21, inM22; + pB = read_and_pad_reordered(pB, &inM11, &inM12); + pB2 = read_and_pad_reordered(pB2, &inM21, &inM22); + + inV = arm_nn_read_q15x2_ia(&pA); + + sum = __SMLAD(inV, inM11, sum); + sum2 = __SMLAD(inV, inM21, sum2); + + inV = arm_nn_read_q15x2_ia(&pA); + + sum = __SMLAD(inV, inM12, sum); + sum2 = __SMLAD(inV, inM22, sum2); + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inV = *pA++; + q15_t inM = *pB++; + q15_t inM2 = *pB2++; + + sum += inV * inM; + sum2 += inV * inM2; + colCnt--; + } /* while over colCnt */ + *pO++ = (q7_t)(__SSAT((sum >> out_shift), 8)); + *pO++ = (q7_t)(__SSAT((sum2 >> out_shift), 8)); + + /* adjust the pointers and counters */ + pB += dim_vec; + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x1; + + while (rowCnt) + { + uint16_t colCnt = dim_vec >> 2; + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + pA = vec_buffer; + + while (colCnt) + { + q31_t inV1, inV2, inM11, inM12; + + pB = read_and_pad_reordered(pB, &inM11, &inM12); + + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM11, sum); + + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM12, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inV = *pA++; + q15_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q7_t)(__SSAT((sum >> out_shift), 8)); + + rowCnt--; + } + +#else + (void)vec_buffer; + int i, j; + + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + for (i = 0; i < num_of_rows; i++) + { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__SSAT((ip_out >> out_shift), 8); + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c new file mode 100644 index 000000000..77c338636 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c @@ -0,0 +1,494 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_q7_opt.c + * Description: Q7 opt fully-connected layer function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns ARM_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * vec_buffer size: dim_vec + * + * This opt function is designed to work with an interleaved weight + * matrix. The vector input is assumed to be in q7_t format; we call + * the arm_q7_to_q15_reordered_no_shift() function to expand it into + * q15_t format with certain weight re-ordering, refer to the function + * comments for more details. + * Here we use only one pointer to read 4 rows in the weight + * matrix. So if the original q7_t matrix looks like this: + * + * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | + * + * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | + * + * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | + * + * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | + * + * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | + * + * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | + * + * + * We operate on multiples of 4 rows, so the first four rows become + * + * | a11 | a21 | a13 | a23 | a31 | a41 | a33 | a43 | + * + * | a12 | a22 | a14 | a24 | a32 | a42 | a34 | a44 | + * + * | a15 | a25 | a35 | a45 | a16 | a26 | a36 | a46 | + * + * So within the kernel, we first read the re-ordered vector in as: + * + * | b1 | b3 | and | b2 | b4 | + * + * the four q31_t weights will look like + * + * | a11 | a13 |, | a21 | a23 |, | a31 | a33 |, | a41 | a43 | + * + * | a12 | a14 |, | a22 | a24 |, | a32 | a34 |, | a42 | a44 | + * + * The column left over will be in order, + * which is: + * + * | a17 | a27 | a37 | a47 | + * + * For the left-over rows, we do 1x1 computation, so the data remains + * in its original order.
+ * + * So the stored weight matrix looks like this: + * + * | a11 | a21 | a13 | a23 | a31 | a41 | + * + * | a33 | a43 | a12 | a22 | a14 | a24 | + * + * | a32 | a42 | a34 | a44 | a15 | a25 | + * + * | a35 | a45 | a16 | a26 | a36 | a46 | + * + * | a17 | a27 | a37 | a47 | a51 | a52 | + * + * | a53 | a54 | a55 | a56 | a57 | a61 | + * + * | a62 | a63 | a64 | a65 | a66 | a67 | + * + * + */ + +arm_status arm_fully_connected_q7_opt(const q7_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut, + q15_t *vec_buffer) +{ + +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q7_t *pB = pM; + q7_t *pO = pOut; + const q7_t *pBias = bias; + const q15_t *pA; + uint16_t rowCnt = num_of_rows >> 2; + + arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec); + + while (rowCnt) + { + + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = vec_buffer; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM11, inV, sum); + sum2 = __SMLAD(inM12, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM13, inV, sum3); + sum4 = __SMLAD(inM14, inV, sum4); + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM11, inV, sum); + sum2 = __SMLAD(inM12, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM13, inV, sum3); + sum4 = __SMLAD(inM14, inV, sum4); + colCnt--; + } +#else + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM12, inV, sum); + sum2 = __SMLAD(inM11, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM14, inV, sum3); + sum4 = __SMLAD(inM13, inV, sum4); + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM12, inV, sum); + sum2 = __SMLAD(inM11, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM14, inV, sum3); + sum4 = __SMLAD(inM13, inV, sum4); + colCnt--; + } +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + + /* + * register needed: + * loop counter: colCnt + * accumulators: sum, sum2, sum3, sum4 + * pointers: pB, pA + * weight data: inM11, inM12, inM13, inM14 + * activation data: inV + */ + +#ifndef ARM_MATH_BIG_ENDIAN + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #8\n" + "ldr.w r1, [%[pB]], #16\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r1, %[sum]\n" + "smlad %[sum2], r4, r0, %[sum2]\n" + "ldr.w r3, [%[pB], #-12]\n" + 
"mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r3, %[sum3]\n" + "smlad %[sum4], r4, r2, %[sum4]\n" + "ldr.w r4, [%[pA], #-4]\n" + "ldr.w r1, [%[pB], #-8]\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r1, %[sum]\n" + "smlad %[sum2], r4, r0, %[sum2]\n" + "ldr.w r3, [%[pB], #-4]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r3, %[sum3]\n" + "smlad %[sum4], r4, r2, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); +#else + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #8\n" + "ldr.w r1, [%[pB]], #16\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r0, %[sum]\n" + "smlad %[sum2], r4, r1, %[sum2]\n" + "ldr.w r3, [%[pB], #-12]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r2, %[sum3]\n" + "smlad %[sum4], r4, r3, %[sum4]\n" + "ldr.w r4, [%[pA], #-4]\n" + "ldr.w r1, [%[pB], #-8]\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r0, %[sum]\n" + "smlad %[sum2], r4, r1, %[sum2]\n" + "ldr.w r3, [%[pB], #-4]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r2, %[sum3]\n" + "smlad %[sum4], r4, r3, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); +#endif /* ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + q7_t inM2 = *pB++; + q7_t inM3 = *pB++; + q7_t inM4 = *pB++; + + sum += inV * inM; + sum2 += inV * inM2; + sum3 += inV * inM3; + sum4 += inV * inM4; + colCnt--; + } /* while over colCnt */ + *pO++ = (q7_t)(__SSAT((sum >> out_shift), 8)); + *pO++ = (q7_t)(__SSAT((sum2 >> out_shift), 8)); + *pO++ = (q7_t)(__SSAT((sum3 >> out_shift), 8)); + *pO++ = (q7_t)(__SSAT((sum4 >> out_shift), 8)); + + /* adjust the pointers and counters */ + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 2; + + pA = vec_buffer; + + while (colCnt) + { + q31_t inV1, inV2, inM11, inM12; + + pB = read_and_pad_reordered(pB, &inM11, &inM12); + + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM11, sum); + + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM12, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q7_t)(__SSAT((sum >> out_shift), 8)); + + rowCnt--; + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q7_t *pA; + q7_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = 
((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q7_t inA1 = *pA++; + q7_t inA3 = *pA++; + q7_t inA2 = *pA++; + q7_t inA4 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum += inA3 * inB1 + inA4 * inB2; + sum2 += inA3 * inB3 + inA4 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA3 * inB1 + inA4 * inB2; + sum4 += inA3 * inB3 + inA4 * inB4; + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q7_t)__SSAT((ip_out >> out_shift), 8); + + rowCnt--; + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c new file mode 100644 index 000000000..dbb08074c --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_s8 + * Description: Fully connected function compatible with TF Lite. + * + * $Date: 19. March 2021 + * $Revision: V.3.0.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/* + * S8 basic fully-connected and matrix multiplication layer function for TensorFlow Lite + * + * Refer header file for details. 
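+ *
+ * A minimal call sketch (field values are illustrative assumptions, not
+ * taken from this patch):
+ *
+ *     cmsis_nn_context ctx = {NULL, 0};  // no scratch buffer is required
+ *     cmsis_nn_fc_params fc = {.input_offset = 128, .filter_offset = 0,
+ *                              .output_offset = -128,
+ *                              .activation = {.min = -128, .max = 127}};
+ *     cmsis_nn_per_tensor_quant_params qp = {.multiplier = mult, .shift = shift};
+ *     arm_status status = arm_fully_connected_s8(&ctx, &fc, &qp, &input_dims,
+ *                                                input, &filter_dims, kernel,
+ *                                                &bias_dims, bias, &output_dims,
+ *                                                output);
+ *
+ * Here filter_dims->n is the accumulation depth, output_dims->c the number of
+ * output channels, and input_dims->n the number of batches processed per call.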
+ * + */ + +arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx, + const cmsis_nn_fc_params *fc_params, + const cmsis_nn_per_tensor_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + (void)bias_dims; + (void)ctx; + (void)fc_params->filter_offset; + + int32_t batch_cnt = input_dims->n; + + while (batch_cnt) + { + arm_nn_vec_mat_mult_t_s8(input, + kernel, + bias, + output, + fc_params->input_offset, + 0, + fc_params->output_offset, + quant_params->multiplier, + quant_params->shift, + filter_dims->n, /* col_dim or accum_depth */ + output_dims->c, /* row_dim or output_depth */ + fc_params->activation.min, + fc_params->activation.max); + input += filter_dims->n; + output += output_dims->c; + batch_cnt--; + } + return (ARM_MATH_SUCCESS); +} + +int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) +{ + (void)filter_dims; + return 0; +} + +/** + * @} end of FC group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c new file mode 100644 index 000000000..82c295281 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_accumulate_q7_to_q15.c + * Description: Accumulate q7 vector into q15 one. + * + * $Date: 09. 
October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length) +{ + q15_t *pCnt = pDst; + const q7_t *pV = pSrc; + q31_t v1, v2, vo1, vo2; + int32_t cnt = length >> 2; + q31_t in; + + while (cnt > 0l) + { + q31_t value = arm_nn_read_q7x4_ia(&pV); + v1 = __SXTB16(__ROR((uint32_t)value, 8)); + v2 = __SXTB16(value); +#ifndef ARM_MATH_BIG_ENDIAN + vo2 = (q31_t)__PKHTB(v1, v2, 16); + vo1 = (q31_t)__PKHBT(v2, v1, 16); +#else + vo1 = (q31_t)__PKHTB(v1, v2, 16); + vo2 = (q31_t)__PKHBT(v2, v1, 16); +#endif + + in = arm_nn_read_q15x2(pCnt); + arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in)); + + in = arm_nn_read_q15x2(pCnt); + arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in)); + + cnt--; + } + cnt = length & 0x3; + while (cnt > 0l) + { + *pCnt++ += *pV++; + cnt--; + } +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_add_q7.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_add_q7.c new file mode 100644 index 000000000..86cf5475c --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_add_q7.c @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_add_q7.c + * Description: Non saturating addition of elements of a q7 vector. + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nn_tables.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size) +{ + uint32_t block_count; + q31_t result = 0; +#if defined(ARM_MATH_DSP) + /* Loop unrolling: Compute 4 outputs at a time */ + block_count = block_size >> 2U; + + while (block_count > 0U) + { + const int32_t mult_q15x2 = (1UL << 16) | 1UL; + q31_t in_q7x4 = arm_nn_read_q7x4_ia(&input); + q31_t temp_q15x2 = __SXTAB16(__SXTB16(in_q7x4), __ROR((uint32_t)in_q7x4, 8)); + + result = __SMLAD(temp_q15x2, mult_q15x2, result); + + /* Decrement loop counter */ + block_count--; + } + + /* Loop unrolling: Compute remaining outputs */ + block_count = block_size & 0x3; +#else + block_count = block_size; +#endif + while (block_count > 0U) + { + /* Add and store result in destination buffer.
*/ + result += *input++; + + /* Decrement loop counter */ + block_count--; + } + + *output = result; +} + +/** + * @} end of NNBasicMath group + */ \ No newline at end of file diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c new file mode 100644 index 000000000..b633ef4ad --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_depthwise_conv_nt_t_padded_s8.c + * Description: Depthwise convolution with padded matrices. + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M processors with MVE extension + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * Depthwise convolution of transposed rhs matrix with 4 lhs matrices. One or more of the rhs matrices are padded. + * Dimensions are the same for lhs and rhs. + * + * Refer header file for details. 
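To make the channel-interleaved layout concrete before the MVE implementation below: a plain-C model of what one of the four accumulators computes for a single channel. This is a reading aid derived from the vector loop in this patch, not code from CMSIS-NN; the names are made up.

```c
/* Scalar model of one output channel `ch` for one lhs patch (illustrative).
 * lhs and rhs are both stored as row_x_col groups of num_ch channels. */
static int32_t dw_one_channel(const int8_t *lhs, const int8_t *rhs,
                              int32_t input_offset, uint16_t num_ch,
                              uint16_t row_x_col, int32_t bias, int ch)
{
    int32_t acc = bias;
    for (int i = 0; i < row_x_col; i++)
    {
        /* element i of channel ch in the channel-interleaved buffers */
        acc += (lhs[i * num_ch + ch] + input_offset) * rhs[i * num_ch + ch];
    }
    return acc; /* requantize, add out_offset and clamp afterwards, as below */
}
```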
+ * + */ + +q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, + const q7_t *rhs, + const int32_t input_offset, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + int32_t loop_count = (num_ch + 3) / 4; + const int32_t *bias = output_bias; + uint32_t num_ch_to_process = num_ch; + + for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; + num_ch_to_process -= 4, out += 4, offset += 4, i_loop_cnt++) + { + int32x4_t out_0 = vldrwq_s32(bias); + int32x4_t out_1 = out_0; + int32x4_t out_2 = out_0; + int32x4_t out_3 = out_0; + bias += 4; + + const int8_t *rhs_0 = rhs + offset; + const int8_t *lhs_0 = lhs + offset; + const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset; + const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset; + const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset; + + for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++) + { + const int32x4_t ker_0 = vldrbq_s32(rhs_0); + + int32x4_t ip_0 = vldrbq_s32(lhs_0); + ip_0 = vaddq_n_s32(ip_0, input_offset); + out_0 += vmulq_s32(ip_0, ker_0); + + int32x4_t ip_1 = vldrbq_s32(lhs_1); + ip_1 = vaddq_n_s32(ip_1, input_offset); + out_1 += vmulq_s32(ip_1, ker_0); + + int32x4_t ip_2 = vldrbq_s32(lhs_2); + ip_2 = vaddq_n_s32(ip_2, input_offset); + out_2 += vmulq_s32(ip_2, ker_0); + + int32x4_t ip_3 = vldrbq_s32(lhs_3); + ip_3 = vaddq_n_s32(ip_3, input_offset); + + out_3 += vmulq_s32(ip_3, ker_0); + + lhs_0 += num_ch; + lhs_1 += num_ch; + lhs_2 += num_ch; + lhs_3 += num_ch; + + rhs_0 += num_ch; + } + + const int32x4_t mult = vldrwq_s32(out_mult); + const int32x4_t shift = vldrwq_s32(out_shift); + out_mult += 4; + out_shift += 4; + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_0 = vaddq_n_s32(out_0, out_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max)); + mve_pred16_t p = vctp32q(num_ch_to_process); + vstrbq_p_s32(out, out_0, p); + + out_1 = arm_requantize_mve_32x4(out_1, mult, shift); + out_1 = vaddq_n_s32(out_1, out_offset); + out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min)); + out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + num_ch, out_1, p); + + out_2 = arm_requantize_mve_32x4(out_2, mult, shift); + out_2 = vaddq_n_s32(out_2, out_offset); + out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min)); + out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 2 * num_ch, out_2, p); + + out_3 = arm_requantize_mve_32x4(out_3, mult, shift); + out_3 = vaddq_n_s32(out_3, out_offset); + out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min)); + out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 3 * num_ch, out_3, p); + } + + const int tail_ch = num_ch & 0x3; + if (tail_ch != 0) + { + out -= (4 - tail_ch); + } + return out + (3 * num_ch); + +#else + (void)lhs; + (void)rhs; + (void)input_offset; + (void)num_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)row_x_col; + (void)output_bias; + (void)out; + return NULL; +#endif +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c 
b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c new file mode 100644 index 000000000..dda12fd2b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_depthwise_conv_nt_t_s8.c + * Description: Depthwise convolution on matrices with no padding. + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M processors with MVE extension. + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding. Dimensions are the same for lhs and rhs. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, + const q7_t *rhs, + const int32_t input_offset, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + const int32_t *bias = output_bias; + int32_t loop_count = (num_ch + 3) / 4; + uint32_t num_ch_to_process = num_ch; + + for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; + num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++) + { + int32x4_t out_0 = vldrwq_s32(bias); + int32x4_t out_1 = out_0; + int32x4_t out_2 = out_0; + int32x4_t out_3 = out_0; + bias += 4; + + const int8_t *rhs_0 = rhs + offset; + const int8_t *lhs_0 = lhs + offset; + const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset; + const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset; + const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset; + int32x4_t ker_sum = vdupq_n_s32(0); + + for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++) + { + const int32x4_t ker_0 = vldrbq_s32(rhs_0); + ker_sum = vaddq_s32(ker_sum, ker_0); + + int32x4_t ip_0 = vldrbq_s32(lhs_0); + out_0 += vmulq_s32(ip_0, ker_0); + + int32x4_t ip_1 = vldrbq_s32(lhs_1); + out_1 += vmulq_s32(ip_1, ker_0); + + int32x4_t ip_2 = vldrbq_s32(lhs_2); + out_2 += vmulq_s32(ip_2, ker_0); + + int32x4_t ip_3 = vldrbq_s32(lhs_3); + out_3 += vmulq_s32(ip_3, ker_0); + + lhs_0 += num_ch; + lhs_1 += num_ch; + lhs_2 += num_ch; + lhs_3 += num_ch; + + rhs_0 += num_ch; + } + + ker_sum = vmulq_n_s32(ker_sum, input_offset); + out_0 = ker_sum + out_0; + out_1 = ker_sum + out_1; + out_2 = ker_sum + out_2; + out_3 = ker_sum + out_3; + + const int32x4_t mult = vldrwq_s32(out_mult); + const int32x4_t shift = 
vldrwq_s32(out_shift); + out_mult += 4; + out_shift += 4; + mve_pred16_t p = vctp32q(num_ch_to_process); + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_0 = vaddq_n_s32(out_0, out_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out, out_0, p); + + out_1 = arm_requantize_mve_32x4(out_1, mult, shift); + out_1 = vaddq_n_s32(out_1, out_offset); + out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min)); + out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + num_ch, out_1, p); + + out_2 = arm_requantize_mve_32x4(out_2, mult, shift); + out_2 = vaddq_n_s32(out_2, out_offset); + out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min)); + out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 2 * num_ch, out_2, p); + + out_3 = arm_requantize_mve_32x4(out_3, mult, shift); + out_3 = vaddq_n_s32(out_3, out_offset); + out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min)); + out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 3 * num_ch, out_3, p); + } + + const int tail_ch = num_ch & 0x3; + if (tail_ch != 0) + { + out -= (4 - tail_ch); + } + + return out + (3 * num_ch); +#else + (void)lhs; + (void)rhs; + (void)input_offset; + (void)num_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)row_x_col; + (void)output_bias; + (void)out; + return NULL; +#endif +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c new file mode 100644 index 000000000..9f7c9f8d7 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mul_core_1x_s8.c + * Description: General Matrix-multiplication function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 matrix multiplication to process 1 row + * + * Refer header file for details. 
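A note on why this kernel returns both the dot product and the column sum: if the row operand carries a zero-point z (as the s8 kernels in this patch arrange), then sum_i (row[i] + z) * col[i] = dot + z * sum_col, so the offset term can be added once after the hot loop. A hedged sketch of that caller pattern with invented names:

```c
/* Illustrative caller pattern; `lhs_offset` and `bias_value` are assumptions. */
int32_t col_sum, dot;
arm_nn_mat_mul_core_1x_s8(row_elements, row_base, col_base, &col_sum, &dot);
/* fold the zero-point of the row operand in after the inner loop */
int32_t acc = dot + lhs_offset * col_sum + bias_value;
```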
+ * + */ + +arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, + const int8_t *row_base, + const int8_t *col_base, + int32_t *const sum_col, + int32_t *const output) +{ + int32_t acc_n0 = 0; + int32_t sum_tmp = 0; + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + + __ASM volatile(" vldrb.8 q0, [%[col]], 16 \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], 16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q0, [%[col]], 16 \n" + " letp lr, 2b \n" + "1: \n" + : [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(row_base), [out0] "+Te"(acc_n0) + : [cnt] "r"(row_elements) + : "q0", "q1", "memory", "r14"); +#else + for (int i = 0; i < row_elements; i++) + { + sum_tmp += col_base[i]; + acc_n0 += row_base[i] * col_base[i]; + } +#endif + + *sum_col = sum_tmp; + *output = acc_n0; + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c new file mode 100644 index 000000000..10ec8ad55 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mul_core_4x_s8.c + * Description: General matrix multiplication function for MVE extension + * + * $Date: 09. October 2020 + * $Revision: V.2.0.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 matrix multiplication to process 4 rows and one column + * + * Refer header file for details. 
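The 4-row variant below follows the same contract, with `offset` giving the stride between consecutive rows of the row operand and the four dot products written to output[0..3]. A scalar sketch of the equivalent computation (a reading aid that mirrors the non-MVE fallback in this file):

```c
/* Scalar equivalent of arm_nn_mat_mul_core_4x_s8 (illustrative only). */
static void mat_mul_core_4x_model(int32_t n, int32_t offset,
                                  const int8_t *rows, const int8_t *col,
                                  int32_t *sum_col, int32_t *out)
{
    int32_t s = 0;
    out[0] = out[1] = out[2] = out[3] = 0;
    for (int32_t i = 0; i < n; i++)
    {
        s += col[i]; /* running column sum, returned alongside the products */
        for (int r = 0; r < 4; r++)
            out[r] += rows[r * offset + i] * col[i];
    }
    *sum_col = s;
}
```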
+ * + */ +arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, + const int32_t offset, + const int8_t *row_base, + const int8_t *col_base, + int32_t *const sum_col, + int32_t *const output) +{ + int32_t acc_n0 = 0; + int32_t acc_n1 = 0; + int32_t acc_n2 = 0; + int32_t acc_n3 = 0; + + const int8_t *ip_row_0 = row_base; + const int8_t *ip_row_1 = row_base + offset; + const int8_t *ip_row_2 = row_base + (2 * offset); + const int8_t *ip_row_3 = row_base + (3 * offset); + int32_t sum_tmp = 0; + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + __ASM volatile(" vldrb.8 q0, [%[col]], 16 \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], 16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q2, [%[row1]], 16 \n" + " vmladava.s8 %[out1], q0, q2 \n" + " vldrb.8 q3, [%[row2]], 16 \n" + " vmladava.s8 %[out2], q0, q3 \n" + " vldrb.8 q4, [%[row3]], 16 \n" + " vmladava.s8 %[out3], q0, q4 \n" + " vldrb.8 q0, [%[col]], 16 \n" + " letp lr, 2b \n" + "1: \n" + : [col] "+r"(col_base), + [sum] "+Te"(sum_tmp), + [row0] "+r"(ip_row_0), + [row1] "+r"(ip_row_1), + [row2] "+r"(ip_row_2), + [row3] "+r"(ip_row_3), + [out0] "+Te"(acc_n0), + [out1] "+Te"(acc_n1), + [out2] "+Te"(acc_n2), + [out3] "+Te"(acc_n3) + : [cnt] "r"(row_elements) + : "q0", "q1", "q2", "q3", "q4", "memory", "r14"); +#else + for (int i = 0; i < row_elements; i++) + { + int32_t col = col_base[i]; + sum_tmp += col; + acc_n0 += ip_row_0[i] * col; + acc_n1 += ip_row_1[i] * col; + acc_n2 += ip_row_2[i] * col; + acc_n3 += ip_row_3[i] * col; + } +#endif + output[0] = acc_n0; + output[1] = acc_n1; + output[2] = acc_n2; + output[3] = acc_n3; + + *sum_col = sum_tmp; + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c new file mode 100644 index 000000000..d0420c239 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c @@ -0,0 +1,582 @@ +/* + * Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_s8_nt_t_s8 + * Description: Matrix multiplication support function with the right-hand-side (rhs) matrix transposed + * + * $Date: 09. October 2020 + * $Revision: V.1.0.3 + * + * Target Processor: Cortex-M + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 matrix multiplication with the right-hand-side matrix transposed + * + * Refer header file for details. 
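One identity worth spelling out before the implementation below: the kernel never adds `lhs_offset` inside its unrolled loops. Because sum_k (lhs[k] + lhs_offset) * rhs[k] = sum_k lhs[k] * rhs[k] + lhs_offset * sum_k rhs[k], the offset term depends only on each rhs row and is precomputed, with the bias folded in, as `lhs_offset_contribution`. A short sketch with hypothetical names:

```c
/* Precompute the per-row offset contribution (illustrative). */
q31_t contribution = 0;
for (int32_t k = 0; k < rhs_cols; ++k)
    contribution += rhs_row[k];   /* sum of one rhs row           */
contribution *= lhs_offset;       /* offset term of the expansion */
contribution += bias_value;       /* bias folded in once per row  */
/* every accumulator for this rhs row then starts at `contribution` */
```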
+ * + */ +arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, + const q7_t *rhs, + const q31_t *bias, + q7_t *dst, + const int32_t *dst_multipliers, + const int32_t *dst_shifts, + const int32_t lhs_rows, + const int32_t rhs_rows, + const int32_t rhs_cols, + const int32_t lhs_offset, + const int32_t dst_offset, + const int32_t activation_min, + const int32_t activation_max) +{ +#if defined(ARM_MATH_DSP) + const int32_t off0 = rhs_cols - 4; + + for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2) + { + const q7_t *lhs_ptr = &lhs[0]; + q7_t *dst_ptr = &dst[0]; + + q31_t lhs_offset_contribution0 = 0; + q31_t lhs_offset_contribution1 = 0; + + for (int32_t x = 0; x < rhs_cols; ++x) + { + lhs_offset_contribution0 += rhs[x]; + lhs_offset_contribution1 += rhs[x + rhs_cols]; + } + + lhs_offset_contribution0 *= lhs_offset; + lhs_offset_contribution1 *= lhs_offset; + if (bias) + { + lhs_offset_contribution0 += bias[rhs_rows_idx]; + lhs_offset_contribution1 += bias[rhs_rows_idx + 1]; + } + + int32_t lhs_rows_idx = lhs_rows >> 1; + + while (lhs_rows_idx) + { + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = lhs_offset_contribution0; + q31_t res01 = lhs_offset_contribution1; + q31_t res10 = lhs_offset_contribution0; + q31_t res11 = lhs_offset_contribution1; + + int32_t rhs_cols_idx = 0; + + q31_t val0, val1, val2, val3, val4, val5; + + for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) + { + val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val2 = __SXTB16(val1); + val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val1 = __SXTB16_RORn(val1, 8); + val0 = __SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val3, val2, res00); + val5 = __SXTB16(val4); + res00 = __SMLAD(val0, val1, res00); + val4 = __SXTB16_RORn(val4, 8); + res01 = __SMLAD(val3, val5, res01); + res01 = __SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); + val3 = __SXTB16(val0); + val0 = __SXTB16_RORn(val0, 8); + res10 = __SMLAD(val3, val2, res10); + res11 = __SMLAD(val3, val5, res11); + res10 = __SMLAD(val0, val1, res10); + val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + res11 = __SMLAD(val0, val4, res11); + + val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = __SXTB16(val1); + val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val1 = __SXTB16_RORn(val1, 8); + val0 = __SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val3, val2, res00); + val5 = __SXTB16(val4); + res00 = __SMLAD(val0, val1, res00); + val4 = __SXTB16_RORn(val4, 8); + res01 = __SMLAD(val3, val5, res01); + res01 = __SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); + val3 = __SXTB16(val0); + val0 = __SXTB16_RORn(val0, 8); + res10 = __SMLAD(val3, val2, res10); + res11 = __SMLAD(val3, val5, res11); + res10 = __SMLAD(val0, val1, res10); + val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + res11 = __SMLAD(val0, val4, res11); + + val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = __SXTB16(val1); + val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val1 = __SXTB16_RORn(val1, 8); + val0 = __SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val3, val2, res00); + val5 = __SXTB16(val4); + res00 = __SMLAD(val0, val1, res00); + val4 = __SXTB16_RORn(val4, 8); + res01 = __SMLAD(val3, val5, 
res01); + res01 = __SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); + val3 = __SXTB16(val0); + val0 = __SXTB16_RORn(val0, 8); + res10 = __SMLAD(val3, val2, res10); + res11 = __SMLAD(val3, val5, res11); + res10 = __SMLAD(val0, val1, res10); + val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + res11 = __SMLAD(val0, val4, res11); + + val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = __SXTB16(val1); + val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val1 = __SXTB16_RORn(val1, 8); + val0 = __SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val3, val2, res00); + val5 = __SXTB16(val4); + res00 = __SMLAD(val0, val1, res00); + val4 = __SXTB16_RORn(val4, 8); + res01 = __SMLAD(val3, val5, res01); + res01 = __SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); + val3 = __SXTB16(val0); + val0 = __SXTB16_RORn(val0, 8); + res10 = __SMLAD(val3, val2, res10); + res11 = __SMLAD(val3, val5, res11); + res10 = __SMLAD(val0, val1, res10); + res11 = __SMLAD(val0, val4, res11); + } + + for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q7_t rhs_value0 = rhs_ptr[0]; + q7_t rhs_value1 = rhs_ptr[rhs_cols]; + q7_t lhs_value = lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + + lhs_value = lhs_ptr[rhs_cols]; + res10 += lhs_value * rhs_value0; + res11 += lhs_value * rhs_value1; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + res10 = arm_nn_requantize(res10, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res11 = arm_nn_requantize(res11, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + res10 += dst_offset; + res11 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res10 = MAX(res10, activation_min); + res10 = MIN(res10, activation_max); + res11 = MAX(res11, activation_min); + res11 = MIN(res11, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr[1] = (q7_t)res01; + dst_ptr += rhs_rows; + dst_ptr[0] = (q7_t)res10; + dst_ptr[1] = (q7_t)res11; + dst_ptr += rhs_rows; + + lhs_ptr += rhs_cols; + + lhs_rows_idx--; + } + + // Left-over rows + if (lhs_rows % 2) + { + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = lhs_offset_contribution0; + q31_t res01 = lhs_offset_contribution1; + + int32_t rhs_cols_idx = 0; + + q31_t val0, val1, val2, val3, val4, val5; + for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) + { + val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val5 = __SXTB16(val2); + val4 = __SXTB16(val1); + val0 = __SXTB16_RORn(val0, 8); + val2 = __SXTB16_RORn(val2, 8); + val1 = __SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val5, val3, res00); + res00 = __SMLAD(val2, val0, res00); + res01 = __SMLAD(val5, val4, res01); + res01 = __SMLAD(val2, val1, res01); + + val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = 
arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val5 = __SXTB16(val2); + val4 = __SXTB16(val1); + val0 = __SXTB16_RORn(val0, 8); + val2 = __SXTB16_RORn(val2, 8); + val1 = __SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val5, val3, res00); + res00 = __SMLAD(val2, val0, res00); + res01 = __SMLAD(val5, val4, res01); + res01 = __SMLAD(val2, val1, res01); + + val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val5 = __SXTB16(val2); + val4 = __SXTB16(val1); + val0 = __SXTB16_RORn(val0, 8); + val2 = __SXTB16_RORn(val2, 8); + val1 = __SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val5, val3, res00); + res00 = __SMLAD(val2, val0, res00); + res01 = __SMLAD(val5, val4, res01); + res01 = __SMLAD(val2, val1, res01); + + val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val5 = __SXTB16(val2); + val4 = __SXTB16(val1); + val0 = __SXTB16_RORn(val0, 8); + val2 = __SXTB16_RORn(val2, 8); + val1 = __SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val5, val3, res00); + res00 = __SMLAD(val2, val0, res00); + res01 = __SMLAD(val5, val4, res01); + res01 = __SMLAD(val2, val1, res01); + } + + // Left-over accumulations + for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q7_t rhs_value0 = rhs_ptr[0]; + q7_t rhs_value1 = rhs_ptr[rhs_cols]; + q7_t lhs_value = lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr[1] = (q7_t)res01; + } + + rhs += 2 * rhs_cols; + dst += 2; + } + + if (rhs_rows % 2) + { + const q7_t *lhs_ptr = &lhs[0]; + q7_t *dst_ptr = &dst[0]; + + for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx) + { + const q7_t *rhs_ptr = &rhs[0]; + q31_t res00 = 0; + if (bias) + { + res00 = bias[rhs_rows - 1]; + } + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q31_t rhs_value = rhs_ptr[0]; + q31_t lhs_value = lhs_ptr[0] + lhs_offset; + + res00 += lhs_value * rhs_value; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]); + + // Add offset + res00 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr += rhs_rows; + } + } +#else + for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2) + { + const q7_t *lhs_ptr = &lhs[0]; + q7_t *dst_ptr = &dst[0]; + + q31_t lhs_offset_contribution0 = 0; + q31_t lhs_offset_contribution1 = 0; + + for (int32_t x = 0; x < rhs_cols; ++x) + { + lhs_offset_contribution0 += rhs[x]; + lhs_offset_contribution1 += rhs[x + rhs_cols]; + } + + lhs_offset_contribution0 *= lhs_offset; + lhs_offset_contribution1 *= 
lhs_offset; + if (bias) + { + lhs_offset_contribution0 += bias[rhs_rows_idx]; + lhs_offset_contribution1 += bias[rhs_rows_idx + 1]; + } + + int32_t lhs_rows_idx = lhs_rows >> 1; + + while (lhs_rows_idx) + { + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = lhs_offset_contribution0; + q31_t res01 = lhs_offset_contribution1; + q31_t res10 = lhs_offset_contribution0; + q31_t res11 = lhs_offset_contribution1; + + for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) + { + q7_t rhs_value0 = rhs_ptr[0]; + q7_t rhs_value1 = rhs_ptr[rhs_cols]; + q7_t lhs_value = lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + + lhs_value = lhs_ptr[rhs_cols]; + res10 += lhs_value * rhs_value0; + res11 += lhs_value * rhs_value1; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + res10 = arm_nn_requantize(res10, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res11 = arm_nn_requantize(res11, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + res10 += dst_offset; + res11 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res10 = MAX(res10, activation_min); + res10 = MIN(res10, activation_max); + res11 = MAX(res11, activation_min); + res11 = MIN(res11, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr[1] = (q7_t)res01; + dst_ptr += rhs_rows; + dst_ptr[0] = (q7_t)res10; + dst_ptr[1] = (q7_t)res11; + dst_ptr += rhs_rows; + + lhs_ptr += rhs_cols; + + lhs_rows_idx--; + } + + // Left-over rows + if (lhs_rows % 2) + { + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = lhs_offset_contribution0; + q31_t res01 = lhs_offset_contribution1; + + for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) + { + q7_t rhs_value0 = rhs_ptr[0]; + q7_t rhs_value1 = rhs_ptr[rhs_cols]; + q7_t lhs_value = lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr[1] = (q7_t)res01; + } + + rhs += 2 * rhs_cols; + dst += 2; + } + + if (rhs_rows % 2) + { + const q7_t *lhs_ptr = &lhs[0]; + q7_t *dst_ptr = &dst[0]; + + for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx) + { + const q7_t *rhs_ptr = &rhs[0]; + q31_t res00 = 0; + if (bias) + { + res00 = bias[rhs_rows - 1]; + } + + for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) + { + q31_t rhs_value = rhs_ptr[0]; + q31_t lhs_value = lhs_ptr[0] + lhs_offset; + + res00 += lhs_value * rhs_value; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]); + + // Add offset + res00 += dst_offset; + + // Clamp the result + res00 = 
MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr += rhs_rows; + } + } +#endif + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c new file mode 100644 index 000000000..6c54618bb --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mult_q15.c + * Description: Q15 vector multiplication with variable output shifts + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/** + * @brief Q15 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * + * Scaling and Overflow Behavior: + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated. + */ + +void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counters */ + +#if defined(ARM_MATH_DSP) + + /* Run the below code for Cortex-M cores with DSP extension */ + q31_t inA1, inA2, inB1, inB2; /* temporary input variables */ + q15_t out1, out2, out3, out4; /* temporary output variables */ + q31_t mul1, mul2, mul3, mul4; /* temporary variables */ + + /* loop Unrolling */ + blkCnt = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples.
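 ** Worked example of the rounding term used throughout these kernels (assuming
 ** the usual support-header definition NN_ROUND(s) == ((0x1 << s) >> 1), i.e.
 ** half an output LSB): with out_shift = 8, a raw product of 1000 is stored as
 ** (1000 + 128) >> 8 = 4, where plain truncation would give 1000 >> 8 = 3; the
 ** added bias turns the shift into round-to-nearest.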
*/ + while (blkCnt > 0U) + { + /* read two samples at a time from sourceA */ + inA1 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcA); + /* read two samples at a time from sourceB */ + inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcB); + /* read two samples at a time from sourceA */ + inA2 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcA); + /* read two samples at a time from sourceB */ + inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcB); + + /* multiply mul = sourceA * sourceB */ + mul1 = (q31_t)((q15_t)(inA1 >> 16) * (q15_t)(inB1 >> 16)); + mul2 = (q31_t)((q15_t)inA1 * (q15_t)inB1); + mul3 = (q31_t)((q15_t)(inA2 >> 16) * (q15_t)(inB2 >> 16)); + mul4 = (q31_t)((q15_t)inA2 * (q15_t)inB2); + + /* saturate result to 16 bit */ + out1 = (q15_t)__SSAT((q31_t)(mul1 + NN_ROUND(out_shift)) >> out_shift, 16); + out2 = (q15_t)__SSAT((q31_t)(mul2 + NN_ROUND(out_shift)) >> out_shift, 16); + out3 = (q15_t)__SSAT((q31_t)(mul3 + NN_ROUND(out_shift)) >> out_shift, 16); + out4 = (q15_t)__SSAT((q31_t)(mul4 + NN_ROUND(out_shift)) >> out_shift, 16); + + /* store the result, packing the half-words in memory order for the target endianness */ +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16); + *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16); + +#else + + *__SIMD32(pDst)++ = __PKHBT(out1, out2, 16); + *__SIMD32(pDst)++ = __PKHBT(out3, out4, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Decrement the blockSize loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + +#else + + /* Run the below code for Cortex-M0 */ + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_DSP) */ + + while (blkCnt > 0U) + { + /* C = A * B */ + /* Multiply the inputs and store the result in the destination buffer */ + *pDst++ = (q15_t)__SSAT(((q31_t)((q31_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 16); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c new file mode 100644 index 000000000..40dd1cdad --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mult_q7.c + * Description: Q7 vector multiplication with variable output shifts + * + * $Date: 09.
October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/** + * @brief Q7 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * + * Scaling and Overflow Behavior: + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated. + */ + +void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counters */ + +#if defined(ARM_MATH_DSP) + + /* Run the below code for Cortex-M cores with DSP extension */ + q7_t out1, out2, out3, out4; /* Temporary variables to store the product */ + + /* loop Unrolling */ + blkCnt = blockSize >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* C = A * B */ + /* Multiply the inputs and store the results in temporary variables */ + out1 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8); + out2 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8); + out3 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8); + out4 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8); + + /* Store the results of 4 inputs in the destination buffer in single cycle by packing */ + *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4U; + +#else + + /* Run the below code for Cortex-M0 */ + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_DSP) */ + + while (blkCnt > 0U) + { + /* C = A * B */ + /* Multiply the inputs and store the result in the destination buffer */ + *pDst++ = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c new file mode 100644 index 000000000..9ee217935 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c @@ -0,0 +1,392 @@ +/* + * Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_vec_mat_mult_t_s8 + * Description: s8 vector by matrix (transposed) multiplication + * + * $Date: 02. May 2021 + * $Revision: V.2.5.0 + * + * Target Processor: Cortex-M + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 vector(lhs) by matrix (transposed) multiplication + * + * Refer header file for details. + * + */ +arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, + const q7_t *rhs, + const q31_t *bias, + q7_t *dst, + const int32_t lhs_offset, + const int32_t rhs_offset, + const int32_t dst_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max) +{ + (void)rhs_offset; +#if defined(ARM_MATH_MVEI) + int32_t row_loop_cnt = rhs_rows / 3; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + const int8_t *rhs_2 = rhs + 2 * rhs_cols; + + int32_t rhs_sum_0 = 0; + int32_t rhs_sum_1 = 0; + int32_t rhs_sum_2 = 0; + + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); + rhs_sum_1 = vaddvaq_p_s8(rhs_sum_1, ker_1, p); + acc_1 = vmladavaq_p_s8(acc_1, ker_1, input, p); + + const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); + rhs_sum_2 = vaddvaq_p_s8(rhs_sum_2, ker_2, p); + acc_2 = vmladavaq_p_s8(acc_2, ker_2, input, p); + + lhs_vec += 16; + rhs_0 += 16; + rhs_1 += 16; + rhs_2 += 16; + } + rhs += 3 * rhs_cols; + + int32x4_t acc = {acc_0, acc_1, acc_2, 0}; + mve_pred16_t p = vctp32q(3); + if (bias) + { + int32x4_t b = vldrwq_z_s32(bias, p); + acc = vaddq_m_s32(vuninitializedq_s32(), acc, b, p); + bias += 3; + } + const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0}; + acc += vdupq_n_s32(lhs_offset) * rhs_sum; + + acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); + acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); + acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); + acc = vminq_s32(acc, vdupq_n_s32(activation_max)); + vstrbq_p_s32(dst, acc, p); + dst += 3; + } + + const int loop_cnt = rhs_rows % 3; + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + int32_t rhs_sum_0 = 0; + uint32_t col_cnt = (uint32_t)rhs_cols; + + for 
(int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + lhs_vec += 16; + rhs_0 += 16; + } + rhs += rhs_cols; + + if (bias) + { + acc_0 += *bias; + bias++; + } + const int32_t offsets = rhs_sum_0 * lhs_offset; + acc_0 += offsets; + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_0 += dst_offset; + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + *dst = MIN(acc_0, activation_max); + dst++; + } + +#elif defined(ARM_MATH_DSP) + int32_t row_loop_cnt = rhs_rows / 2; + + const int16_t lhs_offset_s16 = (int16_t)lhs_offset; + + const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16); + + for (int32_t i = 0; i < row_loop_cnt; i++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + if (bias) + { + acc_0 = *bias++; + acc_1 = *bias++; + } + + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + rhs += 2 * rhs_cols; + + for (int j = col_loop_cnt; j != 0; j--) + { + int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); + int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + + vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); + int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = __SXTB16(ker_0); + + acc_0 = __SMLAD(ker_1, vec_1, acc_0); + acc_0 = __SMLAD(ker_0, vec_0, acc_0); + + ker_0 = arm_nn_read_q7x4_ia(&rhs_1); + ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = __SXTB16(ker_0); + + acc_1 = __SMLAD(ker_1, vec_1, acc_1); + acc_1 = __SMLAD(ker_0, vec_0, acc_1); + } + + for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0); + rhs_0++; + acc_1 += lhs_temp * (*rhs_1); + rhs_1++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset; + acc_1 += dst_offset; + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + acc_1 = MAX(acc_1, activation_min); + acc_1 = MIN(acc_1, activation_max); + + *dst++ = (q7_t)acc_0; + *dst++ = (q7_t)acc_1; + } + + if (rhs_rows & 0x1) + { + int32_t acc_0 = 0; + if (bias) + { + acc_0 = *bias++; + } + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + + for (int i = col_loop_cnt; i != 0; i--) + { + int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); + int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); + int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = __SXTB16(ker_0); + + acc_0 = __SMLAD(ker_1, vec_1, acc_0); + acc_0 = __SMLAD(ker_0, vec_0, acc_0); + } + + for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0); + rhs_0++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset; + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + + *dst++ = (q7_t)acc_0; + } + +#else + + int32_t row_loop_cnt = rhs_rows / 3; + + for (int 
i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const q7_t *lhs_ptr = lhs; + const q7_t *rhs_ptr_0 = &rhs[0]; + const q7_t *rhs_ptr_1 = &rhs[rhs_cols]; + const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + q31_t res00 = 0; + q31_t res01 = 0; + q31_t res02 = 0; + if (bias) + { + res00 = *bias++; + res01 = *bias++; + res02 = *bias++; + } + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const q31_t rhs_value0 = (int8_t)*rhs_ptr_0; + const q31_t rhs_value1 = (int8_t)*rhs_ptr_1; + const q31_t rhs_value2 = (int8_t)*rhs_ptr_2; + const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + res02 += lhs_value * rhs_value2; + + ++rhs_ptr_0; + ++rhs_ptr_1; + ++rhs_ptr_2; + ++lhs_ptr; + } + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); + res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + res02 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res02 = MAX(res02, activation_min); + res02 = MIN(res02, activation_max); + + *dst++ = (q7_t)res00; + *dst++ = (q7_t)res01; + *dst++ = (q7_t)res02; + + rhs += 3 * rhs_cols; + } + + const int loop_cnt = rhs_rows % 3; + + for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) + { + const q7_t *lhs_ptr = &lhs[0]; + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = 0; + if (bias) + { + res00 = *bias++; + } + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q31_t rhs_value0 = (int8_t)rhs_ptr[0] + rhs_offset; + q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + + res00 += lhs_value * rhs_value0; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + + // Add offset + res00 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + *dst++ = (q7_t)res00; + rhs += rhs_cols; + } +#endif + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c new file mode 100644 index 000000000..1e799ac39 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_vec_mat_mult_t_svdf_s8 + * Description: s8 vector by matrix (transposed) multiplication with + * s16 output. Targeted at SVDF operator. + * + * $Date: 15. April 2021 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 vector(lhs) by matrix (transposed) multiplication + * + * Refer to header file for details. + * + */ +arm_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, + const q7_t *rhs, + q15_t *dst, + const int32_t lhs_offset, + const int32_t rhs_offset, + const int32_t dst_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max) +{ + (void)rhs_offset; + if (rhs_cols < 0 || (Q31_MAX - rhs_cols) < 16 || dst_offset < 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + +#if defined(ARM_MATH_MVEI) + int32_t row_loop_cnt = rhs_rows / 3; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + const int8_t *rhs_2 = rhs + 2 * rhs_cols; + + int32_t rhs_sum_0 = 0; + int32_t rhs_sum_1 = 0; + int32_t rhs_sum_2 = 0; + + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); + rhs_sum_1 = vaddvaq_p_s8(rhs_sum_1, ker_1, p); + acc_1 = vmladavaq_p_s8(acc_1, ker_1, input, p); + + const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); + rhs_sum_2 = vaddvaq_p_s8(rhs_sum_2, ker_2, p); + acc_2 = vmladavaq_p_s8(acc_2, ker_2, input, p); + + lhs_vec += 16; + rhs_0 += 16; + rhs_1 += 16; + rhs_2 += 16; + } + rhs += 3 * rhs_cols; + + int32x4_t acc = {acc_0, acc_1, acc_2, 0}; + const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0}; + acc += vdupq_n_s32(lhs_offset) * rhs_sum; + + acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); + acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); + acc = vminq_s32(acc, vdupq_n_s32(activation_max)); + *(dst) = (int16_t)acc[0]; + *(dst + dst_offset) = (int16_t)acc[1]; + *(dst + 2 * dst_offset) = (int16_t)acc[2]; + dst += 3 * dst_offset; + } + + const int loop_cnt = rhs_rows % 3; + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + int32_t rhs_sum_0 = 0; + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + lhs_vec += 16; + rhs_0 += 16; + } + rhs += rhs_cols; + +
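/* Fold the lhs zero-point into the dot product. This relies on the identity sum_j (x_j + lhs_offset) * w_j = sum_j (x_j * w_j) + lhs_offset * sum_j (w_j), where the row sum sum_j (w_j) was accumulated above as rhs_sum_0. */ +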
const int32_t offsets = rhs_sum_0 * lhs_offset; + acc_0 = __QADD(acc_0, offsets); + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + *dst = (q15_t)MIN(acc_0, activation_max); + dst += dst_offset; + } + +#elif defined(ARM_MATH_DSP) + int32_t row_loop_cnt = rhs_rows / 2; + + const int16_t lhs_offset_s16 = lhs_offset; + const int16_t rhs_offset_s16 = rhs_offset; + + const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16); + const uint32_t rhs_offset_s16x2 = __PKHBT(rhs_offset_s16, rhs_offset_s16, 16); + for (int32_t i = 0; i < row_loop_cnt; i++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + + const int32_t col_loop_cnt = rhs_cols / 4; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + rhs += 2 * rhs_cols; + for (int j = col_loop_cnt; j != 0; j--) + { + int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); + int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); + int32_t ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); + ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); + acc_0 = __SMLAD(ker_1, vec_1, acc_0); + acc_0 = __SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_q7x4_ia(&rhs_1); + ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); + ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); + acc_1 = __SMLAD(ker_1, vec_1, acc_1); + acc_1 = __SMLAD(ker_0, vec_0, acc_1); + } + for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0 + rhs_offset); + rhs_0++; + acc_1 += lhs_temp * (*rhs_1 + rhs_offset); + rhs_1++; + } + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + acc_1 = MAX(acc_1, activation_min); + acc_1 = MIN(acc_1, activation_max); + *dst = (q15_t)acc_0; + *(dst + dst_offset) = (q15_t)acc_1; + dst += 2 * dst_offset; + } + if (rhs_rows & 0x1) + { + int32_t acc_0 = 0; + const int32_t col_loop_cnt = rhs_cols / 4; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + for (int i = col_loop_cnt; i != 0; i--) + { + int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); + int32_t vec_1 = __SXTAB16(lhs_offset_s16x2, __ROR((uint32_t)vec_0, 8)); + vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); + int32_t ker_1 = __SXTAB16(rhs_offset_s16x2, __ROR((uint32_t)ker_0, 8)); + ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); + acc_0 = __SMLAD(ker_1, vec_1, acc_0); + acc_0 = __SMLAD(ker_0, vec_0, acc_0); + } + for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0 + rhs_offset); + rhs_0++; + } + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + *dst = (q15_t)acc_0; + dst += dst_offset; + } + +#else + + int32_t row_loop_cnt = rhs_rows / 3; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const q7_t *lhs_ptr = lhs; + const q7_t *rhs_ptr_0 = &rhs[0]; + const q7_t *rhs_ptr_1 = &rhs[rhs_cols]; + const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + q31_t res00 = 0; + q31_t res01 = 0; + q31_t res02 = 0; + for (int32_t 
rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const q31_t rhs_value0 = (int8_t)*rhs_ptr_0; + const q31_t rhs_value1 = (int8_t)*rhs_ptr_1; + const q31_t rhs_value2 = (int8_t)*rhs_ptr_2; + const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + res02 += lhs_value * rhs_value2; + + ++rhs_ptr_0; + ++rhs_ptr_1; + ++rhs_ptr_2; + ++lhs_ptr; + } + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); + res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res02 = MAX(res02, activation_min); + res02 = MIN(res02, activation_max); + + *dst = (q15_t)res00; + *(dst + dst_offset) = (q15_t)res01; + *(dst + 2 * dst_offset) = (q15_t)res02; + dst += 3 * dst_offset; + rhs += 3 * rhs_cols; + } + + const int loop_cnt = rhs_rows % 3; + + for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) + { + const q7_t *lhs_ptr = &lhs[0]; + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = 0; + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q31_t rhs_value0 = (int8_t)rhs_ptr[0] + rhs_offset; + q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + + res00 += lhs_value * rhs_value0; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + *dst = (q15_t)res00; + dst += dst_offset; + rhs += rhs_cols; + } +#endif + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nntables.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nntables.c new file mode 100644 index 000000000..5a8cea2fc --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_nntables.c @@ -0,0 +1,203 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nntables.c + * Description: Common tables for various activation functions + * + * $Date: 17. January 2018 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @brief tables for various activation functions + * + * This file includes the declaration of common tables.
+ * Most of them are used for activation functions + * + * Assumption: + * Unified table: input is 3.x format, i.e, range of [-8, 8) + * sigmoid(8) = 0.9996646498695336 + * tanh(8) = 0.9999997749296758 + * The accuracy here should be good enough + * + * 2-stage HL table: + * + * The entire input range is divided into two parts: + * + * Low range table: 0x000x xxxx or 0x111x xxxx + * table entry will be the binary number excluding the first + * two digits, i.e., 0x0x xxxx or 0x1x xxxx + * + * + * + * High range table 0x0010 0000 -- 0x0111 1111 + * 0x1000 0000 -- 0x1101 1111 + * + * For positive numbers, table entry will be + * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 + * i.e., 0x0000 0000 - 0x0101 11111 + * + * same thing for the negative numbers, table entry will be + * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 + * i.e., 0x0110 0000 - 0x1011 1111 + */ + +const q7_t sigmoidTable_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, 0x5e, 0x5f, 0x61, + 0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, + 0x75, 0x76, 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c, + 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, 0x06, + 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x11, 0x12, + 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, + 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + +const q15_t sigmoidTable_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, + 0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, 0x68a6, 0x69d2, 0x6af1, 0x6c05, + 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, + 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, + 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, + 0x7e69, 0x7e81, 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, + 0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, 0x7faf, 0x7fb4, + 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, + 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, + 0x7ff4, 
0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, + 0x0017, 0x0019, 0x001a, 0x001c, 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, + 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, + 0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, 0x024d, 0x0273, 0x029a, 0x02c4, + 0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, + 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, + 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, + 0x1f5f, 0x20e0, 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615, + 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + +const q15_t sigmoidLTable_q15[128] = { + 0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, + 0x4cd3, 0x4dc8, 0x4ebb, 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, 0x56ef, 0x57cd, + 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, + 0x62cc, 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, 0x68a6, 0x693d, 0x69d2, 0x6a63, + 0x6af1, 0x6b7c, 0x6c05, 0x6c8a, 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, 0x0f42, + 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, + 0x162e, 0x16c3, 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, 0x1c81, 0x1d34, 0x1dea, + 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833, + 0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, 0x3053, 0x3145, 0x3238, 0x332d, 0x3424, + 0x351b, 0x3615, 0x370f, 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00, +}; + +const q15_t sigmoidHTable_q15[192] = { + 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, + 0x792a, 0x798f, 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, + 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, 0x7e98, 0x7eae, + 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, + 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, + 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, + 0x7fe9, 0x7fea, 0x7feb, 0x7fed, 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c, + 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, + 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, 0x0085, 0x008e, 0x0097, 0x00a1, + 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f, + 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, 
+ 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, + 0x08a5, 0x092a, 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, +}; + +const q7_t tanhTable_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, 0x61, 0x65, 0x68, + 0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, 0x85, 0x85, 0x86, 0x87, + 0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, + 0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + +const q15_t tanhTable_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, + 0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, 0x73dc, 0x753a, 0x7672, 0x7788, + 0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, + 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, + 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, + 0x7ff6, 0x7ff7, 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, + 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, + 0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, 0x8016, 0x8019, 0x801c, 0x8020, + 0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, + 
0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, + 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, + 0x9869, 0x9b50, 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941, + 0xe0a7, 0xe847, 0xf015, 0xf803, +}; + +const q15_t tanhLTable_q15[128] = { + 0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, + 0x3151, 0x34ae, 0x37f6, 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, 0x514d, 0x53a3, + 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, + 0x6b6e, 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, 0x73dc, 0x7490, 0x753a, 0x75da, + 0x7672, 0x7701, 0x7788, 0x7807, 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, 0x849b, + 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, + 0x8ac6, 0x8b70, 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, 0x936b, 0x9492, 0x95c9, + 0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d, + 0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, + 0xd5a8, 0xd941, 0xdcec, 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00, +}; + +const q15_t tanhHTable_q15[192] = { + 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, + 0x7f30, 0x7f48, 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, + 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, 0x7ff8, 0x7ff9, + 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8002, + 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009, + 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, + 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, + 0x8156, 0x8183, 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, +}; diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c new file mode 100644 index 000000000..6f2f57530 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_q7_to_q15_no_shift.c + * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift + * + * $Date: May 29, 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup nndata_convert + * @{ + */ + +/** + * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift + * @param[in] *pSrc points to the Q7 input vector + * @param[out] *pDst points to the Q15 output vector + * @param[in] blockSize length of the input vector + * + * \par Description: + * + * The equation used for the conversion process is: + * + *
+ * 	pDst[n] = (q15_t) pSrc[n];   0 <= n < blockSize.
+ * 
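+ * A minimal usage sketch (illustrative values only):
+ * 	q7_t in[4] = {-1, 2, -3, 4};
+ * 	q15_t out[4];
+ * 	arm_q7_to_q15_no_shift(in, out, 4); // out = {-1, 2, -3, 4}, each widened to 16 bits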
+ * + */ + +void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize) +{ + const q7_t *pIn = pSrc; + uint32_t blkCnt; + +#if defined(ARM_MATH_DSP) + q31_t in; + q31_t in1, in2; + q31_t out1, out2; + + /* Loop unrolling */ + blkCnt = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. */ + while (blkCnt > 0u) + { + in = arm_nn_read_q7x4_ia(&pIn); + + /* rotate in by 8 and extend two q7_t values to q15_t values */ + in1 = __SXTB16(__ROR((uint32_t)in, 8)); + + /* extend remaining two q7_t values to q15_t values */ + in2 = __SXTB16(in); + +#ifndef ARM_MATH_BIG_ENDIAN + out2 = (int32_t)__PKHTB(in1, in2, 16); + out1 = (int32_t)__PKHBT(in2, in1, 16); +#else + out1 = (int32_t)__PKHTB(in1, in2, 16); + out2 = (int32_t)__PKHBT(in2, in1, 16); +#endif + arm_nn_write_q15x2_ia(&pDst, out1); + arm_nn_write_q15x2_ia(&pDst, out2); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + +#else + + /* Run the below code for Cortex-M0 */ + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + +#endif /* ARM_MATH_DSP */ + + while (blkCnt > 0u) + { + /* convert from q7 to q15 and then store the results in the destination buffer */ + *pDst++ = (q15_t)*pIn++; + + /* Decrement the loop counter */ + blkCnt--; + } +} + +/** + * @} end of nndata_convert group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c new file mode 100644 index 000000000..9017970df --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_q7_to_q15_reordered_no_shift.c + * Description: Converts the elements of the Q7 vector to reordered Q15 vector without left-shift + * + * $Date: May 29, 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup nndata_convert + * @{ + */ + +/** + * @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift + * @param[in] *pSrc points to the Q7 input vector + * @param[out] *pDst points to the Q15 output vector + * @param[in] blockSize length of the input vector + * + * @details + * + * This function does the q7 to q15 expansion with re-ordering + * + *
+ *                          |   A1   |   A2   |   A3   |   A4   |
+ *
+ *                           0      7 8     15 16    23 24    31
+ * 
+ * + * is converted into: + * + *
+ *  |       A1       |       A3       |   and  |       A2       |       A4       |
+ *
+ *   0             15 16            31          0             15 16            31
+ * 
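+ * For example (illustrative): input bytes {A1, A2, A3, A4} = {1, 2, 3, 4} are
+ * stored as the halfwords {1, 3} followed by {2, 4} on a little-endian core.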
+ * + * This looks strange but is natural considering how sign-extension is done at + * assembly level. + * + * The expansion of the other operand will follow the same rule so that the end + * results are the same. + * + * The tail (i.e., last (N % 4) elements) will still be in original order. + * + */ + +void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize) +{ + const q7_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + +#ifndef ARM_MATH_CM0_FAMILY + q31_t in; + q31_t in1, in2; + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* Loop unrolling */ + blkCnt = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0u) + { + /* C = (q15_t) A */ + /* convert from q7 to q15 and then store the results in the destination buffer */ + in = arm_nn_read_q7x4_ia(&pIn); + + /* rotate in by 8 and extend two q7_t values to q15_t values */ + in1 = __SXTB16(__ROR((uint32_t)in, 8)); + + /* extend remaining two q7_t values to q15_t values */ + in2 = __SXTB16(in); + +#ifndef ARM_MATH_BIG_ENDIAN + *__SIMD32(pDst)++ = in2; + *__SIMD32(pDst)++ = in1; +#else + *__SIMD32(pDst)++ = in1; + *__SIMD32(pDst)++ = in2; +#endif + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + +#else + + /* Run the below code for Cortex-M0 */ + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + + while (blkCnt > 0u) + { + /* C = (q15_t) A */ + /* convert from q7 to q15 and then store the results in the destination buffer */ + *pDst++ = (q15_t)*pIn++; + + /* Decrement the loop counter */ + blkCnt--; + } +} + +/** + * @} end of nndata_convert group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c new file mode 100644 index 000000000..765929d59 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_q7_to_q15_reordered_with_offset.c + * Description: Converts the elements of the Q7 vector to a reordered Q15 vector with an added offset. The re-ordering + * is a signature of the sign extension intrinsic (DSP extension).
+ * + * $Date: May 29, 2020 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup nndata_convert + * @{ + */ + +/** + * @brief Converts the elements of the Q7 vector to a reordered Q15 vector with an added offset. + * + * @note Refer to header file for details. + * + */ + +void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset) +{ + +#if defined(ARM_MATH_DSP) + uint32_t block_cnt; + /* Run the below code for cores that support SIMD instructions */ + q31_t in_q7x4; + q31_t out_q15x2_1; + q31_t out_q15x2_2; + + /* Loop unrolling */ + block_cnt = block_size >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. */ + const q31_t offset_q15x2 = (q31_t)__PKHBT(offset, offset, 16); + while (block_cnt > 0u) + { + /* convert from q7 to q15 and then store the results in the destination buffer */ + in_q7x4 = arm_nn_read_q7x4_ia(&src); + + /* Extract and sign extend each of the four q7 values to q15 */ + out_q15x2_1 = __SXTAB16(offset_q15x2, __ROR((uint32_t)in_q7x4, 8)); + out_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4); + + arm_nn_write_q15x2_ia(&dst, out_q15x2_2); + arm_nn_write_q15x2_ia(&dst, out_q15x2_1); + + block_cnt--; + } + /* Handle left over samples */ + block_cnt = block_size % 0x4u; + + while (block_cnt > 0u) + { + *dst++ = (q15_t)*src++ + offset; + + /* Decrement the loop counter */ + block_cnt--; + } +#else + (void)src; + (void)dst; + (void)block_size; + (void)offset; + /* Not available */ +#endif +} + +/** + * @} end of nndata_convert group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c new file mode 100644 index 000000000..ea29986d4 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_q7_to_q15_with_offset.c + * Description: Converts the elements of the Q7 vector to Q15 vector with an added offset + * + * $Date: March 3, 2020 + * $Revision: V.2.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup nndata_convert + * @{ + */ + +void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset) +{ + int block_cnt; + +#if defined(ARM_MATH_MVEI) + + int16x8_t source; + const int16x8_t source_offset = vdupq_n_s16(offset); + block_cnt = block_size / 8; + + while (block_cnt > 0) + { + source = vldrbq_s16(src); + source = vaddq_s16(source, source_offset); + vstrhq_s16(dst, source); + dst += 8; + src += 8; + block_cnt--; + } + + block_cnt = block_size & 0x7; + +#elif defined(ARM_MATH_DSP) + /* Run the below code for cores that support SIMD instructions */ + q31_t in_q7x4; + q31_t in_q15x2_1; + q31_t in_q15x2_2; + q31_t out_q15x2_1; + q31_t out_q15x2_2; + + /*loop unrolling */ + block_cnt = block_size >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. */ + const q31_t offset_q15x2 = __PKHBT(offset, offset, 16); + while (block_cnt > 0) + { + /* convert from q7 to q15 and then store the results in the destination buffer */ + in_q7x4 = arm_nn_read_q7x4_ia(&src); + + /* Extract and sign extend each of the four q7 values to q15 */ + in_q15x2_1 = __SXTAB16(offset_q15x2, __ROR(in_q7x4, 8)); + in_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4); + + out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); + out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); + + arm_nn_write_q15x2_ia(&dst, out_q15x2_1); + arm_nn_write_q15x2_ia(&dst, out_q15x2_2); + + block_cnt--; + } + /* Handle left over samples */ + block_cnt = block_size % 0x4; + +#else + /* Run the below code for Cortex-M0 */ + /* Loop over block_size number of values */ + block_cnt = block_size; +#endif + + while (block_cnt > 0) + { + *dst++ = (q15_t)*src++ + offset; + + /* Decrement the loop counter */ + block_cnt--; + } +} + +/** + * @} end of nndata_convert group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_avgpool_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_avgpool_s8.c new file mode 100644 index 000000000..fa59c084f --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_avgpool_s8.c @@ -0,0 +1,382 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_avgpool_s8.c + * Description: Pooling function implementations + * + * $Date: 01. March 2021 + * $Revision: V.2.0.4 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + +static void scale_q31_to_q7_and_clamp(const q31_t *buffer, + q7_t *target, + int32_t length, + const int32_t count, + const int act_min, + const int act_max) +{ + const int half_count = count / 2; + for (int i = 0; i < length; i++) + { + int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count); + sum = sum / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + target[i] = (q7_t)sum; + } +} +#endif + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * s8 average pooling function + * + * Refer to header file for details. + * + */ + +#if defined(ARM_MATH_MVEI) + +arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + (void)ctx; + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t ch_src = input_dims->c; + + int32_t i_x, i_y; + int32_t k_x, k_y; + + for (i_y = 0; i_y < output_y; i_y++) + { + for (i_x = 0; i_x < output_x; i_x++) + { + + int32_t k_y_start, k_y_end; + int32_t k_x_start, k_x_end; + int32_t chCnt; + const int8_t *pTmp, *pTmpInner; + int8_t *pDst; + + k_y_start = MAX(0, i_y * stride_y - pad_y); + k_y_end = MIN(i_y * stride_y - pad_y + kernel_y, input_y); + + k_x_start = MAX(0, i_x * stride_x - pad_x); + k_x_end = MIN(i_x * stride_x - pad_x + kernel_x, input_x); + + pTmp = src; + pDst = &dst[ch_src * (i_x + i_y * output_x)]; + + chCnt = ch_src >> 4; + while (chCnt > 0) + { + int32x4_t sumV1, sumV2, sumV3, sumV4; + + int8x16_t tempV; + int16x8_t tempVLO, tempVHI; + int32x4_t tempVLOLO, tempVLOHI, tempVHILO, tempVHIHI; + int32_t count = 0; + + sumV1 = vdupq_n_s32(0); + sumV2 = vdupq_n_s32(0); + sumV3 = vdupq_n_s32(0); + sumV4 = vdupq_n_s32(0); + + for (k_y = k_y_start; k_y < k_y_end; k_y++) + { + for (k_x = k_x_start; k_x < k_x_end; k_x++) + { + pTmpInner = pTmp + (ch_src * (k_x + k_y * input_x)); + tempV = vldrbq_s8(pTmpInner); + + tempVLO = vmovlbq_s8(tempV); + tempVHI = vmovltq_s8(tempV); + + tempVLOLO = vmovlbq_s16(tempVLO); + tempVLOHI = vmovltq_s16(tempVLO); + + tempVHILO = vmovlbq_s16(tempVHI); + tempVHIHI = vmovltq_s16(tempVHI); + + sumV1 = vaddq_s32(sumV1, tempVLOLO); + sumV2 = vaddq_s32(sumV2, tempVLOHI); + sumV3 = vaddq_s32(sumV3, tempVHILO); + sumV4 = vaddq_s32(sumV4, tempVHIHI); + + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. 
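+ // A zero count can only occur for degenerate pooling parameters where the
+ // window lies entirely outside the input.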
+ if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sumV1[0] = sumV1[0] > 0 ? (sumV1[0] + count / 2) / count : (sumV1[0] - count / 2) / count; + sumV1[1] = sumV1[1] > 0 ? (sumV1[1] + count / 2) / count : (sumV1[1] - count / 2) / count; + sumV1[2] = sumV1[2] > 0 ? (sumV1[2] + count / 2) / count : (sumV1[2] - count / 2) / count; + sumV1[3] = sumV1[3] > 0 ? (sumV1[3] + count / 2) / count : (sumV1[3] - count / 2) / count; + + sumV2[0] = sumV2[0] > 0 ? (sumV2[0] + count / 2) / count : (sumV2[0] - count / 2) / count; + sumV2[1] = sumV2[1] > 0 ? (sumV2[1] + count / 2) / count : (sumV2[1] - count / 2) / count; + sumV2[2] = sumV2[2] > 0 ? (sumV2[2] + count / 2) / count : (sumV2[2] - count / 2) / count; + sumV2[3] = sumV2[3] > 0 ? (sumV2[3] + count / 2) / count : (sumV2[3] - count / 2) / count; + + sumV3[0] = sumV3[0] > 0 ? (sumV3[0] + count / 2) / count : (sumV3[0] - count / 2) / count; + sumV3[1] = sumV3[1] > 0 ? (sumV3[1] + count / 2) / count : (sumV3[1] - count / 2) / count; + sumV3[2] = sumV3[2] > 0 ? (sumV3[2] + count / 2) / count : (sumV3[2] - count / 2) / count; + sumV3[3] = sumV3[3] > 0 ? (sumV3[3] + count / 2) / count : (sumV3[3] - count / 2) / count; + + sumV4[0] = sumV4[0] > 0 ? (sumV4[0] + count / 2) / count : (sumV4[0] - count / 2) / count; + sumV4[1] = sumV4[1] > 0 ? (sumV4[1] + count / 2) / count : (sumV4[1] - count / 2) / count; + sumV4[2] = sumV4[2] > 0 ? (sumV4[2] + count / 2) / count : (sumV4[2] - count / 2) / count; + sumV4[3] = sumV4[3] > 0 ? (sumV4[3] + count / 2) / count : (sumV4[3] - count / 2) / count; + + sumV1 = vmaxq_s32(sumV1, vdupq_n_s32(act_min)); + sumV1 = vminq_s32(sumV1, vdupq_n_s32(act_max)); + + sumV2 = vmaxq_s32(sumV2, vdupq_n_s32(act_min)); + sumV2 = vminq_s32(sumV2, vdupq_n_s32(act_max)); + + sumV3 = vmaxq_s32(sumV3, vdupq_n_s32(act_min)); + sumV3 = vminq_s32(sumV3, vdupq_n_s32(act_max)); + + sumV4 = vmaxq_s32(sumV4, vdupq_n_s32(act_min)); + sumV4 = vminq_s32(sumV4, vdupq_n_s32(act_max)); + + tempVLO = vmovnbq_s32(tempVLO, sumV1); + tempVLO = vmovntq_s32(tempVLO, sumV2); + + tempVHI = vmovnbq_s32(tempVHI, sumV3); + tempVHI = vmovntq_s32(tempVHI, sumV4); + + tempV = vmovnbq_s16(tempV, tempVLO); + tempV = vmovntq_s16(tempV, tempVHI); + + vstrbq_s8(pDst, tempV); + pDst += 16; + + chCnt--; + pTmp += 16; + } + + chCnt = ch_src & 0xF; + while (chCnt > 0) + { + int32_t sum = 0; + int32_t count = 0; + + for (k_y = k_y_start; k_y < k_y_end; k_y++) + { + for (k_x = k_x_start; k_x < k_x_end; k_x++) + { + sum += pTmp[ch_src * (k_x + k_y * input_x)]; + count++; + } + } + sum = sum > 0 ? 
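/* Round half away from zero before the integer divide: e.g. sum = 10, count = 4 yields (10 + 2) / 4 = 3 instead of the truncated 10 / 4 = 2 */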
(sum + count / 2) / count : (sum - count / 2) / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + *pDst++ = sum; + + chCnt--; + pTmp++; + } + } + } + return ARM_MATH_SUCCESS; +} + +#else +arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t ch_src = input_dims->c; + q31_t *buffer = (q31_t *)ctx->buf; + +#if defined(ARM_MATH_DSP) + + /* Run the following code for CPU's with DSP extension + */ + for (int i_y = 0, idx_y = -pad_y; i_y < output_y; idx_y += stride_y, i_y++) + { + for (int i_x = 0, idx_x = -pad_x; i_x < output_x; idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: + (base_idx_ + kernel__start) >= 0 */ + const int32_t kernel_y_start = MAX(0, -idx_y); + const int32_t kernel_x_start = MAX(0, -idx_x); + + /* Condition for kernel end dimension: + (base_idx_ + kernel__end) < dim_src_ */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - idx_x); + + int count = 0; + + for (int k_y = kernel_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++) + { + const q7_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); + + if (count == 0) + { + for (int i = 0; i < ch_src; i++) + { + buffer[i] = start[i]; + } + } + else + { + for (int i = 0; i < ch_src; i++) + { + buffer[i] = __QADD(start[i], buffer[i]); + } + } + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + scale_q31_to_q7_and_clamp(buffer, dst, ch_src, count, act_min, act_max); + dst += ch_src; + } + } +#else + + /* Reference C code adapted from CMSIS-NN arm_avepool_q7_HWC. + */ + (void)buffer; + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_y = 0; i_y < output_y; i_y++) + { + for (i_x = 0; i_x < output_x; i_x++) + { + for (i_ch_in = 0; i_ch_in < ch_src; i_ch_in++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - pad_y; k_y < i_y * stride_y - pad_y + kernel_y; k_y++) + { + for (k_x = i_x * stride_x - pad_x; k_x < i_x * stride_x - pad_x + kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < input_y && k_x < input_x) + { + sum += src[i_ch_in + ch_src * (k_x + k_y * input_x)]; + count++; + } + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sum = sum > 0 ? 
(sum + count / 2) / count : (sum - count / 2) / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum; + } + } + } + +#endif + return ARM_MATH_SUCCESS; +} + +#endif /* ARM_MATH_MVEI */ + +int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src) +{ + (void)output_x; + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return (ch_src * sizeof(int32_t)); +#else + (void)ch_src; + return 0; +#endif +} +/** + * @} end of Pooling group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_max_pool_s8.c new file mode 100644 index 000000000..36163667b --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_max_pool_s8.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_max_pool_s8.c + * Description: Pooling function implementations + * + * $Date: 19. February 2021 + * $Revision: V.2.0.2 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int32_t length) +{ +#if defined(ARM_MATH_MVEI) + int32_t loop_count = (length + 15) / 16; + for (int i = 0; i < loop_count; i++) + { + mve_pred16_t p = vctp8q((uint32_t)length); + const int8x16_t op_1 = vldrbq_z_s8(base, p); + const int8x16_t op_2 = vldrbq_z_s8(target, p); + const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p); + vstrbq_p_s8(base, max, p); + base += 16; + target += 16; + length -= 16; + } +#else + q7_t *dst = base; + const q7_t *src = target; + union arm_nnword ref_max; + union arm_nnword comp_max; + int32_t cnt = length >> 2; + + while (cnt > 0l) + { + ref_max.word = arm_nn_read_q7x4(dst); + comp_max.word = arm_nn_read_q7x4_ia(&src); + + if (comp_max.bytes[0] > ref_max.bytes[0]) + { + ref_max.bytes[0] = comp_max.bytes[0]; + } + if (comp_max.bytes[1] > ref_max.bytes[1]) + { + ref_max.bytes[1] = comp_max.bytes[1]; + } + if (comp_max.bytes[2] > ref_max.bytes[2]) + { + ref_max.bytes[2] = comp_max.bytes[2]; + } + if (comp_max.bytes[3] > ref_max.bytes[3]) + { + ref_max.bytes[3] = comp_max.bytes[3]; + } + + write_q7x4_ia(&dst, ref_max.word); + + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0l) + { + if (*src > *dst) + { + *dst = *src; + } + dst++; + src++; + cnt--; + } +#endif +} + +static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, const int32_t act_max) +{ +#if defined(ARM_MATH_MVEI) + int32_t loop_count = (length + 15) / 16; + for (int i = 0; i < loop_count; i++) + { + mve_pred16_t p = 
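/* vctp8q(n) builds a predicate enabling the first min(n, 16) byte lanes, so the final, partial vector is handled without over-reading */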
vctp8q((uint32_t)length); + length -= 16; + const int8x16_t src = vldrbq_z_s8(source, p); + const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p); + const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p); + int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p); + res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p); + vstrbq_p_s8(source, res, p); + source += 16; + } +#else + union arm_nnword in; + int32_t cnt = length >> 2; + + while (cnt > 0l) + { + in.word = arm_nn_read_q7x4(source); + + in.bytes[0] = MAX(in.bytes[0], act_min); + in.bytes[0] = MIN(in.bytes[0], act_max); + in.bytes[1] = MAX(in.bytes[1], act_min); + in.bytes[1] = MIN(in.bytes[1], act_max); + in.bytes[2] = MAX(in.bytes[2], act_min); + in.bytes[2] = MIN(in.bytes[2], act_max); + in.bytes[3] = MAX(in.bytes[3], act_min); + in.bytes[3] = MIN(in.bytes[3], act_max); + + write_q7x4_ia(&source, in.word); + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0l) + { + int32_t comp = *source; + comp = MAX(comp, act_min); + comp = MIN(comp, act_max); + *source++ = (int8_t)comp; + cnt--; + } +#endif +} + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * Optimized s8 max pooling function + * + * Refer to header file for details. + * + */ + +arm_status arm_max_pool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t channel_in = input_dims->c; + (void)ctx; + q7_t *dst_base = dst; + + for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) + { + for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: (base_idx_ + kernel__start) >= 0 */ + const int32_t ker_y_start = MAX(0, -base_idx_y); + const int32_t ker_x_start = MAX(0, -base_idx_x); + + /* Condition for kernel end dimension: (base_idx_ + kernel__end) < dim_src_ */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x); + + int count = 0; + + for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) + { + const q7_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); + + if (count == 0) + { + memcpy(dst, start, channel_in); + count++; + } + else + { + compare_and_replace_if_larger_q7(dst, start, channel_in); + } + } + } + /* 'count' is expected to be non-zero here. 
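Unlike the average-pooling path there is no division by 'count'; it only tracks whether the first window element has been copied into 'dst' via memcpy.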
*/ + dst += channel_in; + } + } + + clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max); + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of Pooling group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c new file mode 100644 index 000000000..7546049ce --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c @@ -0,0 +1,464 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_pool_q7_HWC.c + * Description: Pooling function implementations + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#if defined(ARM_MATH_DSP) + +/** + * @brief A few utility functions used by pooling functions + * + * + */ + +static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale) +{ + int i; + + for (i = 0; i < length; i++) + { + target[i] = (q7_t)(buffer[i] / scale); + } +} + +static void compare_and_replace_if_larger_q7(q7_t *base, // base data + const q7_t *target, // compare target + const uint16_t length // data length +) +{ + q7_t *pIn = base; + const q7_t *pCom = target; + union arm_nnword in; + union arm_nnword com; + uint16_t cnt = length >> 2; + + while (cnt > 0u) + { + in.word = arm_nn_read_q7x4((const q7_t *)pIn); + com.word = arm_nn_read_q7x4_ia((const q7_t **)&pCom); + + // if version + if (com.bytes[0] > in.bytes[0]) + in.bytes[0] = com.bytes[0]; + if (com.bytes[1] > in.bytes[1]) + in.bytes[1] = com.bytes[1]; + if (com.bytes[2] > in.bytes[2]) + in.bytes[2] = com.bytes[2]; + if (com.bytes[3] > in.bytes[3]) + in.bytes[3] = com.bytes[3]; + + *__SIMD32(pIn)++ = in.word; + + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0u) + { + if (*pCom > *pIn) + { + *pIn = *pCom; + } + pIn++; + pCom++; + cnt--; + } +} + +static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length) +{ + q15_t *pCnt = base; + q7_t *pV = target; + q31_t v1, v2, vo1, vo2; + uint16_t cnt = length >> 2; + q31_t in; + + while (cnt > 0u) + { + q31_t value = arm_nn_read_q7x4_ia((const q7_t **)&pV); + v1 = __SXTB16(__ROR(value, 8)); + v2 = __SXTB16(value); +#ifndef ARM_MATH_BIG_ENDIAN + + vo2 = __PKHTB(v1, v2, 16); + vo1 = __PKHBT(v2, v1, 16); + +#else + + vo1 = __PKHTB(v1, v2, 16); + vo2 = __PKHBT(v2, v1, 16); + +#endif + + in = arm_nn_read_q15x2(pCnt); + *__SIMD32(pCnt)++ = __QADD16(vo1, in); + + in = arm_nn_read_q15x2(pCnt); + *__SIMD32(pCnt)++ = __QADD16(vo2, in); + + cnt--; + } + cnt = length & 0x3; + while (cnt > 0u) + { + 
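/* Accumulate the remaining 1..3 elements one byte at a time */ +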
*pCnt++ += *pV++; + cnt--; + } +} + +#endif // ARM_MATH_DSP + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/** + * @brief Q7 max pooling function + * @param[in,out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA Not used + * @param[in,out] Im_out pointer to output tensor + * + * @details + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. + * + */ + +void arm_maxpool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out) +{ + (void)bufferA; +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_x, i_y; + + /* first do the pooling along the x axis */ + for (i_y = 0; i_y < dim_im_in; i_y++) + { + + for (i_x = 0; i_x < dim_im_out; i_x++) + { + /* for each output pixel */ + q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; + q7_t *win_start; + q7_t *win_stop; + if (i_x * stride - padding < 0) + { + win_start = target; + } + else + { + win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; + } + + if (i_x * stride - padding + dim_kernel >= dim_im_in) + { + win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; + } + else + { + win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; + } + + /* first step is to copy over initial data */ + /* arm_copy_q7(win_start, target, ch_im_in); */ + memmove(target, win_start, ch_im_in); + + /* start the max operation from the second part of the window */ + win_start += ch_im_in; + for (; win_start < win_stop; win_start += ch_im_in) + { + compare_and_replace_if_larger_q7(target, win_start, ch_im_in); + } + } + } + + /* then do the pooling along the y axis */ + for (i_y = 0; i_y < dim_im_out; i_y++) + { + + /* for each output row */ + q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; + q7_t *row_start; + q7_t *row_end; + /* setting the starting row */ + if (i_y * stride - padding < 0) + { + row_start = Im_in; + } + else + { + row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; + } + /* setting the stopping row */ + if (i_y * stride - padding + dim_kernel >= dim_im_in) + { + row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; + } + else + { + row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; + } + + /* copy over the first row */ + /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */ + memmove(target, row_start, dim_im_out * ch_im_in); + + /* move over to next row */ + row_start += ch_im_in * dim_im_in; + + for (; row_start < row_end; row_start += dim_im_in * ch_im_in) + { + compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in); + } + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out; i_y++) + { + for (i_x = 0; i_x < dim_im_out; i_x++) + { + int max = -129; /* sentinel below the q7_t minimum of -128 */ +
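/* Scan the pooling window; the bounds check below skips padding positions outside the input. */ +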
for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) + { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) + { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) + { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = (q7_t)max; + } + } + } + +#endif /* ARM_MATH_DSP */ +} + +/** + * @brief Q7 average pooling function + * @param[in,out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * + * @details + * + * Buffer size: + * + * bufferA size: 2*dim_im_out*ch_im_in + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. + * + */ + +void arm_avepool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out) +{ + +#if defined(ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + q15_t *buffer = (q15_t *)bufferA; + int16_t i_x, i_y; + int16_t count = 0; + + /* first do the pooling along the x axis */ + for (i_y = 0; i_y < dim_im_in; i_y++) + { + + for (i_x = 0; i_x < dim_im_out; i_x++) + { + /* for each output pixel */ + q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; + q7_t *win_start; + q7_t *win_stop; + if (i_x * stride - padding < 0) + { + win_start = target; + } + else + { + win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; + } + + if (i_x * stride - padding + dim_kernel >= dim_im_in) + { + win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; + } + else + { + win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; + } + + /* first step is to copy over initial data */ + arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in); + count = 1; + + /* accumulate the rest of the window, starting from the second part */ + win_start += ch_im_in; + for (; win_start < win_stop; win_start += ch_im_in) + { + accumulate_q7_to_q15(buffer, win_start, ch_im_in); + count++; + } + buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); + } + } + + /* then do the pooling along the y axis */ + for (i_y = 0; i_y < dim_im_out; i_y++) + { + /* for each output row */ + q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; + q7_t *row_start; + q7_t *row_end; + /* setting the starting row */ + if (i_y * stride - padding < 0) + { + row_start = Im_in; + } + else + { + row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; + } + /* setting the stopping row */ + if (i_y * stride - padding + dim_kernel >= dim_im_in) + { + row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; + } + else + { + row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; + } + + /* copy over the first row */ + arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); + count = 1; + + /* move over to next row */ + row_start += ch_im_in * dim_im_in; + + for (; row_start < row_end; row_start +=
dim_im_in * ch_im_in) + { + accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); + count++; + } + buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count); + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + + (void)bufferA; + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out; i_y++) + { + for (i_x = 0; i_x < dim_im_out; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) + { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; + } + } + } + +#endif /* ARM_MATH_DSP */ +} + +/** + * @} end of Pooling group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ReshapeFunctions/arm_reshape_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ReshapeFunctions/arm_reshape_s8.c new file mode 100644 index 000000000..cd839dcc0 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/ReshapeFunctions/arm_reshape_s8.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_reshape_s8.c + * Description: Reshape a s8 vector + * + * $Date: September 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Reshape + * @{ + */ + +/** + * Basic s8 reshape function. + * + * Refer header file for details. + * + */ + +void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size) +{ + memcpy(output, input, total_size); +} + +/** + * @} end of Reshape group + */ \ No newline at end of file diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SVDFunctions/arm_svdf_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SVDFunctions/arm_svdf_s8.c new file mode 100644 index 000000000..c8bed031d --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SVDFunctions/arm_svdf_s8.c @@ -0,0 +1,254 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_svdf_s8.c + * Description: S8 basic SVDF layer function + * + * $Date: 15. April 2021 + * $Revision: V.1.5.0 + * + * Target Processor: Cortex-M processors + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup SVDF + * @{ + */ + +/* + * S8 SVDF layer function for TensorFlow Lite + * + * Refer to header file for details. + * + */ + +arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx, + const cmsis_nn_context *output_ctx, + const cmsis_nn_svdf_params *svdf_params, + const cmsis_nn_per_tensor_quant_params *input_quant_params, + const cmsis_nn_per_tensor_quant_params *output_quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *state_dims, + q15_t *state_data, + const cmsis_nn_dims *weights_feature_dims, + const q7_t *weights_feature_data, + const cmsis_nn_dims *weights_time_dims, + const q15_t *weights_time_data, + const cmsis_nn_dims *bias_dims, + const q31_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + (void)bias_dims; + (void)state_dims; + (void)output_dims; + + const q31_t multiplier_in = input_quant_params->multiplier; + const q31_t shift_in = input_quant_params->shift; + const q31_t multiplier_out = output_quant_params->multiplier; + const q31_t shift_2 = output_quant_params->shift; + const int32_t zp_in = svdf_params->input_offset; + const int32_t zp_out = svdf_params->output_offset; + const int32_t in_activation_min = svdf_params->input_activation.min; + const int32_t in_activation_max = svdf_params->input_activation.max; + const int32_t out_activation_min = svdf_params->output_activation.min; + const int32_t out_activation_max = svdf_params->output_activation.max; + const int16_t rank = svdf_params->rank; + + const int32_t input_batches = input_dims->n; + const int32_t input_height = input_dims->h; + const int32_t feature_batches = weights_feature_dims->n; + const int32_t time_batches = weights_time_dims->h; + const int32_t unit_count = feature_batches / rank; + + q31_t *buffer_a = (q31_t *)input_ctx->buf; + q31_t *buffer_b = (q31_t *)output_ctx->buf; + + memmove((q15_t *)state_data, + (q15_t *)state_data + 1, + (size_t)(input_batches * feature_batches * time_batches * (int32_t)sizeof(int16_t))); + + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); + const q7_t *weight = weights_feature_data; + const q7_t *input = input_data + i_batch * input_height; + + arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input, + weight, + res_ptr, + -zp_in, + 0, + time_batches, + multiplier_in, + shift_in, + input_height, + feature_batches, + in_activation_min, + in_activation_max); + + if (res != ARM_MATH_SUCCESS) + { + return res; + } + } + + { + q31_t *ptr_a = buffer_a; + const q15_t *v2 = state_data; + for (int i_batch = 0; i_batch < 
input_batches; i_batch++) + { + const q15_t *v1 = weights_time_data; + + for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++) + { + *ptr_a = 0; + int32_t sum = 0; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + int j = 0; + int32_t block_count = time_batches >> 1; + for (int i = 0; i < block_count; i++) + { + j += 2; + q31_t r1 = arm_nn_read_q15x2_ia(&v1); + q31_t r2 = arm_nn_read_q15x2_ia(&v2); + + sum = __SMLAD(r1, r2, sum); + } + + // Process the remaining data + for (; j < time_batches; j++) + { + sum += *v1 * *v2; + v1++; + v2++; + } +#else + for (int j = 0; j < time_batches; j++) + { + sum += *v1 * *v2; + v1++; + v2++; + } +#endif + + *ptr_a = sum; + ptr_a++; + } + } + } + + if (bias_data) + { + if (unit_count == feature_batches) + { + for (int i = 0; i < input_batches; i++) + { + q31_t *output_temp = buffer_b + i * feature_batches; + const q31_t *ptr_a = buffer_a + i * feature_batches; + + const int32_t *bi = bias_data; + for (int j = 0; j < feature_batches; j++) + { + output_temp[j] = ptr_a[j] + bi[j]; + } + } + } + else + { + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q31_t *output_data_temp = buffer_b + i_batch * unit_count; + q31_t *ptr_a = buffer_a + i_batch * feature_batches; + + for (int i = 0; i < unit_count; i++) + { + int32_t sum = bias_data[i]; + for (int j = 0; j < rank; j++) + { + sum += *ptr_a; + ptr_a++; + } + output_data_temp[i] = sum; + } + } + } + } + else + { + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q31_t *output_data_temp = buffer_b + i_batch * unit_count; + q31_t *ptr_a = buffer_a + i_batch * feature_batches; + + for (int i = 0; i < unit_count; i++) + { + int32_t sum = 0; + for (int j = 0; j < rank; j++) + { + sum += *ptr_a; + ptr_a++; + } + output_data_temp[i] = sum; + } + } + } + +#if defined(ARM_MATH_MVEI) + int32_t num_elements = input_batches * unit_count; + const int32_t loop_count = (num_elements + 3) / 4; + for (int i_op = 0; i_op < loop_count; i_op++) + { + mve_pred16_t p = vctp32q((uint32_t)num_elements); + int32x4_t op = vldrwq_z_s32(buffer_b, p); + op = arm_requantize_mve(op, multiplier_out, shift_2); + op = vaddq_n_s32(op, zp_out); + const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min); + const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max); + op = vmaxq_s32(op, min_vec); + op = vminq_s32(op, max_vec); + vstrbq_p_s32(output_data, op, p); + output_data += 4; + buffer_b += 4; + num_elements -= 4; + } +#else + for (int i = 0; i < input_batches * unit_count; i++) + { + output_data[i] = (q7_t)CLAMP( + arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min); + } +#endif + + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of SVDF group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_q15.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_q15.c new file mode 100644 index 000000000..18f3e8364 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_q15.c @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_q15.c + * Description: Q15 softmax function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +/** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * + * @details + * + * Here, instead of the typical e-based softmax, we use + * a 2-based softmax, i.e.: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different, but mathematically + * the gradient is the same up to a log(2) scaling factor. + * + */ + +void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out) +{ + q31_t sum; + int16_t i; + uint8_t shift; + q31_t base; + base = -1 * 0x100000; + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + base = vec_in[i]; + } + } + + /* we ignore really small values; + * anyway, they will be 0 after shrinking + * to q15_t + */ + base = base - 16; + + sum = 0; + + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + shift = (uint8_t)__USAT(vec_in[i] - base, 5); + sum += 0x1 << shift; + } + } + + /* This is effectively (0x1 << 32) / sum */ + int64_t div_base = 0x100000000LL; + int32_t output_base = (int32_t)(div_base / sum); + + /* Final confidence will be output_base >> ( 17 - (vec_in[i] - base) ) + * so 32768 (0x1<<15) -> 100% confidence when sum = 0x1 << 16, output_base = 0x1 << 16 + * and vec_in[i]-base = 16 + */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + /* Here the minimum value of 17+base-vec_in[i] will be 1 */ + shift = (uint8_t)__USAT(17 + base - vec_in[i], 5); + p_out[i] = (q15_t)__SSAT((output_base >> shift), 16); + } + else + { + p_out[i] = 0; + } + } +} + +/** + * @} end of Softmax group + */
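A worked numeric sketch helps pin down the shift arithmetic shared by these softmax variants. The following self-contained example (an editorial illustration, not part of the patch; the input values are made up and __USAT is mimicked with a portable clamp) walks three q7 logits through the same steps as the arm_softmax_q7() implementation that follows:

#include <stdint.h>

/* Portable stand-in for the Cortex-M __USAT intrinsic:
 * unsigned saturation of v to the range [0, 2^bits - 1]. */
static uint32_t usat(int32_t v, uint32_t bits)
{
    const int32_t hi = (1 << (int32_t)bits) - 1;
    return (uint32_t)(v < 0 ? 0 : (v > hi ? hi : v));
}

/* Inputs {100, 99, 92}: base = 100 - 8 = 92, so
 * sum = 2^usat(8,3) + 2^usat(7,3) + 2^usat(0,3) = 128 + 128 + 1 = 257,
 * output_base = (1 << 20) / 257 = 4080, and the outputs are
 * 4080 >> 5 = 127, 4080 >> 6 = 63, 4080 >> 13 = 0. */
int main(void)
{
    const int8_t vec[3] = {100, 99, 92};
    int8_t out[3];
    int16_t base = -128;

    for (int i = 0; i < 3; i++)
    {
        if (vec[i] > base)
        {
            base = vec[i];
        }
    }
    base -= 8;

    int32_t sum = 0;
    for (int i = 0; i < 3; i++)
    {
        sum += 1 << usat(vec[i] - base, 3);
    }

    const int32_t output_base = (1 << 20) / sum;
    for (int i = 0; i < 3; i++)
    {
        const int32_t v = output_base >> usat(13 + base - vec[i], 5);
        out[i] = (int8_t)(v > 127 ? 127 : v); /* __SSAT(v, 8) for non-negative v */
    }

    return out[0] == 127 ? 0 : 1;
}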
diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_q7.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_q7.c new file mode 100644 index 000000000..58eb990d9 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_q7.c @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_q7.c + * Description: Q7 softmax function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +/** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * + * @details + * + * Here, instead of the typical natural-logarithm (e-based) softmax, we use + * a 2-based softmax, i.e.: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different, but mathematically + * the gradient is the same up to a log(2) scaling factor. + * + */ + +void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out) +{ + q31_t sum; + int16_t i; + uint8_t shift; + q15_t base; + base = -128; + + /* We first search for the maximum */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + base = vec_in[i]; + } + } + + /* + * The base is set to max - 8, meaning that we ignore really small values; + * they will be 0 after shrinking to q7_t anyway. + */ + base = base - (1 << 3); + + sum = 0; + + for (i = 0; i < dim_vec; i++) + { + shift = (uint8_t)__USAT(vec_in[i] - base, 3); + sum += 0x1 << shift; + } + + /* This is effectively (0x1 << 20) / sum */ + int32_t output_base = (1 << 20) / sum; + + for (i = 0; i < dim_vec; i++) + { + + /* Here the minimum value of 13+base-vec_in[i] will be 5 */ + shift = (uint8_t)__USAT(13 + base - vec_in[i], 5); + p_out[i] = (q7_t)__SSAT((output_base >> shift), 8); + } +} + +/** + * @} end of Softmax group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_s8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_s8.c new file mode 100644 index 000000000..09ac947c7 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_s8.c @@ -0,0 +1,261 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_s8.c + * Description: S8 softmax function + * + * $Date: 01.
March 2021 + * $Revision: V.2.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#define ACCUM_BITS 12 + +#ifdef ARM_MATH_MVEI +static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val) +{ +#define SHIFT_START (24) + int32_t shift = SHIFT_START; + int32x4_t mask; + + const int32x4_t val_mod_minus_quarter = + vandq_s32(val, vdupq_n_s32((1 << SHIFT_START) - 1)) - vdupq_n_s32(1 << SHIFT_START); + const int32x4_t remainder = vsubq_s32(val_mod_minus_quarter, val); + const int32x4_t x = vaddq_n_s32(val_mod_minus_quarter << 5, 1 << 28); + const int32x4_t x2 = MUL_SAT_MVE(x, x); + const int32x4_t op_1 = DIV_POW2_MVE(MUL_SAT_MVE(x2, x2), 2) + MUL_SAT_MVE(x2, x); + const int32x4_t op_2 = x + DIV_POW2_MVE(MUL_SAT_MVE(op_1, vdupq_n_s32(715827883)) + x2, 1); + int32x4_t result = vdupq_n_s32(1895147668) + MUL_SAT_MVE(vdupq_n_s32(1895147668), op_2); + +#define SELECT_IF_NON_ZERO(x) \ + { \ + mve_pred16_t p = vcmpneq_n_s32(remainder & vdupq_n_s32(1 << shift++), 0); \ + mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); \ + result = SELECT_USING_MASK(mask, MUL_SAT_MVE(result, vdupq_n_s32(x)), result); \ + } + + SELECT_IF_NON_ZERO(1672461947) + SELECT_IF_NON_ZERO(1302514674) + SELECT_IF_NON_ZERO(790015084) + SELECT_IF_NON_ZERO(290630308) + SELECT_IF_NON_ZERO(39332535) + SELECT_IF_NON_ZERO(720401) + SELECT_IF_NON_ZERO(242) + +#undef SELECT_IF_NON_ZERO + + mve_pred16_t p = vcmpeqq_n_s32(val, 0); + mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); + + result = SELECT_USING_MASK(mask, vdupq_n_s32(Q31_MAX), result); + return result; +} +#endif + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +void arm_softmax_s8(const int8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output) +{ +#ifdef ARM_MATH_MVEI + +#define ACT_MIN ((int8_t)Q7_MIN) +#define ACT_MAX ((int8_t)Q7_MAX) + + const int32_t mask = (1 << shift); + + for (int i_num_rows = 0; i_num_rows < num_rows; ++i_num_rows) + { + int8_t max = ACT_MIN; + + int32_t vec_count = (row_size + 15) / 16; + uint32_t r_count = (uint32_t)row_size; + for (int i = 0; i < vec_count; i++) + { + mve_pred16_t p = vctp8q(r_count); + const int8x16_t ip = vldrbq_z_s8(&input[i * 16], p); + max = vmaxvq_p_s8(max, ip, p); + r_count -= 16; + } + + vec_count = row_size / 4; + int32_t idx = 0; + int32_t sum = 0; + + while (vec_count) + { + int32x4_t ip = vldrbq_s32(&input[idx * 4]); + ip = vsubq_n_s32(ip, max); + mve_pred16_t p = vcmpgeq_n_s32(ip, diff_min); + if (p != 0) + { + ip = vmulq_n_s32(ip, mask); + + int32x4_t res = MUL_SAT_MVE(ip, vdupq_n_s32(mult)); + + res = arm_exp_on_negative_values_mve_32x4(res); + res = DIV_POW2_MVE(res, ACCUM_BITS); + res = vpselq_s32(res, vdupq_n_s32(0), p); + sum += vaddvq_s32(res); + } + + vec_count--; + idx++; + } + + const int32_t tail_idx = row_size & ~3; + for (int i = 0; i < (row_size & 3); i++) + { + const int32_t diff = input[tail_idx + i] - max; + if (diff >= diff_min) + { + sum += DIV_POW2(EXP_ON_NEG(MUL_SAT(diff * mask, mult)), ACCUM_BITS); + } + } + + const int32_t headroom = __CLZ((uint32_t)sum); + const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; + const int32_t shifted_scale = ONE_OVER1((sum > 0 ? 
sum << headroom : 0) - (1 << 31)); + + vec_count = row_size / 4; + idx = 0; + + while (vec_count) + { + int32x4_t ip = vldrbq_s32(&input[idx]); + ip = vsubq_n_s32(ip, max); + + mve_pred16_t p = vcmpgeq_n_s32(ip, diff_min); + + int32x4_t tmp_res; + + if (p != 0) + { + ip = vmulq_n_s32(ip, mask); + + tmp_res = MUL_SAT_MVE(ip, vdupq_n_s32(mult)); + tmp_res = arm_exp_on_negative_values_mve_32x4(tmp_res); + tmp_res = MUL_SAT_MVE(vdupq_n_s32(shifted_scale), tmp_res); + tmp_res = DIV_POW2_MVE(tmp_res, bits_over_unit); + tmp_res += vdupq_n_s32(ACT_MIN); + + tmp_res = vmaxq_s32(tmp_res, vdupq_n_s32(ACT_MIN)); + tmp_res = vminq_s32(tmp_res, vdupq_n_s32(ACT_MAX)); + tmp_res = vpselq_s32(tmp_res, vdupq_n_s32(ACT_MIN), p); + } + else + { + tmp_res = vdupq_n_s32(ACT_MIN); + } + vstrbq_s32(&output[idx], tmp_res); + vec_count--; + idx += 4; + } + + for (int i = 0; i < (row_size & 3); i++) + { + int32_t diff = input[tail_idx + i] - max; + if (diff >= diff_min) + { + const int32_t res = + DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit) - 128; + output[tail_idx + i] = (int8_t)CLAMP(res, (int32_t)ACT_MAX, (int32_t)ACT_MIN); + } + else + { + output[tail_idx + i] = ACT_MIN; + } + } + + input += row_size; + output += row_size; + } +#else + const int32_t mask = (1 << shift); + + int32_t col = 0; + int32_t row_idx; + + for (row_idx = 0; row_idx < num_rows; ++row_idx) + { + // Find the maximum value in order to ensure numerical stability + int8_t max = *input; + + for (col = 1; col < row_size; ++col) + { + max = MAX(max, input[col]); + } + + int32_t diff = 0; + int32_t sum = 0; + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + if (diff >= diff_min) + { + sum += DIV_POW2(EXP_ON_NEG(MUL_SAT(diff * mask, mult)), ACCUM_BITS); + } + } + + const int32_t headroom = __CLZ(sum); + const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; + const int32_t shifted_scale = ONE_OVER1((sum > 0 ? sum << headroom : 0) - (1 << 31)); + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + if (diff >= diff_min) + { + const int32_t res = + DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit) - 128; + output[col] = (int8_t)CLAMP(res, (int32_t)127, (int32_t)-128); + } + else + { + output[col] = -128; + } + } + input += row_size; + output += row_size; + } + +#endif +} +/** + * @} end of Softmax group + */ diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_u8.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_u8.c new file mode 100644 index 000000000..c4df8f8a4 --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_u8.c @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_u8.c + * Description: U8 softmax function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#define ACCUM_BITS 12 + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ +void arm_softmax_u8(const uint8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + uint8_t *output) +{ + const int32_t mask = (1 << shift); + + int32_t col = 0; + int32_t row_idx; + + for (row_idx = 0; row_idx < num_rows; ++row_idx) + { + // Find the maximum value in order to ensure numerical stability + uint8_t max = *input; + + for (col = 1; col < row_size; ++col) + { + max = MAX(max, input[col]); + } + + int32_t diff = 0; + int32_t sum = 0; + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + if (diff >= diff_min) + { + sum += DIV_POW2(EXP_ON_NEG(MUL_SAT(diff * mask, mult)), ACCUM_BITS); + } + } + + const int32_t headroom = __CLZ((uint32_t)sum); + const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; + const int32_t shifted_scale = ONE_OVER1((sum << headroom) - (1 << 31)); + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + if (diff >= diff_min) + { + const int32_t res = + DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit); + output[col] = (uint8_t)CLAMP(res, (int32_t)255, (int32_t)0); + } + else + { + output[col] = 0; + } + } + input += row_size; + output += row_size; + } +} +/** + * @} end of Softmax group + */ \ No newline at end of file diff --git a/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_with_batch_q7.c b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_with_batch_q7.c new file mode 100644 index 000000000..66e892e0a --- /dev/null +++ b/APP_Framework/Framework/knowing/cmsis_5/NN/Source/SoftmaxFunctions/arm_softmax_with_batch_q7.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_with_batch_q7.c + * Description: Q7 softmax function + * + * $Date: 09. 
October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M and Cortex-A cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +/** + * @brief Q7 softmax function with batch parameter + * @param[in] vec_in pointer to input vector + * @param[in] nb_batches number of batches + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * + * @details + * + * Here, instead of the typical natural-logarithm (e-based) softmax, we use + * a 2-based softmax, i.e.: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different, but mathematically + * the gradient is the same up to a log(2) scaling factor. + * + */ + +void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out) +{ + for (int i = 0; i < nb_batches; i++) + { + arm_softmax_q7(vec_in, dim_vec, p_out); + vec_in += dim_vec; + p_out += dim_vec; + } +} + +/** + * @} end of Softmax group + */
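A minimal usage sketch for the batched wrapper above (an illustration, not part of the patch; the batch count, vector length, and names are made-up assumptions):

#include "arm_nnfunctions.h"

/* Two batches of ten q7 logits; each row is normalized independently. */
void softmax_batch_demo(const q7_t *logits /* 2 * 10 values */, q7_t *probs /* 2 * 10 values */)
{
    arm_softmax_with_batch_q7(logits, 2, 10, probs);
}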