diff --git a/kernel/arch/risc-v/nuclei/gcc/los_arch_context.h b/kernel/arch/risc-v/nuclei/gcc/los_arch_context.h new file mode 100644 index 00000000..09e7ab9e --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/los_arch_context.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2013-2020, Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (c) 2020-2021 Huawei Device Co., Ltd. All rights reserved. + * Copyright (c) 2021 Nuclei Limited. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LOS_ARCH_CONTEXT_H +#define _LOS_ARCH_CONTEXT_H + +#include "los_compiler.h" +#include "los_context.h" + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif /* __cplusplus */ +#endif /* __cplusplus */ + +/** + * @ingroup los_hw + */ +typedef unsigned long STACK_TYPE; + +typedef struct { + STACK_TYPE epc; /* epc - epc - program counter */ + STACK_TYPE ra; /* x1 - ra - return address for jumps */ + STACK_TYPE t0; /* x5 - t0 - temporary register 0 */ + STACK_TYPE t1; /* x6 - t1 - temporary register 1 */ + STACK_TYPE t2; /* x7 - t2 - temporary register 2 */ + STACK_TYPE s0_fp; /* x8 - s0/fp - saved register 0 or frame pointer */ + STACK_TYPE s1; /* x9 - s1 - saved register 1 */ + STACK_TYPE a0; /* x10 - a0 - return value or function argument 0 */ + STACK_TYPE a1; /* x11 - a1 - return value or function argument 1 */ + STACK_TYPE a2; /* x12 - a2 - function argument 2 */ + STACK_TYPE a3; /* x13 - a3 - function argument 3 */ + STACK_TYPE a4; /* x14 - a4 - function argument 4 */ + STACK_TYPE a5; /* x15 - a5 - function argument 5 */ +#ifndef __riscv_32e + STACK_TYPE a6; /* x16 - a6 - function argument 6 */ + STACK_TYPE a7; /* x17 - s7 - function argument 7 */ + STACK_TYPE s2; /* x18 - s2 - saved register 2 */ + STACK_TYPE s3; /* x19 - s3 - saved register 3 */ + STACK_TYPE s4; /* x20 - s4 - saved register 4 */ + STACK_TYPE s5; /* x21 - s5 - saved register 5 */ + STACK_TYPE s6; /* x22 - s6 - saved register 6 */ + STACK_TYPE s7; /* x23 - s7 - saved register 7 */ + STACK_TYPE s8; /* x24 - s8 - saved register 8 */ + STACK_TYPE s9; /* x25 - s9 - saved register 9 */ + STACK_TYPE s10; /* x26 - s10 - saved register 10 */ + STACK_TYPE s11; /* x27 - s11 - saved register 11 */ + STACK_TYPE t3; /* x28 - t3 - temporary register 3 */ + STACK_TYPE t4; /* x29 - t4 - temporary register 4 */ + STACK_TYPE t5; /* x30 - t5 - temporary register 5 */ + STACK_TYPE t6; /* x31 - t6 - temporary register 6 */ +#endif + STACK_TYPE mstatus; /* - machine status register */ +} TaskContext; + +extern VOID HalStartToRun(VOID); + +#ifdef __cplusplus +#if __cplusplus +} +#endif /* __cplusplus */ +#endif /* __cplusplus */ + +#endif /* _LOS_HW_H */ diff --git a/kernel/arch/risc-v/nuclei/gcc/los_arch_interrupt.h b/kernel/arch/risc-v/nuclei/gcc/los_arch_interrupt.h new file mode 100644 index 00000000..63a27c8d --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/los_arch_interrupt.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2013-2020, Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (c) 2020-2021 Huawei Device Co., Ltd. All rights reserved. + * Copyright (c) 2021 Nuclei Limited. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LOS_HWI_H +#define _LOS_HWI_H + +#include "nuclei_sdk_soc.h" +#include "los_compiler.h" +#include "los_config.h" +#include "los_interrupt.h" +#include "los_arch_context.h" + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif /* __cplusplus */ +#endif /* __cplusplus */ +/** + * @ingroup los_hwi + * Count of Nuclei system interrupt vector. + */ +#define OS_RISCV_SYS_VECTOR_CNT 19 + +/** + * @ingroup los_hwi + * Count of Nuclei interrupt vector maxium, which is configurable. + */ +#define OS_RISCV_CUSTOM_IRQ_VECTOR_CNT SOC_INT_MAX + +/** + * @ingroup los_hwi + * Count of Nuclei interrupt vector. + */ +#define OS_RISCV_VECTOR_CNT (OS_RISCV_SYS_VECTOR_CNT + OS_RISCV_CUSTOM_IRQ_VECTOR_CNT) + +/** + * Maximum number of supported hardware devices that generate hardware interrupts. + */ +#define OS_HWI_MAX_NUM (OS_RISCV_VECTOR_CNT-1) + +extern VOID HalHwiDefaultHandler(VOID); + +/** + * @ingroup los_hwi + * Hardware interrupt error code: Invalid interrupt number. + * + * Value: 0x02000900 + * + * Solution: Ensure that the interrupt number is valid. The value range of the interrupt number applicable + * for a risc-v platform is [0, OS_RISCV_VECTOR_CNT]. + */ +#define OS_ERRNO_HWI_NUM_INVALID LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x00) + +/** + * @ingroup los_hwi + * Hardware interrupt error code: Null hardware interrupt handling function. + * + * Value: 0x02000901 + * + * Solution: Pass in a valid non-null hardware interrupt handling function. + */ +// #define OS_ERRNO_HWI_PROC_FUNC_NULL LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x01) + +/** + * @ingroup los_hwi + * Hardware interrupt error code: Insufficient interrupt resources for hardware interrupt creation. + * + * Value: 0x02000902 + * + * Solution: Increase the configured maximum number of supported hardware interrupts. + */ +// #define OS_ERRNO_HWI_CB_UNAVAILABLE LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x02) + +/** + * @ingroup los_hwi + * Hardware interrupt error code: Insufficient memory for hardware interrupt initialization. + * + * Value: 0x02000903 + * + * Solution: Expand the configured memory. + */ +// #define OS_ERRNO_HWI_NO_MEMORY LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x03) + +/** + * @ingroup los_hwi + * Hardware interrupt error code: The interrupt has already been created. + * + * Value: 0x02000904 + * + * Solution: Check whether the interrupt specified by the passed-in interrupt number has already been created. + */ +// #define OS_ERRNO_HWI_ALREADY_CREATED LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x04) + +/** + * @ingroup los_hwi + * Hardware interrupt error code: Invalid interrupt priority. + * + * Value: 0x02000905 + * + * Solution: Ensure that the interrupt priority is valid. + */ +// #define OS_ERRNO_HWI_PRIO_INVALID LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x05) + +/** + * @ingroup los_hwi + * Hardware interrupt error code: Incorrect interrupt creation mode. + * + * Value: 0x02000906 + * + * Solution: The interrupt creation mode can be only set to ECLIC_NON_VECTOR_INTERRUPT or ECLIC_VECTOR_INTERRUPT of which the + * value can be 0 or 1. + */ +#define OS_ERRNO_HWI_MODE_INVALID LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x06) + +/** + * @ingroup los_hwi + * Hardware interrupt error code: The interrupt has already been created as a fast interrupt. + * + * Value: 0x02000907 + * + * Solution: Check whether the interrupt specified by the passed-in interrupt number has already been created. + */ +// #define OS_ERRNO_HWI_FASTMODE_ALREADY_CREATED LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x07) + +/** + * @ingroup los_hwi + * Hardware interrupt error code: The API is called during an interrupt, which is forbidden. + * + * Value: 0x02000908 + * + * * Solution: Do not call the API during an interrupt. + */ +// #define OS_ERRNO_HWI_INTERR LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x08) + +/** + * @ingroup los_hwi + * Hardware interrupt error code:the hwi support SHARED error. + * + * Value: 0x02000909 + * + * * Solution:check the input params hwiMode and irqParam of HalHwiCreate or HalHwiDelete whether adapt the current + * hwi. + */ +// #define OS_ERRNO_HWI_SHARED_ERROR LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x09) + +/** + * @ingroup los_hwi + * Hardware interrupt error code:Invalid interrupt Arg. + * + * Value: 0x0200090a + * + * * Solution:check the interrupt Arg, Arg should only be ECLIC_LEVEL_TRIGGER, ECLIC_POSTIVE_EDGE_TRIGGER or + * ECLIC_NEGTIVE_EDGE_TRIGGER. + */ +#define OS_ERRNO_HWI_ARG_INVALID LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x0a) + +/** + * @ingroup los_hwi + * Hardware interrupt error code:The interrupt corresponded to the hwi number or devid has not been created. + * + * Value: 0x0200090b + * + * * Solution:check the hwi number or devid, make sure the hwi number or devid need to delete. + */ +// #define OS_ERRNO_HWI_HWINUM_UNCREATE LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x0b) + +extern UINT32 HalUnalignedAccessFix(UINTPTR mcause, UINTPTR mepc, UINTPTR mtval, VOID *sp); + +extern VOID DisplayTaskInfo(VOID); + +extern UINT32 g_intCount; + +__attribute__((always_inline)) static inline VOID HalIntEnter(VOID) +{ + g_intCount += 1; +} + +__attribute__((always_inline)) static inline VOID HalIntExit(VOID) +{ + g_intCount -= 1; +} + +__attribute__((always_inline)) static inline UINT32 HalIsIntAcvive(VOID) +{ + return (g_intCount > 0); +} + +#ifdef __cplusplus +#if __cplusplus +} +#endif /* __cplusplus */ +#endif /* __cplusplus */ + +#endif /* _LOS_HWI_H */ diff --git a/kernel/arch/risc-v/nuclei/gcc/los_arch_timer.h b/kernel/arch/risc-v/nuclei/gcc/los_arch_timer.h new file mode 100644 index 00000000..384bdd5b --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/los_arch_timer.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2013-2019 Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (c) 2020-2021 Huawei Device Co., Ltd. All rights reserved. + * Copyright (c) 2021 Nuclei Limited. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LOS_ARCH_TIMER_H +#define _LOS_ARCH_TIMER_H + +#include "los_config.h" +#include "los_compiler.h" +#include "los_context.h" + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif /* __cpluscplus */ +#endif /* __cpluscplus */ + +UINT32 HalTickStart(OS_TICK_HANDLER handler); + +#ifdef __cplusplus +#if __cplusplus +} +#endif /* __cpluscplus */ +#endif /* __cpluscplus */ + +#endif /* _LOS_ARCH_TIMER_H */ + diff --git a/kernel/arch/risc-v/nuclei/gcc/los_context.c b/kernel/arch/risc-v/nuclei/gcc/los_context.c new file mode 100644 index 00000000..3f309ea4 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/los_context.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "los_arch_context.h" +#include "los_arch_interrupt.h" +#include "los_arch_timer.h" +#include "los_task.h" +#include "los_memory.h" +#include "los_timer.h" +#include "nuclei_sdk_soc.h" + +#define INITIAL_MSTATUS ( MSTATUS_MPP | MSTATUS_MPIE | MSTATUS_FS_INITIAL) + +#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1)) + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif /* __cplusplus */ +#endif /* __cplusplus */ + +LITE_OS_SEC_TEXT_INIT VOID HalArchInit(VOID) +{ + HalHwiInit(); +} + +LITE_OS_SEC_TEXT_MINOR VOID HalSysExit(VOID) +{ + HalIntLock(); + while (1) { + } +} + +LITE_OS_SEC_TEXT_INIT VOID *HalTskStackInit(UINT32 taskID, UINT32 stackSize, VOID *topStack) +{ + UINT32 index; + UINT8 *stk; + TaskContext *context = NULL; + + /* initialize the task stack, write magic num to stack top */ + *((UINT32 *)(topStack)) = OS_TASK_MAGIC_WORD; + + stk = ((UINT8 *)topStack) + stackSize + sizeof(STACK_TYPE); + stk = (UINT8 *)ALIGN_DOWN((unsigned long)stk, REGBYTES); + context = (TaskContext *)(stk - sizeof(TaskContext)); + + for (index = 1; index < sizeof(TaskContext)/ sizeof(STACK_TYPE); index ++) { + ((STACK_TYPE *)context)[index] = OS_TASK_STACK_INIT; + } + context->ra = (STACK_TYPE)HalSysExit; + context->a0 = (STACK_TYPE)taskID; + context->epc = (STACK_TYPE)OsTaskEntry; + + context->mstatus = INITIAL_MSTATUS; + + + return (VOID *)context; +} + +extern BOOL g_taskScheduled; +extern LosTask g_losTask; +LITE_OS_SEC_TEXT_INIT UINT32 HalStartSchedule(OS_TICK_HANDLER handler) +{ + UINT32 ret; + __disable_irq(); + ret = HalTickStart(handler); + if (ret != LOS_OK) { + return ret; + } + g_taskScheduled = TRUE; + /* Set newTask to runTask */ + g_losTask.runTask = g_losTask.newTask; + g_losTask.runTask->taskStatus |= OS_TASK_STATUS_RUNNING; + HalStartToRun(); + return LOS_OK; /* never return */ +} + +VOID HalTaskSchedule(VOID) +{ + SysTimer_SetSWIRQ(); +} + +VOID HalTaskSwitch(VOID) +{ + SysTimer_ClearSWIRQ(); + g_losTask.runTask->taskStatus &= ~OS_TASK_STATUS_RUNNING; + /* Set newTask to runTask */ + g_losTask.runTask = g_losTask.newTask; + g_losTask.runTask->taskStatus |= OS_TASK_STATUS_RUNNING; +} + +LITE_OS_SEC_TEXT VOID HalTaskScheduleCheck(VOID) +{ +#if (LOSCFG_BASE_CORE_TSK_MONITOR == 1) + OsTaskSwitchCheck(); +#endif + return; +} + +VOID HalEnterSleep(LOS_SysSleepEnum sleep) +{ + __WFI(); +} + +#ifdef __cplusplus +#if __cplusplus +} +#endif /* __cplusplus */ +#endif /* __cplusplus */ diff --git a/kernel/arch/risc-v/nuclei/gcc/los_dispatch.S b/kernel/arch/risc-v/nuclei/gcc/los_dispatch.S new file mode 100644 index 00000000..99123fa2 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/los_dispatch.S @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2021 Nuclei Limited. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "riscv_encoding.h" + +#ifndef __riscv_32e +#define portRegNum 30 +#else +#define portRegNum 14 +#endif + +#define portCONTEXT_SIZE ( portRegNum * REGBYTES ) + + .section .text + .align 4 + + .type HalIntLock, %function + .global HalIntLock +HalIntLock: + csrr a0, mstatus // return value + li t0, MSTATUS_MIE // mie + csrrc zero, mstatus, t0 + ret + + .type HalIntUnLock, %function + .global HalIntUnLock +HalIntUnLock: + csrr a0, mstatus // return value + li t0, MSTATUS_MIE // mie + csrrs zero, mstatus, t0 + ret + + .type HalIntRestore, %function + .global HalIntRestore +HalIntRestore: + csrw mstatus, a0 + ret + + +/* Start the first task. This also clears the bit that indicates the FPU is + in use in case the FPU was used before the scheduler was started - which + would otherwise result in the unnecessary leaving of space in the stack + for lazy saving of FPU registers. */ + .type HalStartToRun, %function + .global HalStartToRun + .align 3 +HalStartToRun: + /* Setup Interrupt Stack using + The stack that was used by main() + before the scheduler is started is + no longer required after the scheduler is started. + Interrupt stack pointer is stored in CSR_MSCRATCH */ + la t0, _sp + csrw CSR_MSCRATCH, t0 + /* get stack pointer */ + la t0, g_losTask + LOAD t1, 0x0(t0) + LOAD sp, 0(t1) + //LOAD sp, 0x0(sp) /* Read sp from first TCB member */ + + /* Pop PC from stack and set MEPC */ + LOAD t0, 0 * REGBYTES(sp) + csrw CSR_MEPC, t0 + /* Pop mstatus from stack and set it */ + LOAD t0, (portRegNum - 1) * REGBYTES(sp) + csrw CSR_MSTATUS, t0 + /* Interrupt still disable here */ + /* Restore Registers from Stack */ + LOAD x1, 1 * REGBYTES(sp) /* RA */ + LOAD x5, 2 * REGBYTES(sp) + LOAD x6, 3 * REGBYTES(sp) + LOAD x7, 4 * REGBYTES(sp) + LOAD x8, 5 * REGBYTES(sp) + LOAD x9, 6 * REGBYTES(sp) + LOAD x10, 7 * REGBYTES(sp) + LOAD x11, 8 * REGBYTES(sp) + LOAD x12, 9 * REGBYTES(sp) + LOAD x13, 10 * REGBYTES(sp) + LOAD x14, 11 * REGBYTES(sp) + LOAD x15, 12 * REGBYTES(sp) +#ifndef __riscv_32e + LOAD x16, 13 * REGBYTES(sp) + LOAD x17, 14 * REGBYTES(sp) + LOAD x18, 15 * REGBYTES(sp) + LOAD x19, 16 * REGBYTES(sp) + LOAD x20, 17 * REGBYTES(sp) + LOAD x21, 18 * REGBYTES(sp) + LOAD x22, 19 * REGBYTES(sp) + LOAD x23, 20 * REGBYTES(sp) + LOAD x24, 21 * REGBYTES(sp) + LOAD x25, 22 * REGBYTES(sp) + LOAD x26, 23 * REGBYTES(sp) + LOAD x27, 24 * REGBYTES(sp) + LOAD x28, 25 * REGBYTES(sp) + LOAD x29, 26 * REGBYTES(sp) + LOAD x30, 27 * REGBYTES(sp) + LOAD x31, 28 * REGBYTES(sp) +#endif + + addi sp, sp, portCONTEXT_SIZE + + mret + +.extern HalTaskSwitch +.align 2 +.global eclic_msip_handler +eclic_msip_handler: + addi sp, sp, -portCONTEXT_SIZE + STORE x1, 1 * REGBYTES(sp) /* RA */ + STORE x5, 2 * REGBYTES(sp) + STORE x6, 3 * REGBYTES(sp) + STORE x7, 4 * REGBYTES(sp) + STORE x8, 5 * REGBYTES(sp) + STORE x9, 6 * REGBYTES(sp) + STORE x10, 7 * REGBYTES(sp) + STORE x11, 8 * REGBYTES(sp) + STORE x12, 9 * REGBYTES(sp) + STORE x13, 10 * REGBYTES(sp) + STORE x14, 11 * REGBYTES(sp) + STORE x15, 12 * REGBYTES(sp) +#ifndef __riscv_32e + STORE x16, 13 * REGBYTES(sp) + STORE x17, 14 * REGBYTES(sp) + STORE x18, 15 * REGBYTES(sp) + STORE x19, 16 * REGBYTES(sp) + STORE x20, 17 * REGBYTES(sp) + STORE x21, 18 * REGBYTES(sp) + STORE x22, 19 * REGBYTES(sp) + STORE x23, 20 * REGBYTES(sp) + STORE x24, 21 * REGBYTES(sp) + STORE x25, 22 * REGBYTES(sp) + STORE x26, 23 * REGBYTES(sp) + STORE x27, 24 * REGBYTES(sp) + STORE x28, 25 * REGBYTES(sp) + STORE x29, 26 * REGBYTES(sp) + STORE x30, 27 * REGBYTES(sp) + STORE x31, 28 * REGBYTES(sp) +#endif + /* Push mstatus to stack */ + csrr t0, CSR_MSTATUS + STORE t0, (portRegNum - 1) * REGBYTES(sp) + + /* Push additional registers */ + + /* Store sp to task stack */ + la t0, g_losTask + LOAD t0, 0(t0) + STORE sp, 0(t0) + + csrr t0, CSR_MEPC + STORE t0, 0(sp) + + /* Switch task context */ + jal HalTaskSwitch + /* Load new task */ + la t0, g_losTask + LOAD t0, 0(t0) + LOAD sp, 0x0(t0) /* Read sp from first TCB member */ + + /* Pop PC from stack and set MEPC */ + LOAD t0, 0 * REGBYTES(sp) + csrw CSR_MEPC, t0 + /* Pop additional registers */ + + /* Pop mstatus from stack and set it */ + LOAD t0, (portRegNum - 1) * REGBYTES(sp) + csrw CSR_MSTATUS, t0 + /* Interrupt still disable here */ + /* Restore Registers from Stack */ + LOAD x1, 1 * REGBYTES(sp) /* RA */ + LOAD x5, 2 * REGBYTES(sp) + LOAD x6, 3 * REGBYTES(sp) + LOAD x7, 4 * REGBYTES(sp) + LOAD x8, 5 * REGBYTES(sp) + LOAD x9, 6 * REGBYTES(sp) + LOAD x10, 7 * REGBYTES(sp) + LOAD x11, 8 * REGBYTES(sp) + LOAD x12, 9 * REGBYTES(sp) + LOAD x13, 10 * REGBYTES(sp) + LOAD x14, 11 * REGBYTES(sp) + LOAD x15, 12 * REGBYTES(sp) +#ifndef __riscv_32e + LOAD x16, 13 * REGBYTES(sp) + LOAD x17, 14 * REGBYTES(sp) + LOAD x18, 15 * REGBYTES(sp) + LOAD x19, 16 * REGBYTES(sp) + LOAD x20, 17 * REGBYTES(sp) + LOAD x21, 18 * REGBYTES(sp) + LOAD x22, 19 * REGBYTES(sp) + LOAD x23, 20 * REGBYTES(sp) + LOAD x24, 21 * REGBYTES(sp) + LOAD x25, 22 * REGBYTES(sp) + LOAD x26, 23 * REGBYTES(sp) + LOAD x27, 24 * REGBYTES(sp) + LOAD x28, 25 * REGBYTES(sp) + LOAD x29, 26 * REGBYTES(sp) + LOAD x30, 27 * REGBYTES(sp) + LOAD x31, 28 * REGBYTES(sp) +#endif + + addi sp, sp, portCONTEXT_SIZE + mret diff --git a/kernel/arch/risc-v/nuclei/gcc/los_exc.S b/kernel/arch/risc-v/nuclei/gcc/los_exc.S new file mode 100644 index 00000000..fea0f9ad --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/los_exc.S @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2021 Nuclei Limited. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LOS_EXC_S +#define _LOS_EXC_S + +#include "riscv_encoding.h" + +.section .text.entry +.align 8 + +/** + * \brief Global interrupt disabled + * \details + * This function disable global interrupt. + * \remarks + * - All the interrupt requests will be ignored by CPU. + */ +.macro DISABLE_MIE + csrc CSR_MSTATUS, MSTATUS_MIE +.endm + +/** + * \brief Macro for context save + * \details + * This macro save ABI defined caller saved registers in the stack. + * \remarks + * - This Macro could use to save context when you enter to interrupt + * or exception +*/ +/* Save caller registers */ +.macro SAVE_CONTEXT + csrrw sp, CSR_MSCRATCHCSWL, sp + /* Allocate stack space for context saving */ +#ifndef __riscv_32e + addi sp, sp, -20*REGBYTES +#else + addi sp, sp, -14*REGBYTES +#endif /* __riscv_32e */ + + STORE x1, 0*REGBYTES(sp) + STORE x4, 1*REGBYTES(sp) + STORE x5, 2*REGBYTES(sp) + STORE x6, 3*REGBYTES(sp) + STORE x7, 4*REGBYTES(sp) + STORE x10, 5*REGBYTES(sp) + STORE x11, 6*REGBYTES(sp) + STORE x12, 7*REGBYTES(sp) + STORE x13, 8*REGBYTES(sp) + STORE x14, 9*REGBYTES(sp) + STORE x15, 10*REGBYTES(sp) +#ifndef __riscv_32e + STORE x16, 14*REGBYTES(sp) + STORE x17, 15*REGBYTES(sp) + STORE x28, 16*REGBYTES(sp) + STORE x29, 17*REGBYTES(sp) + STORE x30, 18*REGBYTES(sp) + STORE x31, 19*REGBYTES(sp) +#endif /* __riscv_32e */ +.endm + +/** + * \brief Macro for restore caller registers + * \details + * This macro restore ABI defined caller saved registers from stack. + * \remarks + * - You could use this macro to restore context before you want return + * from interrupt or exeception + */ +/* Restore caller registers */ +.macro RESTORE_CONTEXT + LOAD x1, 0*REGBYTES(sp) + LOAD x4, 1*REGBYTES(sp) + LOAD x5, 2*REGBYTES(sp) + LOAD x6, 3*REGBYTES(sp) + LOAD x7, 4*REGBYTES(sp) + LOAD x10, 5*REGBYTES(sp) + LOAD x11, 6*REGBYTES(sp) + LOAD x12, 7*REGBYTES(sp) + LOAD x13, 8*REGBYTES(sp) + LOAD x14, 9*REGBYTES(sp) + LOAD x15, 10*REGBYTES(sp) +#ifndef __riscv_32e + LOAD x16, 14*REGBYTES(sp) + LOAD x17, 15*REGBYTES(sp) + LOAD x28, 16*REGBYTES(sp) + LOAD x29, 17*REGBYTES(sp) + LOAD x30, 18*REGBYTES(sp) + LOAD x31, 19*REGBYTES(sp) + + /* De-allocate the stack space */ + addi sp, sp, 20*REGBYTES +#else + /* De-allocate the stack space */ + addi sp, sp, 14*REGBYTES +#endif /* __riscv_32e */ + csrrw sp, CSR_MSCRATCHCSWL, sp +.endm + +/** + * \brief Macro for save necessary CSRs to stack + * \details + * This macro store MCAUSE, MEPC, MSUBM to stack. + */ +.macro SAVE_CSR_CONTEXT + /* Store CSR mcause to stack using pushmcause */ + csrrwi x0, CSR_PUSHMCAUSE, 11 + /* Store CSR mepc to stack using pushmepc */ + csrrwi x0, CSR_PUSHMEPC, 12 + /* Store CSR msub to stack using pushmsub */ + csrrwi x0, CSR_PUSHMSUBM, 13 +.endm + +/** + * \brief Macro for restore necessary CSRs from stack + * \details + * This macro restore MSUBM, MEPC, MCAUSE from stack. + */ +.macro RESTORE_CSR_CONTEXT + LOAD x5, 13*REGBYTES(sp) + csrw CSR_MSUBM, x5 + LOAD x5, 12*REGBYTES(sp) + csrw CSR_MEPC, x5 + LOAD x5, 11*REGBYTES(sp) + csrw CSR_MCAUSE, x5 +.endm + +/** + * \brief Exception/NMI Entry + * \details + * This function provide common entry functions for exception/nmi. + * \remarks + * This function provide a default exception/nmi entry. + * ABI defined caller save register and some CSR registers + * to be saved before enter interrupt handler and be restored before return. + */ +.section .text.trap +/* In CLIC mode, the exeception entry must be 64bytes aligned */ +.align 6 +.global exc_entry +exc_entry: + /* Save the caller saving registers (context) */ + SAVE_CONTEXT + /* Save the necessary CSR registers */ + SAVE_CSR_CONTEXT + + /* + * Set the exception handler function arguments + * argument 1: mcause value + * argument 2: current stack point(SP) value + */ + csrr a0, mcause + mv a1, sp + /* + * TODO: Call the exception handler function + * By default, the function template is provided in + * system_Device.c, you can adjust it as you want + */ + call core_exception_handler + + /* Restore the necessary CSR registers */ + RESTORE_CSR_CONTEXT + /* Restore the caller saving registers (context) */ + RESTORE_CONTEXT + + /* Return to regular code */ + mret + +/** + * \brief Non-Vector Interrupt Entry + * \details + * This function provide common entry functions for handling + * non-vector interrupts + * \remarks + * This function provide a default non-vector interrupt entry. + * ABI defined caller save register and some CSR registers need + * to be saved before enter interrupt handler and be restored before return. + */ +.section .text.irq +/* In CLIC mode, the interrupt entry must be 4bytes aligned */ +.align 2 +.extern g_intCount +.global irq_entry +/* This label will be set to MTVT2 register */ +irq_entry: + /* Save the caller saving registers (context) */ + SAVE_CONTEXT + /* Save the necessary CSR registers */ + SAVE_CSR_CONTEXT + + /* This special CSR read/write operation, which is actually + * claim the CLIC to find its pending highest ID, if the ID + * is not 0, then automatically enable the mstatus.MIE, and + * jump to its vector-entry-label, and update the link register + */ + la t0, g_intCount + lw t1, 0(t0) + add t1, t1, 0x1 + sw t1, 0(t0) + + csrrw ra, CSR_JALMNXTI, ra + + /* Critical section with interrupts disabled */ + DISABLE_MIE + + la t0, g_intCount + lw t1, 0(t0) + li t2, 0x1 + sub t1, t1, t2 + sw t1, 0(t0) + + /* Restore the necessary CSR registers */ + RESTORE_CSR_CONTEXT + /* Restore the caller saving registers (context) */ + RESTORE_CONTEXT + + /* Return to regular code */ + mret + +/* Default Handler for Exceptions / Interrupts */ +.global default_intexc_handler +Undef_Handler: +default_intexc_handler: +1: + j 1b + +#endif /* _LOS_TRAP_S */ + diff --git a/kernel/arch/risc-v/nuclei/gcc/los_interrupt.c b/kernel/arch/risc-v/nuclei/gcc/los_interrupt.c new file mode 100644 index 00000000..2e2f808b --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/los_interrupt.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2021 Nuclei Limited. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include "los_arch.h" +#include "los_arch_interrupt.h" +#include "los_arch_context.h" +#include "los_task.h" +#include "los_debug.h" +#include "nuclei_sdk_hal.h" + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif /* __cplusplus */ +#endif /* __cplusplus */ + +UINT32 g_intCount = 0; + +// LosExcInfo g_excInfo; +LITE_OS_SEC_TEXT_INIT VOID HalHwiInit(VOID) +{ + // already setup interrupt vectors +} + +/***************************************************************************** + Function : HalHwiCreate + Description : create hardware interrupt + Input : hwiNum --- hwi num to create + hwiPrio --- priority of the hwi + mode --- hwi interrupt mode, between vector or non-vector + handler --- hwi handler + arg --- set trig mode of the hwi handler + Level Triggerred = 0 + Postive/Rising Edge Triggered = 1 + Negtive/Falling Edge Triggered = 3 + Output : None + Return : LOS_OK on success or error code on failure + *****************************************************************************/ + UINT32 HalHwiCreate(HWI_HANDLE_T hwiNum, + HWI_PRIOR_T hwiPrio, + HWI_MODE_T mode, + HWI_PROC_FUNC handler, + HWI_ARG_T arg) +{ + if (hwiNum > SOC_INT_MAX){ + return OS_ERRNO_HWI_NUM_INVALID; + } + if (mode > ECLIC_VECTOR_INTERRUPT){ + return OS_ERRNO_HWI_MODE_INVALID; + } + if (arg > ECLIC_NEGTIVE_EDGE_TRIGGER){ + return OS_ERRNO_HWI_ARG_INVALID; + } + + /* set interrupt vector mode */ + ECLIC_SetShvIRQ(hwiNum, mode); + /* set interrupt trigger mode and polarity */ + ECLIC_SetTrigIRQ(hwiNum, arg); + /* set interrupt level */ + // default to 0 + ECLIC_SetLevelIRQ(hwiNum, 0); + /* set interrupt priority */ + ECLIC_SetPriorityIRQ(hwiNum, hwiPrio); + if (handler != NULL) { + /* set interrupt handler entry to vector table */ + ECLIC_SetVector(hwiNum, (rv_csr_t)handler); + } + /* enable interrupt */ + ECLIC_EnableIRQ(hwiNum); + return LOS_OK; +} + +/***************************************************************************** + Function : HalHwiDelete + Description : Delete hardware interrupt + Input : hwiNum --- hwi num to delete + Return : LOS_OK on success or error code on failure + *****************************************************************************/ +LITE_OS_SEC_TEXT UINT32 HalHwiDelete(HWI_HANDLE_T hwiNum) +{ + // change func to default func + ECLIC_SetVector(hwiNum, HalHwiDefaultHandler); + // disable interrupt + ECLIC_DisableIRQ(hwiNum); + return LOS_OK; +} + +/* **************************************************************************** + Function : HalHwiDefaultHandler + Description : default handler of the hardware interrupt + Input : None + Output : None + Return : None + **************************************************************************** */ +LITE_OS_SEC_TEXT_INIT VOID HalHwiDefaultHandler(VOID) +{ + PRINT_ERR("default handler\n"); + while (1) { + } +} + +/* **************************************************************************** + Function : HalDisplayTaskInfo + Description : display the task list + Input : None + Output : None + Return : None + **************************************************************************** */ +VOID HalDisplayTaskInfo(VOID) +{ + TSK_INFO_S taskInfo; + UINT32 index; + UINT32 ret; + + PRINTK("ID Pri Status name \r\n"); + PRINTK("-- --- --------- ----\r\n"); + + for (index = 0; index < LOSCFG_BASE_CORE_TSK_LIMIT; index++) { + ret = LOS_TaskInfoGet(index, &taskInfo); + if (ret != LOS_OK) { + continue; + } + PRINTK("%d %d %s %s \r\n", + taskInfo.uwTaskID, taskInfo.usTaskPrio, OsConvertTskStatus(taskInfo.usTaskStatus), taskInfo.acName); + } + return; +} + +/* **************************************************************************** + Function : HalUnalignedAccessFix + Description : Unaligned acess fixes are not supported by default + Input : None + Output : None + Return : None + **************************************************************************** */ +WEAK UINT32 HalUnalignedAccessFix(UINTPTR mcause, UINTPTR mepc, UINTPTR mtval, VOID *sp) +{ + /* Unaligned acess fixes are not supported by default */ + PRINTK("Unaligned acess fixes are not support by default!\r\n"); + return LOS_NOK; +} +#ifdef __cplusplus +#if __cplusplus +} +#endif /* __cplusplus */ +#endif /* __cplusplus */ diff --git a/kernel/arch/risc-v/nuclei/gcc/los_timer.c b/kernel/arch/risc-v/nuclei/gcc/los_timer.c new file mode 100644 index 00000000..ae0ce335 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/los_timer.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2013-2020, Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (c) 2020-2021 Huawei Device Co., Ltd. All rights reserved. + * Copyright (c) 2021 Nuclei Limited. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "los_tick.h" +#include "los_config.h" +#include "los_arch_interrupt.h" +#include "nuclei_sdk_hal.h" +#include "los_timer.h" + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif /* __cplusplus */ +#endif /* __cplusplus */ + +#define configKERNEL_INTERRUPT_PRIORITY 0 + +#define SYSTICK_TICK_CONST (SOC_TIMER_FREQ / LOSCFG_BASE_CORE_TICK_PER_SECOND) + +static OS_TICK_HANDLER systick_handler = (OS_TICK_HANDLER)NULL; + +extern UINT32 g_intCount; + +WEAK UINT32 HalTickStart(OS_TICK_HANDLER handler) +{ + SysTick_Config(SYSTICK_TICK_CONST); + ECLIC_DisableIRQ(SysTimer_IRQn); + ECLIC_SetLevelIRQ(SysTimer_IRQn, configKERNEL_INTERRUPT_PRIORITY); + ECLIC_SetShvIRQ(SysTimer_IRQn, ECLIC_NON_VECTOR_INTERRUPT); + ECLIC_EnableIRQ(SysTimer_IRQn); + + /* Set SWI interrupt level to lowest level/priority, SysTimerSW as Vector Interrupt */ + ECLIC_SetShvIRQ(SysTimerSW_IRQn, ECLIC_VECTOR_INTERRUPT); + ECLIC_SetLevelIRQ(SysTimerSW_IRQn, configKERNEL_INTERRUPT_PRIORITY); + ECLIC_EnableIRQ(SysTimerSW_IRQn); + g_sysClock = SystemCoreClock; + g_cyclesPerTick = g_sysClock / LOSCFG_BASE_CORE_TICK_PER_SECOND; + g_intCount = 0; + g_ullTickCount = 0; + + systick_handler = handler; + + return LOS_OK; /* never return */ +} + +#define HalTickSysTickHandler eclic_mtip_handler + +void HalTickSysTickHandler( void ) +{ + UINT32 intSave; + + intSave = LOS_IntLock(); + + SysTick_Reload(SYSTICK_TICK_CONST); + /* Do systick handler. */ + if ((void *)systick_handler != NULL) { + systick_handler(); + } + + LOS_IntRestore(intSave); +} +/* **************************************************************************** +Function : HalGetCpuCycle +Description : Get System cycle count +Input : none +output : cntHi --- CpuTick High 4 byte + cntLo --- CpuTick Low 4 byte +return : none +**************************************************************************** */ +LITE_OS_SEC_TEXT_MINOR VOID HalGetCpuCycle(UINT32 *cntHi, UINT32 *cntLo) +{ + volatile uint32_t high0, low, high; + + high0 = __RV_CSR_READ(CSR_MCYCLEH); + low = __RV_CSR_READ(CSR_MCYCLE); + high = __RV_CSR_READ(CSR_MCYCLEH); + if (high0 != high) { + low = __RV_CSR_READ(CSR_MCYCLE); + } + *cntHi = high; + *cntLo = low; + return; +} + +WEAK VOID HalDelay(UINT32 ticks) +{ + return; +} + +WEAK UINT64 HalGetExpandTick(VOID) +{ + return LOS_OK; +} + +WEAK INT32 HalGetRtcTime(UINT64 *usec) +{ + return LOS_OK; +} + +WEAK INT32 HalGetRtcTimeZone(INT32 *timeZone) +{ + return LOS_OK; +} + +WEAK INT32 HalSetRtcTime(UINT64 utcTime, UINT64 *usec) +{ + return LOS_OK; +} + +WEAK INT32 HalSetRtcTimeZone(INT32 timeZone) +{ + return LOS_OK; +} + +#ifdef __cplusplus +#if __cplusplus +} +#endif /* __cplusplus */ +#endif /* __cplusplus */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_compatiable.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_compatiable.h new file mode 100644 index 00000000..316a309f --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_compatiable.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CORE_COMPATIABLE_H__ +#define __CORE_COMPATIABLE_H__ +/*! + * @file core_compatiable.h + * @brief ARM compatiable function definitions header file + */ +#ifdef __cplusplus + extern "C" { +#endif + +/* ===== ARM Compatiable Functions ===== */ +/** + * \defgroup NMSIS_Core_ARMCompatiable_Functions ARM Compatiable Functions + * \ingroup NMSIS_Core + * \brief A few functions that compatiable with ARM CMSIS-Core. + * \details + * + * Here we provided a few functions that compatiable with ARM CMSIS-Core, + * mostly used in the DSP and NN library. + * @{ + */ +/** \brief Instruction Synchronization Barrier, compatiable with ARM */ +#define __ISB() __RWMB() + +/** \brief Data Synchronization Barrier, compatiable with ARM */ +#define __DSB() __RWMB() + +/** \brief Data Memory Barrier, compatiable with ARM */ +#define __DMB() __RWMB() + +/** \brief LDRT Unprivileged (8 bit), ARM Compatiable */ +#define __LDRBT(ptr) __LB((ptr)) +/** \brief LDRT Unprivileged (16 bit), ARM Compatiable */ +#define __LDRHT(ptr) __LH((ptr)) +/** \brief LDRT Unprivileged (32 bit), ARM Compatiable */ +#define __LDRT(ptr) __LW((ptr)) + +/** \brief STRT Unprivileged (8 bit), ARM Compatiable */ +#define __STRBT(val, ptr) __SB((ptr), (val)) +/** \brief STRT Unprivileged (16 bit), ARM Compatiable */ +#define __STRHT(val, ptr) __SH((ptr), (val)) +/** \brief STRT Unprivileged (32 bit), ARM Compatiable */ +#define __STRT(val, ptr) __SW((ptr), (val)) + +/* ===== Saturation Operations ===== */ +/** + * \brief Signed Saturate + * \details Saturates a signed value. + * \param [in] value Value to be saturated + * \param [in] sat Bit position to saturate to (1..32) + * \return Saturated value + */ +#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) +#define __SSAT(val, sat) __RV_SCLIP32((val), (sat-1)) +#else +__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat) +{ + if ((sat >= 1U) && (sat <= 32U)) { + const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U); + const int32_t min = -1 - max ; + if (val > max) { + return max; + } else if (val < min) { + return min; + } + } + return val; +} +#endif + +/** + * \brief Unsigned Saturate + * \details Saturates an unsigned value. + * \param [in] value Value to be saturated + * \param [in] sat Bit position to saturate to (0..31) + * \return Saturated value + */ +#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) +#define __USAT(val, sat) __RV_UCLIP32((val), (sat)) +#else +__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat) +{ + if (sat <= 31U) { + const uint32_t max = ((1U << sat) - 1U); + if (val > (int32_t)max) { + return max; + } else if (val < 0) { + return 0U; + } + } + return (uint32_t)val; +} +#endif + +/* ===== Data Processing Operations ===== */ +/** + * \brief Reverse byte order (32 bit) + * \details Reverses the byte order in unsigned integer value. + * For example, 0x12345678 becomes 0x78563412. + * \param [in] value Value to reverse + * \return Reversed value + */ +__STATIC_FORCEINLINE uint32_t __REV(uint32_t value) +{ + uint32_t result; + + result = ((value & 0xff000000) >> 24) + | ((value & 0x00ff0000) >> 8 ) + | ((value & 0x0000ff00) << 8 ) + | ((value & 0x000000ff) << 24); + return result; +} + +/** + * \brief Reverse byte order (16 bit) + * \details Reverses the byte order within each halfword of a word. + * For example, 0x12345678 becomes 0x34127856. + * \param [in] value Value to reverse + * \return Reversed value + */ +__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value) +{ + uint32_t result; + result = ((value & 0xff000000) >> 8) + | ((value & 0x00ff00000) << 8 ) + | ((value & 0x0000ff00) >> 8 ) + | ((value & 0x000000ff) << 8) ; + + return result; +} + +/** + * \brief Reverse byte order (16 bit) + * \details Reverses the byte order in a 16-bit value + * and returns the signed 16-bit result. + * For example, 0x0080 becomes 0x8000. + * \param [in] value Value to reverse + * \return Reversed value + */ +__STATIC_FORCEINLINE int16_t __REVSH(int16_t value) +{ + int16_t result; + result = ((value & 0xff00) >> 8) | ((value & 0x00ff) << 8); + return result; +} + +/** + * \brief Rotate Right in unsigned value (32 bit) + * \details Rotate Right (immediate) provides the value of + * the contents of a register rotated by a variable number of bits. + * \param [in] op1 Value to rotate + * \param [in] op2 Number of Bits to rotate(0-31) + * \return Rotated value + */ +__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2) +{ + op2 = op2 & 0x1F; + if (op2 == 0U) { + return op1; + } + return (op1 >> op2) | (op1 << (32U - op2)); +} + +/** + * \brief Reverse bit order of value + * \details Reverses the bit order of the given value. + * \param [in] value Value to reverse + * \return Reversed value + */ +#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) +#define __RBIT(value) __RV_BITREVI((value), 31) +#else +__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value) +{ + uint32_t result; + uint32_t s = (4U /*sizeof(v)*/ * 8U) - 1U; /* extra shift needed at end */ + + result = value; /* r will be reversed bits of v; first get LSB of v */ + for (value >>= 1U; value != 0U; value >>= 1U) { + result <<= 1U; + result |= value & 1U; + s--; + } + result <<= s; /* shift when v's highest bits are zero */ + return result; +} +#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */ + +/** + * \brief Count leading zeros + * \details Counts the number of leading zeros of a data value. + * \param [in] data Value to count the leading zeros + * \return number of leading zeros in value + */ +#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) +#define __CLZ(data) __RV_CLZ32(data) +#else +__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data) +{ + uint8_t ret = 0; + uint32_t temp = ~data; + while (temp & 0x80000000) { + temp <<= 1; + ret++; + } + return ret; +} +#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */ + +/** @} */ /* End of Doxygen Group NMSIS_Core_ARMCompatiable_Functions */ + +#ifdef __cplusplus +} +#endif +#endif /* __CORE_COMPATIABLE_H__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_base.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_base.h new file mode 100644 index 00000000..5f351a33 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_base.h @@ -0,0 +1,1177 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CORE_FEATURE_BASE__ +#define __CORE_FEATURE_BASE__ +/*! + * @file core_feature_base.h + * @brief Base core feature API for Nuclei N/NX Core + */ +#include +#include "riscv_encoding.h" + +#ifdef __cplusplus + extern "C" { +#endif + +/** + * \defgroup NMSIS_Core_Registers Register Define and Type Definitions + * \brief Type definitions and defines for core registers. + * + * @{ + */ +#ifndef __RISCV_XLEN + /** \brief Refer to the width of an integer register in bits(either 32 or 64) */ + #ifndef __riscv_xlen + #define __RISCV_XLEN 32 + #else + #define __RISCV_XLEN __riscv_xlen + #endif +#endif /* __RISCV_XLEN */ + +/** \brief Type of Control and Status Register(CSR), depends on the XLEN defined in RISC-V */ +#if __RISCV_XLEN == 32 + typedef uint32_t rv_csr_t; +#elif __RISCV_XLEN == 64 + typedef uint64_t rv_csr_t; +#else + typedef uint32_t rv_csr_t; +#endif +/** @} */ /* End of Doxygen Group NMSIS_Core_Registers */ +/** + * \defgroup NMSIS_Core_Base_Registers Base Register Define and Type Definitions + * \ingroup NMSIS_Core_Registers + * \brief Type definitions and defines for base core registers. + * + * @{ + */ +/** + * \brief Union type to access MISA register. + */ +typedef union { + struct { + rv_csr_t a:1; /*!< bit: 0 Atomic extension */ + rv_csr_t b:1; /*!< bit: 1 Tentatively reserved for Bit-Manipulation extension */ + rv_csr_t c:1; /*!< bit: 2 Compressed extension */ + rv_csr_t d:1; /*!< bit: 3 Double-precision floating-point extension */ + rv_csr_t e:1; /*!< bit: 4 RV32E base ISA */ + rv_csr_t f:1; /*!< bit: 5 Single-precision floating-point extension */ + rv_csr_t g:1; /*!< bit: 6 Additional standard extensions present */ + rv_csr_t h:1; /*!< bit: 7 Hypervisor extension */ + rv_csr_t i:1; /*!< bit: 8 RV32I/64I/128I base ISA */ + rv_csr_t j:1; /*!< bit: 9 Tentatively reserved for Dynamically Translated Languages extension */ + rv_csr_t _reserved1:1; /*!< bit: 10 Reserved */ + rv_csr_t l:1; /*!< bit: 11 Tentatively reserved for Decimal Floating-Point extension */ + rv_csr_t m:1; /*!< bit: 12 Integer Multiply/Divide extension */ + rv_csr_t n:1; /*!< bit: 13 User-level interrupts supported */ + rv_csr_t _reserved2:1; /*!< bit: 14 Reserved */ + rv_csr_t p:1; /*!< bit: 15 Tentatively reserved for Packed-SIMD extension */ + rv_csr_t q:1; /*!< bit: 16 Quad-precision floating-point extension */ + rv_csr_t _resreved3:1; /*!< bit: 17 Reserved */ + rv_csr_t s:1; /*!< bit: 18 Supervisor mode implemented */ + rv_csr_t t:1; /*!< bit: 19 Tentatively reserved for Transactional Memory extension */ + rv_csr_t u:1; /*!< bit: 20 User mode implemented */ + rv_csr_t v:1; /*!< bit: 21 Tentatively reserved for Vector extension */ + rv_csr_t _reserved4:1; /*!< bit: 22 Reserved */ + rv_csr_t x:1; /*!< bit: 23 Non-standard extensions present */ +#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64 + rv_csr_t _reserved5:38; /*!< bit: 24..61 Reserved */ + rv_csr_t mxl:2; /*!< bit: 62..63 Machine XLEN */ +#else + rv_csr_t _reserved5:6; /*!< bit: 24..29 Reserved */ + rv_csr_t mxl:2; /*!< bit: 30..31 Machine XLEN */ +#endif + } b; /*!< Structure used for bit access */ + rv_csr_t d; /*!< Type used for csr data access */ +} CSR_MISA_Type; + +/** + * \brief Union type to access MSTATUS configure register. + */ +typedef union { + struct { +#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64 + rv_csr_t _reserved0:3; /*!< bit: 0..2 Reserved */ + rv_csr_t mie:1; /*!< bit: 3 Machine mode interrupt enable flag */ + rv_csr_t _reserved1:3; /*!< bit: 4..6 Reserved */ + rv_csr_t mpie:1; /*!< bit: 7 mirror of MIE flag */ + rv_csr_t _reserved2:3; /*!< bit: 8..10 Reserved */ + rv_csr_t mpp:2; /*!< bit: 11..12 mirror of Privilege Mode */ + rv_csr_t fs:2; /*!< bit: 13..14 FS status flag */ + rv_csr_t xs:2; /*!< bit: 15..16 XS status flag */ + rv_csr_t mprv:1; /*!< bit: Machine mode PMP */ + rv_csr_t _reserved3:14; /*!< bit: 18..31 Reserved */ + rv_csr_t uxl:2; /*!< bit: 32..33 user mode xlen */ + rv_csr_t _reserved6:29; /*!< bit: 34..62 Reserved */ + rv_csr_t sd:1; /*!< bit: Dirty status for XS or FS */ +#else + rv_csr_t _reserved0:1; /*!< bit: 0 Reserved */ + rv_csr_t sie:1; /*!< bit: 1 supervisor interrupt enable flag */ + rv_csr_t _reserved1:1; /*!< bit: 2 Reserved */ + rv_csr_t mie:1; /*!< bit: 3 Machine mode interrupt enable flag */ + rv_csr_t _reserved2:1; /*!< bit: 4 Reserved */ + rv_csr_t spie:1; /*!< bit: 3 Supervisor Privilede mode interrupt enable flag */ + rv_csr_t _reserved3:1; /*!< bit: Reserved */ + rv_csr_t mpie:1; /*!< bit: mirror of MIE flag */ + rv_csr_t _reserved4:3; /*!< bit: Reserved */ + rv_csr_t mpp:2; /*!< bit: mirror of Privilege Mode */ + rv_csr_t fs:2; /*!< bit: FS status flag */ + rv_csr_t xs:2; /*!< bit: XS status flag */ + rv_csr_t mprv:1; /*!< bit: Machine mode PMP */ + rv_csr_t sum:1; /*!< bit: Supervisor Mode load and store protection */ + rv_csr_t _reserved6:12; /*!< bit: 19..30 Reserved */ + rv_csr_t sd:1; /*!< bit: Dirty status for XS or FS */ +#endif + } b; /*!< Structure used for bit access */ + rv_csr_t d; /*!< Type used for csr data access */ +} CSR_MSTATUS_Type; + +/** + * \brief Union type to access MTVEC configure register. + */ +typedef union { + struct { + rv_csr_t mode:6; /*!< bit: 0..5 interrupt mode control */ +#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64 + rv_csr_t addr:58; /*!< bit: 6..63 mtvec address */ +#else + rv_csr_t addr:26; /*!< bit: 6..31 mtvec address */ +#endif + } b; /*!< Structure used for bit access */ + rv_csr_t d; /*!< Type used for csr data access */ +} CSR_MTVEC_Type; + +/** + * \brief Union type to access MCAUSE configure register. + */ +typedef union { + struct { + rv_csr_t exccode:12; /*!< bit: 11..0 exception or interrupt code */ + rv_csr_t _reserved0:4; /*!< bit: 15..12 Reserved */ + rv_csr_t mpil:8; /*!< bit: 23..16 Previous interrupt level */ + rv_csr_t _reserved1:3; /*!< bit: 26..24 Reserved */ + rv_csr_t mpie:1; /*!< bit: 27 Interrupt enable flag before enter interrupt */ + rv_csr_t mpp:2; /*!< bit: 29..28 Privilede mode flag before enter interrupt */ + rv_csr_t minhv:1; /*!< bit: 30 Machine interrupt vector table */ +#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64 + rv_csr_t _reserved2:32; /*!< bit: 31..62 Reserved */ + rv_csr_t interrupt:1; /*!< bit: 63 trap type. 0 means exception and 1 means interrupt */ +#else + rv_csr_t interrupt:1; /*!< bit: 31 trap type. 0 means exception and 1 means interrupt */ +#endif + } b; /*!< Structure used for bit access */ + rv_csr_t d; /*!< Type used for csr data access */ +} CSR_MCAUSE_Type; + +/** + * \brief Union type to access MCOUNTINHIBIT configure register. + */ +typedef union { + struct { + rv_csr_t cy:1; /*!< bit: 0 1 means disable mcycle counter */ + rv_csr_t _reserved0:1; /*!< bit: 1 Reserved */ + rv_csr_t ir:1; /*!< bit: 2 1 means disable minstret counter */ +#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64 + rv_csr_t _reserved1:61; /*!< bit: 3..63 Reserved */ +#else + rv_csr_t _reserved1:29; /*!< bit: 3..31 Reserved */ +#endif + } b; /*!< Structure used for bit access */ + rv_csr_t d; /*!< Type used for csr data access */ +} CSR_MCOUNTINHIBIT_Type; + +/** + * \brief Union type to access msubm configure register. + */ +typedef union { + struct { + rv_csr_t _reserved0:6; /*!< bit: 0..5 Reserved */ + rv_csr_t typ:2; /*!< bit: 6..7 current trap type */ + rv_csr_t ptyp:2; /*!< bit: 8..9 previous trap type */ +#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64 + rv_csr_t _reserved1:54; /*!< bit: 10..63 Reserved */ +#else + rv_csr_t _reserved1:22; /*!< bit: 10..31 Reserved */ +#endif + } b; /*!< Structure used for bit access */ + rv_csr_t d; /*!< Type used for csr data access */ +} CSR_MSUBM_Type; + +/** + * \brief Union type to access MMISC_CTRL configure register. + */ +typedef union { + struct { + rv_csr_t _reserved0:3; /*!< bit: 0..2 Reserved */ + rv_csr_t bpu:1; /*!< bit: 3 dynamic prediction enable flag */ + rv_csr_t _reserved1:2; /*!< bit: 4..5 Reserved */ + rv_csr_t misalign:1; /*!< bit: 6 misaligned access support flag */ + rv_csr_t _reserved2:2; /*!< bit: 7..8 Reserved */ + rv_csr_t nmi_cause:1; /*!< bit: 9 mnvec control and nmi mcase exccode */ +#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64 + rv_csr_t _reserved3:54; /*!< bit: 10..63 Reserved */ +#else + rv_csr_t _reserved3:22; /*!< bit: 10..31 Reserved */ +#endif + } b; /*!< Structure used for bit access */ + rv_csr_t d; /*!< Type used for csr data access */ +} CSR_MMISCCTRL_Type; + + +/** + * \brief Union type to access MSAVESTATUS configure register. + */ +typedef union { + struct { + rv_csr_t mpie1:1; /*!< bit: 0 interrupt enable flag of fisrt level NMI/exception nestting */ + rv_csr_t mpp1:2; /*!< bit: 1..2 privilede mode of fisrt level NMI/exception nestting */ + rv_csr_t _reserved0:3; /*!< bit: 3..5 Reserved */ + rv_csr_t ptyp1:2; /*!< bit: 6..7 NMI/exception type of before first nestting */ + rv_csr_t mpie2:1; /*!< bit: 8 interrupt enable flag of second level NMI/exception nestting */ + rv_csr_t mpp2:2; /*!< bit: 9..10 privilede mode of second level NMI/exception nestting */ + rv_csr_t _reserved1:3; /*!< bit: 11..13 Reserved */ + rv_csr_t ptyp2:2; /*!< bit: 14..15 NMI/exception type of before second nestting */ +#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64 + rv_csr_t _reserved2:48; /*!< bit: 16..63 Reserved*/ +#else + rv_csr_t _reserved2:16; /*!< bit: 16..31 Reserved*/ +#endif + } b; /*!< Structure used for bit access */ + rv_csr_t w; /*!< Type used for csr data access */ +} CSR_MSAVESTATUS_Type; +/** @} */ /* End of Doxygen Group NMSIS_Core_Base_Registers */ + +/* ########################### Core Function Access ########################### */ +/** + * \defgroup NMSIS_Core_CSR_Register_Access Core CSR Register Access + * \ingroup NMSIS_Core + * \brief Functions to access the Core CSR Registers + * \details + * + * The following functions or macros provide access to Core CSR registers. + * - \ref NMSIS_Core_CSR_Encoding + * - \ref NMSIS_Core_CSR_Registers + * @{ + */ + + +#ifndef __ASSEMBLY__ + +/** + * \brief CSR operation Macro for csrrw instruction. + * \details + * Read the content of csr register to __v, + * then write content of val into csr register, then return __v + * \param csr CSR macro definition defined in + * \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS + * \param val value to store into the CSR register + * \return the CSR register value before written + */ +#define __RV_CSR_SWAP(csr, val) \ + ({ \ + register rv_csr_t __v = (unsigned long)(val); \ + __ASM volatile("csrrw %0, " STRINGIFY(csr) ", %1" \ + : "=r"(__v) \ + : "rK"(__v) \ + : "memory"); \ + __v; \ + }) + +/** + * \brief CSR operation Macro for csrr instruction. + * \details + * Read the content of csr register to __v and return it + * \param csr CSR macro definition defined in + * \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS + * \return the CSR register value + */ +#define __RV_CSR_READ(csr) \ + ({ \ + register rv_csr_t __v; \ + __ASM volatile("csrr %0, " STRINGIFY(csr) \ + : "=r"(__v) \ + : \ + : "memory"); \ + __v; \ + }) + +/** + * \brief CSR operation Macro for csrw instruction. + * \details + * Write the content of val to csr register + * \param csr CSR macro definition defined in + * \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS + * \param val value to store into the CSR register + */ +#define __RV_CSR_WRITE(csr, val) \ + ({ \ + register rv_csr_t __v = (rv_csr_t)(val); \ + __ASM volatile("csrw " STRINGIFY(csr) ", %0" \ + : \ + : "rK"(__v) \ + : "memory"); \ + }) + +/** + * \brief CSR operation Macro for csrrs instruction. + * \details + * Read the content of csr register to __v, + * then set csr register to be __v | val, then return __v + * \param csr CSR macro definition defined in + * \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS + * \param val Mask value to be used wih csrrs instruction + * \return the CSR register value before written + */ +#define __RV_CSR_READ_SET(csr, val) \ + ({ \ + register rv_csr_t __v = (rv_csr_t)(val); \ + __ASM volatile("csrrs %0, " STRINGIFY(csr) ", %1" \ + : "=r"(__v) \ + : "rK"(__v) \ + : "memory"); \ + __v; \ + }) + +/** + * \brief CSR operation Macro for csrs instruction. + * \details + * Set csr register to be csr_content | val + * \param csr CSR macro definition defined in + * \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS + * \param val Mask value to be used wih csrs instruction + */ +#define __RV_CSR_SET(csr, val) \ + ({ \ + register rv_csr_t __v = (rv_csr_t)(val); \ + __ASM volatile("csrs " STRINGIFY(csr) ", %0" \ + : \ + : "rK"(__v) \ + : "memory"); \ + }) + +/** + * \brief CSR operation Macro for csrrc instruction. + * \details + * Read the content of csr register to __v, + * then set csr register to be __v & ~val, then return __v + * \param csr CSR macro definition defined in + * \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS + * \param val Mask value to be used wih csrrc instruction + * \return the CSR register value before written + */ +#define __RV_CSR_READ_CLEAR(csr, val) \ + ({ \ + register rv_csr_t __v = (rv_csr_t)(val); \ + __ASM volatile("csrrc %0, " STRINGIFY(csr) ", %1" \ + : "=r"(__v) \ + : "rK"(__v) \ + : "memory"); \ + __v; \ + }) + +/** + * \brief CSR operation Macro for csrc instruction. + * \details + * Set csr register to be csr_content & ~val + * \param csr CSR macro definition defined in + * \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS + * \param val Mask value to be used wih csrc instruction + */ +#define __RV_CSR_CLEAR(csr, val) \ + ({ \ + register rv_csr_t __v = (rv_csr_t)(val); \ + __ASM volatile("csrc " STRINGIFY(csr) ", %0" \ + : \ + : "rK"(__v) \ + : "memory"); \ + }) +#endif /* __ASSEMBLY__ */ + +/** + * \brief Enable IRQ Interrupts + * \details Enables IRQ interrupts by setting the MIE-bit in the MSTATUS Register. + * \remarks + * Can only be executed in Privileged modes. + */ +__STATIC_FORCEINLINE void __enable_irq(void) +{ + __RV_CSR_SET(CSR_MSTATUS, MSTATUS_MIE); +} + +/** + * \brief Disable IRQ Interrupts + * \details Disables IRQ interrupts by clearing the MIE-bit in the MSTATUS Register. + * \remarks + * Can only be executed in Privileged modes. + */ +__STATIC_FORCEINLINE void __disable_irq(void) +{ + __RV_CSR_CLEAR(CSR_MSTATUS, MSTATUS_MIE); +} + +/** + * \brief Read whole 64 bits value of mcycle counter + * \details This function will read the whole 64 bits of MCYCLE register + * \return The whole 64 bits value of MCYCLE + * \remarks It will work for both RV32 and RV64 to get full 64bits value of MCYCLE + */ +__STATIC_FORCEINLINE uint64_t __get_rv_cycle(void) +{ +#if __RISCV_XLEN == 32 + volatile uint32_t high0, low, high; + uint64_t full; + + high0 = __RV_CSR_READ(CSR_MCYCLEH); + low = __RV_CSR_READ(CSR_MCYCLE); + high = __RV_CSR_READ(CSR_MCYCLEH); + if (high0 != high) { + low = __RV_CSR_READ(CSR_MCYCLE); + } + full = (((uint64_t)high) << 32) | low; + return full; +#elif __RISCV_XLEN == 64 + return (uint64_t)__RV_CSR_READ(CSR_MCYCLE); +#else // TODO Need cover for XLEN=128 case in future + return (uint64_t)__RV_CSR_READ(CSR_MCYCLE); +#endif +} + +/** + * \brief Read whole 64 bits value of machine instruction-retired counter + * \details This function will read the whole 64 bits of MINSTRET register + * \return The whole 64 bits value of MINSTRET + * \remarks It will work for both RV32 and RV64 to get full 64bits value of MINSTRET + */ +__STATIC_FORCEINLINE uint64_t __get_rv_instret(void) +{ +#if __RISCV_XLEN == 32 + volatile uint32_t high0, low, high; + uint64_t full; + + high0 = __RV_CSR_READ(CSR_MINSTRETH); + low = __RV_CSR_READ(CSR_MINSTRET); + high = __RV_CSR_READ(CSR_MINSTRETH); + if (high0 != high) { + low = __RV_CSR_READ(CSR_MINSTRET); + } + full = (((uint64_t)high) << 32) | low; + return full; +#elif __RISCV_XLEN == 64 + return (uint64_t)__RV_CSR_READ(CSR_MINSTRET); +#else // TODO Need cover for XLEN=128 case in future + return (uint64_t)__RV_CSR_READ(CSR_MINSTRET); +#endif +} + +/** + * \brief Read whole 64 bits value of real-time clock + * \details This function will read the whole 64 bits of TIME register + * \return The whole 64 bits value of TIME CSR + * \remarks It will work for both RV32 and RV64 to get full 64bits value of TIME + * \attention only available when user mode available + */ +__STATIC_FORCEINLINE uint64_t __get_rv_time(void) +{ +#if __RISCV_XLEN == 32 + volatile uint32_t high0, low, high; + uint64_t full; + + high0 = __RV_CSR_READ(CSR_TIMEH); + low = __RV_CSR_READ(CSR_TIME); + high = __RV_CSR_READ(CSR_TIMEH); + if (high0 != high) { + low = __RV_CSR_READ(CSR_TIME); + } + full = (((uint64_t)high) << 32) | low; + return full; +#elif __RISCV_XLEN == 64 + return (uint64_t)__RV_CSR_READ(CSR_TIME); +#else // TODO Need cover for XLEN=128 case in future + return (uint64_t)__RV_CSR_READ(CSR_TIME); +#endif +} + +/** @} */ /* End of Doxygen Group NMSIS_Core_CSR_Register_Access */ + +/* ########################### CPU Intrinsic Functions ########################### */ +/** + * \defgroup NMSIS_Core_CPU_Intrinsic Intrinsic Functions for CPU Intructions + * \ingroup NMSIS_Core + * \brief Functions that generate RISC-V CPU instructions. + * \details + * + * The following functions generate specified RISC-V instructions that cannot be directly accessed by compiler. + * @{ + */ + +/** + * \brief NOP Instruction + * \details + * No Operation does nothing. + * This instruction can be used for code alignment purposes. + */ +__STATIC_FORCEINLINE void __NOP(void) +{ + __ASM volatile("nop"); +} + +/** + * \brief Wait For Interrupt + * \details + * Wait For Interrupt is is executed using CSR_WFE.WFE=0 and WFI instruction. + * It will suspends execution until interrupt, NMI or Debug happened. + * When Core is waked up by interrupt, if + * 1. mstatus.MIE == 1(interrupt enabled), Core will enter ISR code + * 2. mstatus.MIE == 0(interrupt disabled), Core will resume previous execution + */ +__STATIC_FORCEINLINE void __WFI(void) +{ + __RV_CSR_CLEAR(CSR_WFE, WFE_WFE); + __ASM volatile("wfi"); +} + +/** + * \brief Wait For Event + * \details + * Wait For Event is executed using CSR_WFE.WFE=1 and WFI instruction. + * It will suspends execution until event, NMI or Debug happened. + * When Core is waked up, Core will resume previous execution + */ +__STATIC_FORCEINLINE void __WFE(void) +{ + __RV_CSR_SET(CSR_WFE, WFE_WFE); + __ASM volatile("wfi"); + __RV_CSR_CLEAR(CSR_WFE, WFE_WFE); +} + +/** + * \brief Breakpoint Instruction + * \details + * Causes the processor to enter Debug state. + * Debug tools can use this to investigate system state + * when the instruction at a particular address is reached. + */ +__STATIC_FORCEINLINE void __EBREAK(void) +{ + __ASM volatile("ebreak"); +} + +/** + * \brief Environment Call Instruction + * \details + * The ECALL instruction is used to make a service request to + * the execution environment. + */ +__STATIC_FORCEINLINE void __ECALL(void) +{ + __ASM volatile("ecall"); +} + +/** + * \brief WFI Sleep Mode enumeration + */ +typedef enum WFI_SleepMode { + WFI_SHALLOW_SLEEP = 0, /*!< Shallow sleep mode, the core_clk will poweroff */ + WFI_DEEP_SLEEP = 1 /*!< Deep sleep mode, the core_clk and core_ano_clk will poweroff */ +} WFI_SleepMode_Type; + +/** + * \brief Set Sleep mode of WFI + * \details + * Set the SLEEPVALUE CSR register to control the + * WFI Sleep mode. + * \param[in] mode The sleep mode to be set + */ +__STATIC_FORCEINLINE void __set_wfi_sleepmode(WFI_SleepMode_Type mode) +{ + __RV_CSR_WRITE(CSR_SLEEPVALUE, mode); +} + +/** + * \brief Send TX Event + * \details + * Set the CSR TXEVT to control send a TX Event. + * The Core will output signal tx_evt as output event signal. + */ +__STATIC_FORCEINLINE void __TXEVT(void) +{ + __RV_CSR_SET(CSR_TXEVT, 0x1); +} + +/** + * \brief Enable MCYCLE counter + * \details + * Clear the CY bit of MCOUNTINHIBIT to 0 to enable MCYCLE Counter + */ +__STATIC_FORCEINLINE void __enable_mcycle_counter(void) +{ + __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_CY); +} + +/** + * \brief Disable MCYCLE counter + * \details + * Set the CY bit of MCOUNTINHIBIT to 1 to disable MCYCLE Counter + */ +__STATIC_FORCEINLINE void __disable_mcycle_counter(void) +{ + __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_CY); +} + +/** + * \brief Enable MINSTRET counter + * \details + * Clear the IR bit of MCOUNTINHIBIT to 0 to enable MINSTRET Counter + */ +__STATIC_FORCEINLINE void __enable_minstret_counter(void) +{ + __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR); +} + +/** + * \brief Disable MINSTRET counter + * \details + * Set the IR bit of MCOUNTINHIBIT to 1 to disable MINSTRET Counter + */ +__STATIC_FORCEINLINE void __disable_minstret_counter(void) +{ + __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR); +} + +/** + * \brief Enable MCYCLE & MINSTRET counter + * \details + * Clear the IR and CY bit of MCOUNTINHIBIT to 1 to enable MINSTRET & MCYCLE Counter + */ +__STATIC_FORCEINLINE void __enable_all_counter(void) +{ + __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR|MCOUNTINHIBIT_CY); +} + +/** + * \brief Disable MCYCLE & MINSTRET counter + * \details + * Set the IR and CY bit of MCOUNTINHIBIT to 1 to disable MINSTRET & MCYCLE Counter + */ +__STATIC_FORCEINLINE void __disable_all_counter(void) +{ + __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR|MCOUNTINHIBIT_CY); +} + +/** + * \brief Execute fence instruction, p -> pred, s -> succ + * \details + * the FENCE instruction ensures that all memory accesses from instructions preceding + * the fence in program order (the `predecessor set`) appear earlier in the global memory order than + * memory accesses from instructions appearing after the fence in program order (the `successor set`). + * For details, please refer to The RISC-V Instruction Set Manual + * \param p predecessor set, such as iorw, rw, r, w + * \param s successor set, such as iorw, rw, r, w + **/ +#define __FENCE(p, s) __ASM volatile ("fence " #p "," #s : : : "memory") + +/** + * \brief Fence.i Instruction + * \details + * The FENCE.I instruction is used to synchronize the instruction + * and data streams. + */ +__STATIC_FORCEINLINE void __FENCE_I(void) +{ + __ASM volatile("fence.i"); +} + +/** \brief Read & Write Memory barrier */ +#define __RWMB() __FENCE(iorw,iorw) + +/** \brief Read Memory barrier */ +#define __RMB() __FENCE(ir,ir) + +/** \brief Write Memory barrier */ +#define __WMB() __FENCE(ow,ow) + +/** \brief SMP Read & Write Memory barrier */ +#define __SMP_RWMB() __FENCE(rw,rw) + +/** \brief SMP Read Memory barrier */ +#define __SMP_RMB() __FENCE(r,r) + +/** \brief SMP Write Memory barrier */ +#define __SMP_WMB() __FENCE(w,w) + +/** \brief CPU relax for busy loop */ +#define __CPU_RELAX() __ASM volatile ("" : : : "memory") + + +/* ===== Load/Store Operations ===== */ +/** + * \brief Load 8bit value from address (8 bit) + * \details Load 8 bit value. + * \param [in] addr Address pointer to data + * \return value of type uint8_t at (*addr) + */ +__STATIC_FORCEINLINE uint8_t __LB(volatile void *addr) +{ + uint8_t result; + + __ASM volatile ("lb %0, 0(%1)" : "=r" (result) : "r" (addr)); + return result; +} + +/** + * \brief Load 16bit value from address (16 bit) + * \details Load 16 bit value. + * \param [in] addr Address pointer to data + * \return value of type uint16_t at (*addr) + */ +__STATIC_FORCEINLINE uint16_t __LH(volatile void *addr) +{ + uint16_t result; + + __ASM volatile ("lh %0, 0(%1)" : "=r" (result) : "r" (addr)); + return result; +} + +/** + * \brief Load 32bit value from address (32 bit) + * \details Load 32 bit value. + * \param [in] addr Address pointer to data + * \return value of type uint32_t at (*addr) + */ +__STATIC_FORCEINLINE uint32_t __LW(volatile void *addr) +{ + uint32_t result; + + __ASM volatile ("lw %0, 0(%1)" : "=r" (result) : "r" (addr)); + return result; +} + +#if __RISCV_XLEN != 32 +/** + * \brief Load 64bit value from address (64 bit) + * \details Load 64 bit value. + * \param [in] addr Address pointer to data + * \return value of type uint64_t at (*addr) + * \remarks RV64 only macro + */ +__STATIC_FORCEINLINE uint64_t __LD(volatile void *addr) +{ + uint64_t result; + __ASM volatile ("ld %0, 0(%1)" : "=r" (result) : "r" (addr)); + return result; +} +#endif + +/** + * \brief Write 8bit value to address (8 bit) + * \details Write 8 bit value. + * \param [in] addr Address pointer to data + * \param [in] val Value to set + */ +__STATIC_FORCEINLINE void __SB(volatile void *addr, uint8_t val) +{ + __ASM volatile ("sb %0, 0(%1)" : : "r" (val), "r" (addr)); +} + +/** + * \brief Write 16bit value to address (16 bit) + * \details Write 16 bit value. + * \param [in] addr Address pointer to data + * \param [in] val Value to set + */ +__STATIC_FORCEINLINE void __SH(volatile void *addr, uint16_t val) +{ + __ASM volatile ("sh %0, 0(%1)" : : "r" (val), "r" (addr)); +} + +/** + * \brief Write 32bit value to address (32 bit) + * \details Write 32 bit value. + * \param [in] addr Address pointer to data + * \param [in] val Value to set + */ +__STATIC_FORCEINLINE void __SW(volatile void *addr, uint32_t val) +{ + __ASM volatile ("sw %0, 0(%1)" : : "r" (val), "r" (addr)); +} + +#if __RISCV_XLEN != 32 +/** + * \brief Write 64bit value to address (64 bit) + * \details Write 64 bit value. + * \param [in] addr Address pointer to data + * \param [in] val Value to set + */ +__STATIC_FORCEINLINE void __SD(volatile void *addr, uint64_t val) +{ + __ASM volatile ("sd %0, 0(%1)" : : "r" (val), "r" (addr)); +} +#endif + +/** + * \brief Compare and Swap 32bit value using LR and SC + * \details Compare old value with memory, if identical, + * store new value in memory. Return the initial value in memory. + * Success is indicated by comparing return value with OLD. + * memory address, return 0 if successful, otherwise return !0 + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] oldval Old value of the data in address + * \param [in] newval New value to be stored into the address + * \return return the initial value in memory + */ +__STATIC_FORCEINLINE uint32_t __CAS_W(volatile uint32_t *addr, uint32_t oldval, uint32_t newval) +{ + register uint32_t result; + register uint32_t rc; + + __ASM volatile ( \ + "0: lr.w %0, %2 \n" \ + " bne %0, %z3, 1f \n" \ + " sc.w %1, %z4, %2 \n" \ + " bnez %1, 0b \n" \ + "1:\n" \ + : "=&r"(result), "=&r"(rc), "+A"(*addr) \ + : "r"(oldval), "r"(newval) \ + : "memory"); + return result; +} + +/** + * \brief Atomic Swap 32bit value into memory + * \details Atomically swap new 32bit value into memory using amoswap.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] newval New value to be stored into the address + * \return return the original value in memory + */ +__STATIC_FORCEINLINE uint32_t __AMOSWAP_W(volatile uint32_t *addr, uint32_t newval) +{ + register uint32_t result; + + __ASM volatile ("amoswap.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(newval) : "memory"); + return result; +} + +/** + * \brief Atomic Add with 32bit value + * \details Atomically ADD 32bit value with value in memory using amoadd.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] value value to be ADDed + * \return return memory value + add value + */ +__STATIC_FORCEINLINE int32_t __AMOADD_W(volatile int32_t *addr, int32_t value) +{ + register int32_t result; + + __ASM volatile ("amoadd.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic And with 32bit value + * \details Atomically AND 32bit value with value in memory using amoand.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] value value to be ANDed + * \return return memory value & and value + */ +__STATIC_FORCEINLINE int32_t __AMOAND_W(volatile int32_t *addr, int32_t value) +{ + register int32_t result; + + __ASM volatile ("amoand.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic OR with 32bit value + * \details Atomically OR 32bit value with value in memory using amoor.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] value value to be ORed + * \return return memory value | and value + */ +__STATIC_FORCEINLINE int32_t __AMOOR_W(volatile int32_t *addr, int32_t value) +{ + register int32_t result; + + __ASM volatile ("amoor.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic XOR with 32bit value + * \details Atomically XOR 32bit value with value in memory using amoxor.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] value value to be XORed + * \return return memory value ^ and value + */ +__STATIC_FORCEINLINE int32_t __AMOXOR_W(volatile int32_t *addr, int32_t value) +{ + register int32_t result; + + __ASM volatile ("amoxor.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic unsigned MAX with 32bit value + * \details Atomically unsigned max compare 32bit value with value in memory using amomaxu.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] value value to be compared + * \return return the bigger value + */ +__STATIC_FORCEINLINE uint32_t __AMOMAXU_W(volatile uint32_t *addr, uint32_t value) +{ + register uint32_t result; + + __ASM volatile ("amomaxu.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic signed MAX with 32bit value + * \details Atomically signed max compare 32bit value with value in memory using amomax.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] value value to be compared + * \return the bigger value + */ +__STATIC_FORCEINLINE int32_t __AMOMAX_W(volatile int32_t *addr, int32_t value) +{ + register int32_t result; + + __ASM volatile ("amomax.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic unsigned MIN with 32bit value + * \details Atomically unsigned min compare 32bit value with value in memory using amominu.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] value value to be compared + * \return the smaller value + */ +__STATIC_FORCEINLINE uint32_t __AMOMINU_W(volatile uint32_t *addr, uint32_t value) +{ + register uint32_t result; + + __ASM volatile ("amominu.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic signed MIN with 32bit value + * \details Atomically signed min compare 32bit value with value in memory using amomin.d. + * \param [in] addr Address pointer to data, address need to be 4byte aligned + * \param [in] value value to be compared + * \return the smaller value + */ +__STATIC_FORCEINLINE int32_t __AMOMIN_W(volatile int32_t *addr, int32_t value) +{ + register int32_t result; + + __ASM volatile ("amomin.w %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +#if __RISCV_XLEN == 64 +/** + * \brief Compare and Swap 64bit value using LR and SC + * \details Compare old value with memory, if identical, + * store new value in memory. Return the initial value in memory. + * Success is indicated by comparing return value with OLD. + * memory address, return 0 if successful, otherwise return !0 + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] oldval Old value of the data in address + * \param [in] newval New value to be stored into the address + * \return return the initial value in memory + */ +__STATIC_FORCEINLINE uint64_t __CAS_D(volatile uint64_t *addr, uint64_t oldval, uint64_t newval) +{ + register uint64_t result; + register uint64_t rc; + + __ASM volatile ( \ + "0: lr.d %0, %2 \n" \ + " bne %0, %z3, 1f \n" \ + " sc.d %1, %z4, %2 \n" \ + " bnez %1, 0b \n" \ + "1:\n" \ + : "=&r"(result), "=&r"(rc), "+A"(*addr) \ + : "r"(oldval), "r"(newval) \ + : "memory"); + return result; +} + +/** + * \brief Atomic Swap 64bit value into memory + * \details Atomically swap new 64bit value into memory using amoswap.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] newval New value to be stored into the address + * \return return the original value in memory + */ +__STATIC_FORCEINLINE uint64_t __AMOSWAP_D(volatile uint64_t *addr, uint64_t newval) +{ + register uint64_t result; + + __ASM volatile ("amoswap.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(newval) : "memory"); + return result; +} + +/** + * \brief Atomic Add with 64bit value + * \details Atomically ADD 64bit value with value in memory using amoadd.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] value value to be ADDed + * \return return memory value + add value + */ +__STATIC_FORCEINLINE int64_t __AMOADD_D(volatile int64_t *addr, int64_t value) +{ + register int64_t result; + + __ASM volatile ("amoadd.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic And with 64bit value + * \details Atomically AND 64bit value with value in memory using amoand.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] value value to be ANDed + * \return return memory value & and value + */ +__STATIC_FORCEINLINE int64_t __AMOAND_D(volatile int64_t *addr, int64_t value) +{ + register int64_t result; + + __ASM volatile ("amoand.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic OR with 64bit value + * \details Atomically OR 64bit value with value in memory using amoor.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] value value to be ORed + * \return return memory value | and value + */ +__STATIC_FORCEINLINE int64_t __AMOOR_D(volatile int64_t *addr, int64_t value) +{ + register int64_t result; + + __ASM volatile ("amoor.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic XOR with 64bit value + * \details Atomically XOR 64bit value with value in memory using amoxor.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] value value to be XORed + * \return return memory value ^ and value + */ +__STATIC_FORCEINLINE int64_t __AMOXOR_D(volatile int64_t *addr, int64_t value) +{ + register int64_t result; + + __ASM volatile ("amoxor.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic unsigned MAX with 64bit value + * \details Atomically unsigned max compare 64bit value with value in memory using amomaxu.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] value value to be compared + * \return return the bigger value + */ +__STATIC_FORCEINLINE uint64_t __AMOMAXU_D(volatile uint64_t *addr, uint64_t value) +{ + register uint64_t result; + + __ASM volatile ("amomaxu.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic signed MAX with 64bit value + * \details Atomically signed max compare 64bit value with value in memory using amomax.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] value value to be compared + * \return the bigger value + */ +__STATIC_FORCEINLINE int64_t __AMOMAX_D(volatile int64_t *addr, int64_t value) +{ + register int64_t result; + + __ASM volatile ("amomax.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic unsigned MIN with 64bit value + * \details Atomically unsigned min compare 64bit value with value in memory using amominu.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] value value to be compared + * \return the smaller value + */ +__STATIC_FORCEINLINE uint64_t __AMOMINU_D(volatile uint64_t *addr, uint64_t value) +{ + register uint64_t result; + + __ASM volatile ("amominu.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} + +/** + * \brief Atomic signed MIN with 64bit value + * \details Atomically signed min compare 64bit value with value in memory using amomin.d. + * \param [in] addr Address pointer to data, address need to be 8byte aligned + * \param [in] value value to be compared + * \return the smaller value + */ +__STATIC_FORCEINLINE int64_t __AMOMIN_D(volatile int64_t *addr, int64_t value) +{ + register int64_t result; + + __ASM volatile ("amomin.d %0, %2, %1" : \ + "=r"(result), "+A"(*addr) : "r"(value) : "memory"); + return *addr; +} +#endif /* __RISCV_XLEN == 64 */ + +/** @} */ /* End of Doxygen Group NMSIS_Core_CPU_Intrinsic */ + +#ifdef __cplusplus +} +#endif +#endif /* __CORE_FEATURE_BASE__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_cache.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_cache.h new file mode 100644 index 00000000..38b9eb97 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_cache.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CORE_FEATURE_CACHE_H__ +#define __CORE_FEATURE_CACHE_H__ +/*! + * @file core_feature_cache.h + * @brief Cache feature API header file for Nuclei N/NX Core + */ +/* + * Cache Feature Configuration Macro: + * 1. __ICACHE_PRESENT: Define whether I-Cache Unit is present or not. + * * 0: Not present + * * 1: Present + * 1. __DCACHE_PRESENT: Define whether D-Cache Unit is present or not. + * * 0: Not present + * * 1: Present + */ +#ifdef __cplusplus + extern "C" { +#endif + +#if defined(__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1) + +/* ########################## Cache functions #################################### */ +/** + * \defgroup NMSIS_Core_Cache Cache Functions + * \brief Functions that configure Instruction and Data Cache. + * @{ + */ + +/** @} */ /* End of Doxygen Group NMSIS_Core_Cache */ + +/** + * \defgroup NMSIS_Core_ICache I-Cache Functions + * \ingroup NMSIS_Core_Cache + * \brief Functions that configure Instruction Cache. + * @{ + */ +/** + * \brief Enable ICache + * \details + * This function enable I-Cache + * \remarks + * - This \ref CSR_MCACHE_CTL register control I Cache enable. + * \sa + * - \ref DisableICache +*/ +__STATIC_FORCEINLINE void EnableICache (void) +{ + __RV_CSR_SET(CSR_MCACHE_CTL, CSR_MCACHE_CTL_IE); +} + +/** + * \brief Disable ICache + * \details + * This function Disable I-Cache + * \remarks + * - This \ref CSR_MCACHE_CTL register control I Cache enable. + * \sa + * - \ref EnableICache + */ +__STATIC_FORCEINLINE void DisableICache (void) +{ + __RV_CSR_CLEAR(CSR_MCACHE_CTL, CSR_MCACHE_CTL_IE); +} +/** @} */ /* End of Doxygen Group NMSIS_Core_ICache */ +#endif /* defined(__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1) */ + +#if defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1) +/** + * \defgroup NMSIS_Core_DCache D-Cache Functions + * \ingroup NMSIS_Core_Cache + * \brief Functions that configure Data Cache. + * @{ + */ +/** + * \brief Enable DCache + * \details + * This function enable D-Cache + * \remarks + * - This \ref CSR_MCACHE_CTL register control D Cache enable. + * \sa + * - \ref DisableDCache +*/ +__STATIC_FORCEINLINE void EnableDCache (void) +{ + __RV_CSR_SET(CSR_MCACHE_CTL, CSR_MCACHE_CTL_DE); +} + +/** + * \brief Disable DCache + * \details + * This function Disable D-Cache + * \remarks + * - This \ref CSR_MCACHE_CTL register control D Cache enable. + * \sa + * - \ref EnableDCache + */ +__STATIC_FORCEINLINE void DisableDCache (void) +{ + __RV_CSR_CLEAR(CSR_MCACHE_CTL, CSR_MCACHE_CTL_DE); +} +/** @} */ /* End of Doxygen Group NMSIS_Core_DCache */ +#endif /* defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1) */ + +#ifdef __cplusplus +} +#endif +#endif /** __CORE_FEATURE_CACHE_H__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_dsp.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_dsp.h new file mode 100644 index 00000000..4d41e553 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_dsp.h @@ -0,0 +1,18659 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CORE_FEATURE_DSP__ +#define __CORE_FEATURE_DSP__ + +/*! + * @file core_feature_dsp.h + * @brief DSP feature API header file for Nuclei N/NX Core + */ +/* + * DSP Feature Configuration Macro: + * 1. __DSP_PRESENT: Define whether Digital Signal Processing Unit(DSP) is present or not + * * 0: Not present + * * 1: Present + */ +#ifdef __cplusplus + extern "C" { +#endif + +#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) + +/* ########################### CPU SIMD DSP Intrinsic Functions ########################### */ +/** + * \defgroup NMSIS_Core_DSP_Intrinsic Intrinsic Functions for SIMD Instructions + * \ingroup NMSIS_Core + * \brief Functions that generate RISC-V DSP SIMD instructions. + * \details + * + * The following functions generate specified RISC-V SIMD instructions that cannot be directly accessed by compiler. + * * **DSP ISA Extension Instruction Summary** + * + **Shorthand Definitions** + * - r.H == rH1: r[31:16], r.L == r.H0: r[15:0] + * - r.B3: r[31:24], r.B2: r[23:16], r.B1: r[15:8], r.B0: r[7:0] + * - r.B[x]: r[(x*8+7):(x*8+0)] + * - r.H[x]: r[(x*16+7):(x*16+0)] + * - r.W[x]: r[(x*32+31):(x*32+0)] + * - r[xU]: the upper 32-bit of a 64-bit number; xU represents the GPR number that contains this upper part 32-bit value. + * - r[xL]: the lower 32-bit of a 64-bit number; xL represents the GPR number that contains this lower part 32-bit value. + * - r[xU].r[xL]: a 64-bit number that is formed from a pair of GPRs. + * - s>>: signed arithmetic right shift: + * - u>>: unsigned logical right shift + * - SAT.Qn(): Saturate to the range of [-2^n, 2^n-1], if saturation happens, set PSW.OV. + * - SAT.Um(): Saturate to the range of [0, 2^m-1], if saturation happens, set PSW.OV. + * - RUND(): Indicate `rounding`, i.e., add 1 to the most significant discarded bit for right shift or MSW-type multiplication instructions. + * - Sign or Zero Extending functions: + * - SEm(data): Sign-Extend data to m-bit.: + * - ZEm(data): Zero-Extend data to m-bit. + * - ABS(x): Calculate the absolute value of `x`. + * - CONCAT(x,y): Concatinate `x` and `y` to form a value. + * - u<: Unsinged less than comparison. + * - u<=: Unsinged less than & equal comparison. + * - u>: Unsinged greater than comparison. + * - s*: Signed multiplication. + * - u*: Unsigned multiplication. + * + * @{ + */ +/** @} */ /* End of Doxygen Group NMSIS_Core_DSP_Intrinsic */ + + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS SIMD Data Processing Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic + * \brief SIMD Data Processing Instructions + * \details + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB SIMD 16-bit Add/Subtract Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 16-bit Add/Subtract Instructions + * \details + * Based on the combination of the types of the two 16-bit arithmetic operations, the SIMD 16-bit + * add/subtract instructions can be classified into 6 main categories: Addition (two 16-bit addition), + * Subtraction (two 16-bit subtraction), Crossed Add & Sub (one addition and one subtraction), and + * Crossed Sub & Add (one subtraction and one addition), Straight Add & Sub (one addition and one + * subtraction), and Straight Sub & Add (one subtraction and one addition). + * Based on the way of how an overflow condition is handled, the SIMD 16-bit add/subtract + * instructions can be classified into 5 groups: Wrap-around (dropping overflow), Signed Halving + * (keeping overflow by dropping 1 LSB bit), Unsigned Halving, Signed Saturation (clipping overflow), + * and Unsigned Saturation. + * Together, there are 30 SIMD 16-bit add/subtract instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB SIMD 8-bit Addition & Subtraction Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 8-bit Addition & Subtraction Instructions + * \details + * Based on the types of the four 8-bit arithmetic operations, the SIMD 8-bit add/subtract instructions + * can be classified into 2 main categories: Addition (four 8-bit addition), and Subtraction (four 8-bit + * subtraction). + * Based on the way of how an overflow condition is handled for singed or unsigned operation, the + * SIMD 8-bit add/subtract instructions can be classified into 5 groups: Wrap-around (dropping + * overflow), Signed Halving (keeping overflow by dropping 1 LSB bit), Unsigned Halving, Signed + * Saturation (clipping overflow), and Unsigned Saturation. + * Together, there are 10 SIMD 8-bit add/subtract instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT SIMD 16-bit Shift Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 16-bit Shift Instructions + * \details + * there are 14 SIMD 16-bit shift instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT SIMD 8-bit Shift Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 8-bit Shift Instructions + * \details + * there are 14 SIMD 8-bit shift instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP SIMD 16-bit Compare Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 16-bit Compare Instructions + * \details + * there are 5 SIMD 16-bit Compare instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP SIMD 8-bit Compare Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 8-bit Compare Instructions + * \details + * there are 5 SIMD 8-bit Compare instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY SIMD 16-bit Multiply Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 16-bit Multiply Instructions + * \details + * there are 6 SIMD 16-bit Multiply instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY SIMD 8-bit Multiply Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 8-bit Multiply Instructions + * \details + * there are 6 SIMD 8-bit Multiply instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC SIMD 16-bit Miscellaneous Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 16-bit Miscellaneous Instructions + * \details + * there are 10 SIMD 16-bit Misc instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC SIMD 8-bit Miscellaneous Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 8-bit Miscellaneous Instructions + * \details + * there are 10 SIMD 8-bit Miscellaneous instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK SIMD 8-bit Unpacking Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS + * \brief SIMD 8-bit Unpacking Instructions + * \details + * there are 8 SIMD 8-bit Unpacking instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD Non-SIMD Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic + * \brief Non-SIMD Instructions + * \details + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU Non-SIMD Q15 saturation ALU Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD + * \brief Non-SIMD Q15 saturation ALU Instructions + * \details + * there are 7 Non-SIMD Q15 saturation ALU Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU Non-SIMD Q31 saturation ALU Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD + * \brief Non-SIMD Q31 saturation ALU Instructions + * \details + * there are Non-SIMD Q31 saturation ALU Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION 32-bit Computation Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD + * \brief 32-bit Computation Instructions + * \details + * there are 8 32-bit Computation Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC OV (Overflow) flag Set/Clear Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD + * \brief OV (Overflow) flag Set/Clear Instructions + * \details + * The following table lists the user instructions related to Overflow (OV) flag manipulation. there are 2 OV (Overflow) flag Set/Clear Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC Non-SIMD Miscellaneous Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD + * \brief Non-SIMD Miscellaneous Instructions + * \details + * There are 13 Miscellaneous Instructions here. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS Partial-SIMD Data Processing Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic + * \brief Partial-SIMD Data Processing Instructions + * \details + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK SIMD 16-bit Packing Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS + * \brief SIMD 16-bit Packing Instructions + * \details + * there are 4 SIMD16-bit Packing Instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC Signed MSW 32x32 Multiply and Add Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS + * \brief Signed MSW 32x32 Multiply and Add Instructions + * \details + * there are 8 Signed MSW 32x32 Multiply and Add Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC Signed MSW 32x16 Multiply and Add Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS + * \brief Signed MSW 32x16 Multiply and Add Instructions + * \details + * there are 15 Signed MSW 32x16 Multiply and Add Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB Signed 16-bit Multiply 32-bit Add/Subtract Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS + * \brief Signed 16-bit Multiply 32-bit Add/Subtract Instructions + * \details + * there are 18 Signed 16-bit Multiply 32-bit Add/Subtract Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB Signed 16-bit Multiply 64-bit Add/Subtract Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS + * \brief Signed 16-bit Multiply 64-bit Add/Subtract Instructions + * \details + * there is Signed 16-bit Multiply 64-bit Add/Subtract Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC Partial-SIMD Miscellaneous Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS + * \brief Partial-SIMD Miscellaneous Instructions + * \details + * there are 7 Partial-SIMD Miscellaneous Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD 8-bit Multiply with 32-bit Add Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS + * \brief 8-bit Multiply with 32-bit Add Instructions + * \details + * there are 3 8-bit Multiply with 32-bit Add Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_64B_PROFILE 64-bit Profile Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic + * \brief 64-bit Profile Instructions + * \details + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB 64-bit Addition & Subtraction Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_PROFILE + * \brief 64-bit Addition & Subtraction Instructions + * \details + * there are 10 64-bit Addition & Subtraction Instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB 32-bit Multiply with 64-bit Add/Subtract Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_PROFILE + * \brief 32-bit Multiply with 64-bit Add/Subtract Instructions + * \details + * there are 32-bit Multiply 64-bit Add/Subtract Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB Signed 16-bit Multiply with 64-bit Add/Subtract Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_PROFILE + * \brief Signed 16-bit Multiply with 64-bit Add/Subtract Instructions + * \details + * there are 10 Signed 16-bit Multiply with 64-bit Add/Subtract Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY RV64 Only Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic + * \brief RV64 Only Instructions + * \details + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB (RV64 Only) SIMD 32-bit Add/Subtract Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief (RV64 Only) SIMD 32-bit Add/Subtract Instructions + * \details + * The following tables list instructions that are only present in RV64. + * There are 30 SIMD 32-bit addition or subtraction instructions.there are 4 SIMD16-bit Packing Instructions. + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT (RV64 Only) SIMD 32-bit Shift Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief (RV64 Only) SIMD 32-bit Shift Instructions + * \details + * there are 14 (RV64 Only) SIMD 32-bit Shift Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC (RV64 Only) SIMD 32-bit Miscellaneous Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief (RV64 Only) SIMD 32-bit Miscellaneous Instructions + * \details + * there are 5 (RV64 Only) SIMD 32-bit Miscellaneous Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT (RV64 Only) SIMD Q15 Saturating Multiply Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief (RV64 Only) SIMD Q15 Saturating Multiply Instructions + * \details + * there are 9 (RV64 Only) SIMD Q15 saturating Multiply Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT (RV64 Only) 32-bit Multiply Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief (RV64 Only) 32-bit Multiply Instructions + * \details + * there is 3 RV64 Only) 32-bit Multiply Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD (RV64 Only) 32-bit Multiply & Add Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief (RV64 Only) 32-bit Multiply & Add Instructions + * \details + * there are 3 (RV64 Only) 32-bit Multiply & Add Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC (RV64 Only) 32-bit Parallel Multiply & Add Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief (RV64 Only) 32-bit Parallel Multiply & Add Instructions + * \details + * there are 12 (RV64 Only) 32-bit Parallel Multiply & Add Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_NON_SIMD_32B_SHIFT (RV64 Only) Non-SIMD 32-bit Shift Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief (RV64 Only) Non-SIMD 32-bit Shift Instructions + * \details + * there are 1 (RV64 Only) Non-SIMD 32-bit Shift Instructions + */ + +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK 32-bit Packing Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY + * \brief 32-bit Packing Instructions + * \details + * There are four 32-bit packing instructions here + */ + +/* ===== Inline Function Start for 3.1. ADD8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB + * \brief ADD8 (SIMD 8-bit Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * ADD8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit integer element additions simultaneously. + * + * **Description**:\n + * This instruction adds the 8-bit integer elements in Rs1 with the 8-bit integer elements + * in Rs2, and then writes the 8-bit element results to Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned addition. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = Rs1.B[x] + Rs2.B[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_ADD8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("add8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.1. ADD8 ===== */ + +/* ===== Inline Function Start for 3.2. ADD16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief ADD16 (SIMD 16-bit Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * ADD16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit integer element additions simultaneously. + * + * **Description**:\n + * This instruction adds the 16-bit integer elements in Rs1 with the 16-bit integer + * elements in Rs2, and then writes the 16-bit element results to Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned addition. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = Rs1.H[x] + Rs2.H[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_ADD16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("add16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.2. ADD16 ===== */ + +/* ===== Inline Function Start for 3.3. ADD64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief ADD64 (64-bit Addition) + * \details + * **Type**: 64-bit Profile + * + * **Syntax**:\n + * ~~~ + * ADD64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add two 64-bit signed or unsigned integers. + * + * **RV32 Description**:\n + * This instruction adds the 64-bit integer of an even/odd pair of registers specified + * by Rs1(4,1) with the 64-bit integer of an even/odd pair of registers specified by Rs2(4,1), and then + * writes the 64-bit result to an even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register + * pair includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction has the same behavior as the ADD instruction in RV64I. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned addition. + * + * **Operations**:\n + * ~~~ + * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1); + * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1); + * R[t_H].R[t_L] = R[a_H].R[a_L] + R[b_H].R[b_L]; + * RV64: + * Rd = Rs1 + Rs2; + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_ADD64(unsigned long long a, unsigned long long b) +{ + register unsigned long long result; + __ASM volatile("add64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.3. ADD64 ===== */ + +/* ===== Inline Function Start for 3.4. AVE ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief AVE (Average with Rounding) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * AVE Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Calculate the average of the contents of two general registers. + * + * **Description**:\n + * This instruction calculates the average value of two signed integers stored in Rs1 and + * Rs2, rounds up a half-integer result to the nearest integer, and writes the result to Rd. + * + * **Operations**:\n + * ~~~ + * Sum = CONCAT(Rs1[MSB],Rs1[MSB:0]) + CONCAT(Rs2[MSB],Rs2[MSB:0]) + 1; + * Rd = Sum[(MSB+1):1]; + * for RV32: MSB=31, + * for RV64: MSB=63 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_AVE(long a, long b) +{ + register long result; + __ASM volatile("ave %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.4. AVE ===== */ + +/* ===== Inline Function Start for 3.5. BITREV ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief BITREV (Bit Reverse) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * BITREV Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Reverse the bit positions of the source operand within a specified width starting from bit + * 0. The reversed width is a variable from a GPR. + * + * **Description**:\n + * This instruction reverses the bit positions of the content of Rs1. The reversed bit width + * is calculated as Rs2[4:0]+1 (RV32) or Rs2[5:0]+1 (RV64). The upper bits beyond the reversed width + * are filled with zeros. After the bit reverse operation, the result is written to Rd. + * + * **Operations**:\n + * ~~~ + * msb = Rs2[4:0]; (for RV32) + * msb = Rs2[5:0]; (for RV64) + * rev[0:msb] = Rs1[msb:0]; + * Rd = ZE(rev[msb:0]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_BITREV(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("bitrev %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.5. BITREV ===== */ + +/* ===== Inline Function Start for 3.6. BITREVI ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief BITREVI (Bit Reverse Immediate) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * (RV32) BITREVI Rd, Rs1, imm[4:0] + * (RV64) BITREVI Rd, Rs1, imm[5:0] + * ~~~ + * + * **Purpose**:\n + * Reverse the bit positions of the source operand within a specified width starting from bit + * 0. The reversed width is an immediate value. + * + * **Description**:\n + * This instruction reverses the bit positions of the content of Rs1. The reversed bit width + * is calculated as imm[4:0]+1 (RV32) or imm[5:0]+1 (RV64). The upper bits beyond the reversed width + * are filled with zeros. After the bit reverse operation, the result is written to Rd. + * + * **Operations**:\n + * ~~~ + * msb = imm[4:0]; (RV32) + * msb = imm[5:0]; (RV64) + * rev[0:msb] = Rs1[msb:0]; + * Rd = ZE(rev[msb:0]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_BITREVI(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("bitrevi %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.6. BITREVI ===== */ + +/* ===== Inline Function Start for 3.7. BPICK ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief BPICK (Bit-wise Pick) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * BPICK Rd, Rs1, Rs2, Rc + * ~~~ + * + * **Purpose**:\n + * Select from two source operands based on a bit mask in the third operand. + * + * **Description**:\n + * This instruction selects individual bits from Rs1 or Rs2, based on the bit mask value in + * Rc. If a bit in Rc is 1, the corresponding bit is from Rs1; otherwise, the corresponding bit is from Rs2. + * The selection results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd[x] = Rc[x]? Rs1[x] : Rs2[x]; + * for RV32, x=31...0 + * for RV64, x=63...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \param [in] c unsigned long type of value stored in c + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_BPICK(unsigned long a, unsigned long b, unsigned long c) +{ + register unsigned long result; + __ASM volatile("bpick %0, %1, %2, %3" : "=r"(result) : "r"(a), "r"(b), "r"(c)); + return result; +} +/* ===== Inline Function End for 3.7. BPICK ===== */ + +/* ===== Inline Function Start for 3.8. CLROV ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC + * \brief CLROV (Clear OV flag) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * CLROV # pseudo mnemonic + * ~~~ + * + * **Purpose**:\n + * This pseudo instruction is an alias to `CSRRCI x0, ucode, 1` instruction. + * + * + */ +__STATIC_FORCEINLINE void __RV_CLROV(void) +{ + __ASM volatile("clrov "); +} +/* ===== Inline Function End for 3.8. CLROV ===== */ + +/* ===== Inline Function Start for 3.9. CLRS8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief CLRS8 (SIMD 8-bit Count Leading Redundant Sign) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLRS8 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of redundant sign bits of the 8-bit elements of a general register. + * + * **Description**:\n + * Starting from the bits next to the sign bits of the 8-bit elements of Rs1, this instruction + * counts the number of redundant sign bits and writes the result to the corresponding 8-bit elements + * of Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.B[x]; + * cnt[x] = 0; + * for (i = 6 to 0) { + * if (snum[x](i) == snum[x](7)) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.B[x] = cnt[x]; + * for RV32: x=3...0 + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLRS8(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clrs8 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.9. CLRS8 ===== */ + +/* ===== Inline Function Start for 3.10. CLRS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief CLRS16 (SIMD 16-bit Count Leading Redundant Sign) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLRS16 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of redundant sign bits of the 16-bit elements of a general register. + * + * **Description**:\n + * Starting from the bits next to the sign bits of the 16-bit elements of Rs1, this + * instruction counts the number of redundant sign bits and writes the result to the corresponding 16- + * bit elements of Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.H[x]; + * cnt[x] = 0; + * for (i = 14 to 0) { + * if (snum[x](i) == snum[x](15)) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.H[x] = cnt[x]; + * for RV32: x=1...0 + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLRS16(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clrs16 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.10. CLRS16 ===== */ + +/* ===== Inline Function Start for 3.11. CLRS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC + * \brief CLRS32 (SIMD 32-bit Count Leading Redundant Sign) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLRS32 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of redundant sign bits of the 32-bit elements of a general register. + * + * **Description**:\n + * Starting from the bits next to the sign bits of the 32-bit elements of Rs1, this + * instruction counts the number of redundant sign bits and writes the result to the corresponding 32- + * bit elements of Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.W[x]; + * cnt[x] = 0; + * for (i = 30 to 0) { + * if (snum[x](i) == snum[x](31)) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.W[x] = cnt[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLRS32(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clrs32 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.11. CLRS32 ===== */ + +/* ===== Inline Function Start for 3.12. CLO8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief CLO8 (SIMD 8-bit Count Leading One) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLO8 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of leading one bits of the 8-bit elements of a general register. + * + * **Description**:\n + * Starting from the most significant bits of the 8-bit elements of Rs1, this instruction + * counts the number of leading one bits and writes the results to the corresponding 8-bit elements of + * Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.B[x]; + * cnt[x] = 0; + * for (i = 7 to 0) { + * if (snum[x](i) == 1) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.B[x] = cnt[x]; + * for RV32: x=3...0 + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLO8(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clo8 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.12. CLO8 ===== */ + +/* ===== Inline Function Start for 3.13. CLO16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief CLO16 (SIMD 16-bit Count Leading One) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLO16 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of leading one bits of the 16-bit elements of a general register. + * + * **Description**:\n + * Starting from the most significant bits of the 16-bit elements of Rs1, this instruction + * counts the number of leading one bits and writes the results to the corresponding 16-bit elements + * of Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.H[x]; + * cnt[x] = 0; + * for (i = 15 to 0) { + * if (snum[x](i) == 1) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.H[x] = cnt[x]; + * for RV32: x=1...0 + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLO16(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clo16 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.13. CLO16 ===== */ + +/* ===== Inline Function Start for 3.14. CLO32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC + * \brief CLO32 (SIMD 32-bit Count Leading One) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLO32 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of leading one bits of the 32-bit elements of a general register. + * + * **Description**:\n + * Starting from the most significant bits of the 32-bit elements of Rs1, this instruction + * counts the number of leading one bits and writes the results to the corresponding 32-bit elements + * of Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.W[x]; + * cnt[x] = 0; + * for (i = 31 to 0) { + * if (snum[x](i) == 1) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.W[x] = cnt[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLO32(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clo32 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.14. CLO32 ===== */ + +/* ===== Inline Function Start for 3.15. CLZ8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief CLZ8 (SIMD 8-bit Count Leading Zero) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLZ8 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of leading zero bits of the 8-bit elements of a general register. + * + * **Description**:\n + * Starting from the most significant bits of the 8-bit elements of Rs1, this instruction + * counts the number of leading zero bits and writes the results to the corresponding 8-bit elements of + * Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.B[x]; + * cnt[x] = 0; + * for (i = 7 to 0) { + * if (snum[x](i) == 0) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.B[x] = cnt[x]; + * for RV32: x=3...0 + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLZ8(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clz8 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.15. CLZ8 ===== */ + +/* ===== Inline Function Start for 3.16. CLZ16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief CLZ16 (SIMD 16-bit Count Leading Zero) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLZ16 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of leading zero bits of the 16-bit elements of a general register. + * + * **Description**:\n + * Starting from the most significant bits of the 16-bit elements of Rs1, this instruction + * counts the number of leading zero bits and writes the results to the corresponding 16-bit elements + * of Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.H[x]; + * cnt[x] = 0; + * for (i = 15 to 0) { + * if (snum[x](i) == 0) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.H[x] = cnt[x]; + * for RV32: x=1...0 + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLZ16(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clz16 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.16. CLZ16 ===== */ + +/* ===== Inline Function Start for 3.17. CLZ32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC + * \brief CLZ32 (SIMD 32-bit Count Leading Zero) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CLZ32 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Count the number of leading zero bits of the 32-bit elements of a general register. + * + * **Description**:\n + * Starting from the most significant bits of the 32-bit elements of Rs1, this instruction + * counts the number of leading zero bits and writes the results to the corresponding 32-bit elements + * of Rd. + * + * **Operations**:\n + * ~~~ + * snum[x] = Rs1.W[x]; + * cnt[x] = 0; + * for (i = 31 to 0) { + * if (snum[x](i) == 0) { + * cnt[x] = cnt[x] + 1; + * } else { + * break; + * } + * } + * Rd.W[x] = cnt[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CLZ32(unsigned long a) +{ + register unsigned long result; + __ASM volatile("clz32 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.17. CLZ32 ===== */ + +/* ===== Inline Function Start for 3.18. CMPEQ8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP + * \brief CMPEQ8 (SIMD 8-bit Integer Compare Equal) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CMPEQ8 Rs, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit integer elements equal comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit integer elements in Rs1 with the 8-bit integer + * elements in Rs2 to see if they are equal. If they are equal, the result is 0xFF; otherwise, the result is + * 0x0. The 8-bit element comparison results are written to Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned numbers. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] == Rs2.B[x])? 0xff : 0x0; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CMPEQ8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("cmpeq8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.18. CMPEQ8 ===== */ + +/* ===== Inline Function Start for 3.19. CMPEQ16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP + * \brief CMPEQ16 (SIMD 16-bit Integer Compare Equal) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CMPEQ16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit integer elements equal comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 16-bit integer elements in Rs1 with the 16-bit integer + * elements in Rs2 to see if they are equal. If they are equal, the result is 0xFFFF; otherwise, the result + * is 0x0. The 16-bit element comparison results are written to Rt. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned numbers. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] == Rs2.H[x])? 0xffff : 0x0; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CMPEQ16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("cmpeq16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.19. CMPEQ16 ===== */ + +/* ===== Inline Function Start for 3.20. CRAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief CRAS16 (SIMD 16-bit Cross Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CRAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit integer element addition and 16-bit integer element subtraction in a 32-bit + * chunk simultaneously. Operands are from crossed positions in 32-bit chunks. + * + * **Description**:\n + * This instruction adds the 16-bit integer element in [31:16] of 32-bit chunks in Rs1 with + * the 16-bit integer element in [15:0] of 32-bit chunks in Rs2, and writes the result to [31:16] of 32-bit + * chunks in Rd; at the same time, it subtracts the 16-bit integer element in [31:16] of 32-bit chunks in + * Rs2 from the 16-bit integer element in [15:0] of 32-bit chunks, and writes the result to [15:0] of 32- + * bit chunks in Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned operations. + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = Rs1.W[x][31:16] + Rs2.W[x][15:0]; + * Rd.W[x][15:0] = Rs1.W[x][15:0] - Rs2.W[x][31:16]; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CRAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("cras16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.20. CRAS16 ===== */ + +/* ===== Inline Function Start for 3.21. CRSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief CRSA16 (SIMD 16-bit Cross Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * CRSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit integer element subtraction and 16-bit integer element addition in a 32-bit + * chunk simultaneously. Operands are from crossed positions in 32-bit chunks. + * + * **Description**:\n + * This instruction subtracts the 16-bit integer element in [15:0] of 32-bit chunks in Rs2 + * from the 16-bit integer element in [31:16] of 32-bit chunks in Rs1, and writes the result to [31:16] of + * 32-bit chunks in Rd; at the same time, it adds the 16-bit integer element in [31:16] of 32-bit chunks + * in Rs2 with the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to + * [15:0] of 32-bit chunks in Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned operations. + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = Rs1.W[x][31:16] - Rs2.W[x][15:0]; + * Rd.W[x][15:0] = Rs1.W[x][15:0] + Rs2.W[x][31:16]; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CRSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("crsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.21. CRSA16 ===== */ + +/* ===== Inline Function Start for 3.22. INSB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief INSB (Insert Byte) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * (RV32) INSB Rd, Rs1, imm[1:0] + * (RV64) INSB Rd, Rs1, imm[2:0] + * ~~~ + * + * **Purpose**:\n + * Insert byte 0 of a 32-bit or 64-bit register into one of the byte elements of another register. + * + * **Description**:\n + * This instruction inserts byte 0 of Rs1 into byte `imm[1:0]` (RV32) or `imm[2:0]` (RV64) + * of Rd. + * + * **Operations**:\n + * ~~~ + * bpos = imm[1:0]; (RV32) + * bpos = imm[2:0]; (RV64) + * Rd.B[bpos] = Rs1.B[0] + * ~~~ + * + * \param [in] t unsigned long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_INSB(t, a, b) \ + ({ \ + register unsigned long __t = (unsigned long)(t); \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("insb %0, %1, %2" : "+r"(__t) : "r"(__a), "K"(b)); \ + __t; \ + }) +/* ===== Inline Function End for 3.22. INSB ===== */ + +/* ===== Inline Function Start for 3.23. KABS8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief KABS8 (SIMD 8-bit Saturating Absolute) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KABS8 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Get the absolute value of 8-bit signed integer elements simultaneously. + * + * **Description**:\n + * This instruction calculates the absolute value of 8-bit signed integer elements stored + * in Rs1 and writes the element results to Rd. If the input number is 0x80, this instruction generates + * 0x7f as the output and sets the OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.B[x]; + * if (src == 0x80) { + * src = 0x7f; + * OV = 1; + * } else if (src[7] == 1) + * src = -src; + * } + * Rd.B[x] = src; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KABS8(unsigned long a) +{ + register unsigned long result; + __ASM volatile("kabs8 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.23. KABS8 ===== */ + +/* ===== Inline Function Start for 3.24. KABS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief KABS16 (SIMD 16-bit Saturating Absolute) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KABS16 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Get the absolute value of 16-bit signed integer elements simultaneously. + * + * **Description**:\n + * This instruction calculates the absolute value of 16-bit signed integer elements stored + * in Rs1 and writes the element results to Rd. If the input number is 0x8000, this instruction + * generates 0x7fff as the output and sets the OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.H[x]; + * if (src == 0x8000) { + * src = 0x7fff; + * OV = 1; + * } else if (src[15] == 1) + * src = -src; + * } + * Rd.H[x] = src; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KABS16(unsigned long a) +{ + register unsigned long result; + __ASM volatile("kabs16 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.24. KABS16 ===== */ + +/* ===== Inline Function Start for 3.25. KABSW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KABSW (Scalar 32-bit Absolute Value with Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KABSW Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Get the absolute value of a signed 32-bit integer in a general register. + * + * **Description**:\n + * This instruction calculates the absolute value of a signed 32-bit integer stored in Rs1. + * The result is sign-extended (for RV64) and written to Rd. This instruction with the minimum + * negative integer input of 0x80000000 will produce a saturated output of maximum positive integer + * of 0x7fffffff and the OV flag will be set to 1. + * + * **Operations**:\n + * ~~~ + * if (Rs1.W[0] >= 0) { + * res = Rs1.W[0]; + * } else { + * If (Rs1.W[0] == 0x80000000) { + * res = 0x7fffffff; + * OV = 1; + * } else { + * res = -Rs1.W[0]; + * } + * } + * Rd = SE32(res); + * ~~~ + * + * \param [in] a signed long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KABSW(signed long a) +{ + register unsigned long result; + __ASM volatile("kabsw %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.25. KABSW ===== */ + +/* ===== Inline Function Start for 3.26. KADD8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB + * \brief KADD8 (SIMD 8-bit Signed Saturating Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KADD8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed integer element saturating additions simultaneously. + * + * **Description**:\n + * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed + * integer elements in Rs2. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), they + * are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.B[x] + Rs2.B[x]; + * if (res[x] > 127) { + * res[x] = 127; + * OV = 1; + * } else if (res[x] < -128) { + * res[x] = -128; + * OV = 1; + * } + * Rd.B[x] = res[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KADD8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kadd8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.26. KADD8 ===== */ + +/* ===== Inline Function Start for 3.27. KADD16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief KADD16 (SIMD 16-bit Signed Saturating Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KADD16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element saturating additions simultaneously. + * + * **Description**:\n + * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed + * integer elements in Rs2. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1), + * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.H[x] + Rs2.H[x]; + * if (res[x] > 32767) { + * res[x] = 32767; + * OV = 1; + * } else if (res[x] < -32768) { + * res[x] = -32768; + * OV = 1; + * } + * Rd.H[x] = res[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KADD16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kadd16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.27. KADD16 ===== */ + +/* ===== Inline Function Start for 3.28. KADD64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief KADD64 (64-bit Signed Saturating Addition) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * KADD64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add two 64-bit signed integers. The result is saturated to the Q63 range. + * + * **RV32 Description**:\n + * This instruction adds the 64-bit signed integer of an even/odd pair of registers + * specified by Rs1(4,1) with the 64-bit signed integer of an even/odd pair of registers specified by + * Rs2(4,1). If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the + * range and the OV bit is set to 1. The saturated result is written to an even/odd pair of registers + * specified by Rd(4,1). + * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register + * pair includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction adds the 64-bit signed integer in Rs1 with the 64-bit signed + * integer in Rs2. If the result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the + * range and the OV bit is set to 1. The saturated result is written to Rd. + * + * **Operations**:\n + * ~~~ + * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1); + * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1); + * result = R[a_H].R[a_L] + R[b_H].R[b_L]; + * if (result > (2^63)-1) { + * result = (2^63)-1; OV = 1; + * } else if (result < -2^63) { + * result = -2^63; OV = 1; + * } + * R[t_H].R[t_L] = result; + * RV64: + * result = Rs1 + Rs2; + * if (result > (2^63)-1) { + * result = (2^63)-1; OV = 1; + * } else if (result < -2^63) { + * result = -2^63; OV = 1; + * } + * Rd = result; + * ~~~ + * + * \param [in] a long long type of value stored in a + * \param [in] b long long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_KADD64(long long a, long long b) +{ + register long long result; + __ASM volatile("kadd64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.28. KADD64 ===== */ + +/* ===== Inline Function Start for 3.29. KADDH ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU + * \brief KADDH (Signed Addition with Q15 Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KADDH Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add the signed lower 32-bit content of two registers with Q15 saturation. + * + * **Description**:\n + * The signed lower 32-bit content of Rs1 is added with the signed lower 32-bit content of + * Rs2. And the result is saturated to the 16-bit signed integer range of [-2^15, 2^15-1] and then sign- + * extended and written to Rd. If saturation happens, this instruction sets the OV flag. + * + * **Operations**:\n + * ~~~ + * tmp = Rs1.W[0] + Rs2.W[0]; + * if (tmp > 32767) { + * res = 32767; + * OV = 1; + * } else if (tmp < -32768) { + * res = -32768; + * OV = 1 + * } else { + * res = tmp; + * } + * Rd = SE(tmp[15:0]); + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KADDH(int a, int b) +{ + register long result; + __ASM volatile("kaddh %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.29. KADDH ===== */ + +/* ===== Inline Function Start for 3.30. KADDW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KADDW (Signed Addition with Q31 Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KADDW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add the lower 32-bit signed content of two registers with Q31 saturation. + * + * **Description**:\n + * The lower 32-bit signed content of Rs1 is added with the lower 32-bit signed content of + * Rs2. And the result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1] and then sign- + * extended and written to Rd. If saturation happens, this instruction sets the OV flag. + * + * **Operations**:\n + * ~~~ + * tmp = Rs1.W[0] + Rs2.W[0]; + * if (tmp > (2^31)-1) { + * res = (2^31)-1; + * OV = 1; + * } else if (tmp < -2^31) { + * res = -2^31; + * OV = 1 + * } else { + * res = tmp; + * } + * Rd = res[31:0]; // RV32 + * Rd = SE(res[31:0]) // RV64 + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KADDW(int a, int b) +{ + register long result; + __ASM volatile("kaddw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.30. KADDW ===== */ + +/* ===== Inline Function Start for 3.31. KCRAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief KCRAS16 (SIMD 16-bit Signed Saturating Cross Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KCRAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element saturating addition and 16-bit signed integer element + * saturating subtraction in a 32-bit chunk simultaneously. Operands are from crossed positions in 32- + * bit chunks. + * + * **Description**:\n + * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in + * Rs1 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2; at the same time, it + * subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed + * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number + * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for + * subtraction. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0]; + * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16]; + * for (res in [res1, res2]) { + * if (res > (2^15)-1) { + * res = (2^15)-1; + * OV = 1; + * } else if (res < -2^15) { + * res = -2^15; + * OV = 1; + * } + * } + * Rd.W[x][31:16] = res1; + * Rd.W[x][15:0] = res2; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KCRAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kcras16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.31. KCRAS16 ===== */ + +/* ===== Inline Function Start for 3.32. KCRSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief KCRSA16 (SIMD 16-bit Signed Saturating Cross Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KCRSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element + * saturating addition in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit + * chunks. + * + * **Description**:\n + * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks + * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1; at the same time, it + * adds the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 with the 16-bit signed + * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number + * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd + * for addition. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0]; + * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16]; + * for (res in [res1, res2]) { + * if (res > (2^15)-1) { + * res = (2^15)-1; + * OV = 1; + * } else if (res < -2^15) { + * res = -2^15; + * OV = 1; + * } + * } + * Rd.W[x][31:16] = res1; + * Rd.W[x][15:0] = res2; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KCRSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kcrsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.32. KCRSA16 ===== */ + +/* ===== Inline Function Start for 3.33.1. KDMBB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KDMBB (Signed Saturating Double Multiply B16 x B16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is + * written into the destination register for RV32 or sign-extended to 64-bits and written into the + * destination register for RV64. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then + * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in + * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be + * saturated to 0x7FFFFFFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult = aop * bop; + * resQ31 = Mresult << 1; + * Rd = resQ31; // RV32 + * Rd = SE(resQ31); // RV64 + * } else { + * resQ31 = 0x7FFFFFFF; + * Rd = resQ31; // RV32 + * Rd = SE(resQ31); // RV64 + * OV = 1; + * } + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KDMBB(unsigned int a, unsigned int b) +{ + register long result; + __ASM volatile("kdmbb %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.33.1. KDMBB ===== */ + +/* ===== Inline Function Start for 3.33.2. KDMBT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KDMBT (Signed Saturating Double Multiply B16 x T16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is + * written into the destination register for RV32 or sign-extended to 64-bits and written into the + * destination register for RV64. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then + * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in + * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be + * saturated to 0x7FFFFFFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult = aop * bop; + * resQ31 = Mresult << 1; + * Rd = resQ31; // RV32 + * Rd = SE(resQ31); // RV64 + * } else { + * resQ31 = 0x7FFFFFFF; + * Rd = resQ31; // RV32 + * Rd = SE(resQ31); // RV64 + * OV = 1; + * } + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KDMBT(unsigned int a, unsigned int b) +{ + register long result; + __ASM volatile("kdmbt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.33.2. KDMBT ===== */ + +/* ===== Inline Function Start for 3.33.3. KDMTT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KDMTT (Signed Saturating Double Multiply T16 x T16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is + * written into the destination register for RV32 or sign-extended to 64-bits and written into the + * destination register for RV64. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then + * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in + * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be + * saturated to 0x7FFFFFFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult = aop * bop; + * resQ31 = Mresult << 1; + * Rd = resQ31; // RV32 + * Rd = SE(resQ31); // RV64 + * } else { + * resQ31 = 0x7FFFFFFF; + * Rd = resQ31; // RV32 + * Rd = SE(resQ31); // RV64 + * OV = 1; + * } + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KDMTT(unsigned int a, unsigned int b) +{ + register long result; + __ASM volatile("kdmtt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.33.3. KDMTT ===== */ + +/* ===== Inline Function Start for 3.34.1. KDMABB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KDMABB (Signed Saturating Double Multiply Addition B16 x B16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result + * with the sign-extended lower 32-bit chunk destination register and write the saturated addition + * result into the destination register. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then + * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the + * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and + * the OV flag is set to 1. The result after saturation is written to Rd. + * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be + * set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult = aop * bop; + * resQ31 = Mresult << 1; + * } else { + * resQ31 = 0x7FFFFFFF; + * OV = 1; + * } + * resadd = Rd + resQ31; // RV32 + * resadd = Rd.W[0] + resQ31; // RV64 + * if (resadd > (2^31)-1) { + * resadd = (2^31)-1; + * OV = 1; + * } else if (resadd < -2^31) { + * resadd = -2^31; + * OV = 1; + * } + * Rd = resadd; // RV32 + * Rd = SE(resadd); // RV64 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KDMABB(long t, unsigned int a, unsigned int b) +{ + __ASM volatile("kdmabb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.34.1. KDMABB ===== */ + +/* ===== Inline Function Start for 3.34.2. KDMABT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KDMABT (Signed Saturating Double Multiply Addition B16 x T16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result + * with the sign-extended lower 32-bit chunk destination register and write the saturated addition + * result into the destination register. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then + * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the + * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and + * the OV flag is set to 1. The result after saturation is written to Rd. + * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be + * set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult = aop * bop; + * resQ31 = Mresult << 1; + * } else { + * resQ31 = 0x7FFFFFFF; + * OV = 1; + * } + * resadd = Rd + resQ31; // RV32 + * resadd = Rd.W[0] + resQ31; // RV64 + * if (resadd > (2^31)-1) { + * resadd = (2^31)-1; + * OV = 1; + * } else if (resadd < -2^31) { + * resadd = -2^31; + * OV = 1; + * } + * Rd = resadd; // RV32 + * Rd = SE(resadd); // RV64 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KDMABT(long t, unsigned int a, unsigned int b) +{ + __ASM volatile("kdmabt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.34.2. KDMABT ===== */ + +/* ===== Inline Function Start for 3.34.3. KDMATT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KDMATT (Signed Saturating Double Multiply Addition T16 x T16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result + * with the sign-extended lower 32-bit chunk destination register and write the saturated addition + * result into the destination register. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then + * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the + * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and + * the OV flag is set to 1. The result after saturation is written to Rd. + * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be + * set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult = aop * bop; + * resQ31 = Mresult << 1; + * } else { + * resQ31 = 0x7FFFFFFF; + * OV = 1; + * } + * resadd = Rd + resQ31; // RV32 + * resadd = Rd.W[0] + resQ31; // RV64 + * if (resadd > (2^31)-1) { + * resadd = (2^31)-1; + * OV = 1; + * } else if (resadd < -2^31) { + * resadd = -2^31; + * OV = 1; + * } + * Rd = resadd; // RV32 + * Rd = SE(resadd); // RV64 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KDMATT(long t, unsigned int a, unsigned int b) +{ + __ASM volatile("kdmatt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.34.3. KDMATT ===== */ + +/* ===== Inline Function Start for 3.35.1. KHM8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY + * \brief KHM8 (SIMD Signed Saturating Q7 Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KHM8 Rd, Rs1, Rs2 + * KHMX8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7 + * numbers again. + * + * **Description**:\n + * For the `KHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 + * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7 + * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2. + * For the `KHMX16` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the + * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7 + * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2. + * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then + * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen. + * The result will be saturated to 0x7F and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * if (is `KHM8`) { + * op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top + * op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom + * } else if (is `KHMX8`) { + * op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top + * op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom + * } + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * if (0x80 != aop | 0x80 != bop) { + * res = (aop s* bop) >> 7; + * } else { + * res= 0x7F; + * OV = 1; + * } + * } + * Rd.H[x/2] = concat(rest, resb); + * for RV32, x=0,2 + * for RV64, x=0,2,4,6 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KHM8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("khm8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.35.1. KHM8 ===== */ + +/* ===== Inline Function Start for 3.35.2. KHMX8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY + * \brief KHMX8 (SIMD Signed Saturating Crossed Q7 Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KHM8 Rd, Rs1, Rs2 + * KHMX8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7 + * numbers again. + * + * **Description**:\n + * For the `KHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 + * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7 + * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2. + * For the `KHMX16` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the + * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7 + * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2. + * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then + * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen. + * The result will be saturated to 0x7F and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * if (is `KHM8`) { + * op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top + * op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom + * } else if (is `KHMX8`) { + * op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top + * op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom + * } + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * if (0x80 != aop | 0x80 != bop) { + * res = (aop s* bop) >> 7; + * } else { + * res= 0x7F; + * OV = 1; + * } + * } + * Rd.H[x/2] = concat(rest, resb); + * for RV32, x=0,2 + * for RV64, x=0,2,4,6 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KHMX8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("khmx8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.35.2. KHMX8 ===== */ + +/* ===== Inline Function Start for 3.36.1. KHM16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY + * \brief KHM16 (SIMD Signed Saturating Q15 Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KHM16 Rd, Rs1, Rs2 + * KHMX16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to + * Q15 numbers again. + * + * **Description**:\n + * For the `KHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in + * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom + * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in + * Rs2. + * For the `KHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the + * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15 + * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. + * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are + * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will + * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * if (is `KHM16`) { + * op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top + * op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom + * } else if (is `KHMX16`) { + * op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top + * op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom + * } + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * if (0x8000 != aop | 0x8000 != bop) { + * res = (aop s* bop) >> 15; + * } else { + * res= 0x7FFF; + * OV = 1; + * } + * } + * Rd.W[x/2] = concat(rest, resb); + * for RV32: x=0 + * for RV64: x=0,2 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KHM16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("khm16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.36.1. KHM16 ===== */ + +/* ===== Inline Function Start for 3.36.2. KHMX16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY + * \brief KHMX16 (SIMD Signed Saturating Crossed Q15 Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KHM16 Rd, Rs1, Rs2 + * KHMX16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to + * Q15 numbers again. + * + * **Description**:\n + * For the `KHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in + * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom + * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in + * Rs2. + * For the `KHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the + * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15 + * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. + * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are + * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will + * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * if (is `KHM16`) { + * op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top + * op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom + * } else if (is `KHMX16`) { + * op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top + * op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom + * } + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * if (0x8000 != aop | 0x8000 != bop) { + * res = (aop s* bop) >> 15; + * } else { + * res= 0x7FFF; + * OV = 1; + * } + * } + * Rd.W[x/2] = concat(rest, resb); + * for RV32: x=0 + * for RV64: x=0,2 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KHMX16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("khmx16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.36.2. KHMX16 ===== */ + +/* ===== Inline Function Start for 3.37.1. KHMBB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU + * \brief KHMBB (Signed Saturating Half Multiply B16 x B16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15 + * number again and saturate the Q15 result into the destination register. If saturation happens, an + * overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right- + * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sing-extended and written into + * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated + * to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult[31:0] = aop * bop; + * res[15:0] = Mresult[30:15]; + * } else { + * res[15:0] = 0x7FFF; + * OV = 1; + * } + * Rd = SE32(res[15:0]); // Rv32 + * Rd = SE64(res[15:0]); // RV64 + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KHMBB(unsigned int a, unsigned int b) +{ + register long result; + __ASM volatile("khmbb %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.37.1. KHMBB ===== */ + +/* ===== Inline Function Start for 3.37.2. KHMBT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU + * \brief KHMBT (Signed Saturating Half Multiply B16 x T16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15 + * number again and saturate the Q15 result into the destination register. If saturation happens, an + * overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right- + * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sing-extended and written into + * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated + * to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult[31:0] = aop * bop; + * res[15:0] = Mresult[30:15]; + * } else { + * res[15:0] = 0x7FFF; + * OV = 1; + * } + * Rd = SE32(res[15:0]); // Rv32 + * Rd = SE64(res[15:0]); // RV64 + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KHMBT(unsigned int a, unsigned int b) +{ + register long result; + __ASM volatile("khmbt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.37.2. KHMBT ===== */ + +/* ===== Inline Function Start for 3.37.3. KHMTT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU + * \brief KHMTT (Signed Saturating Half Multiply T16 x T16) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion + * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15 + * number again and saturate the Q15 result into the destination register. If saturation happens, an + * overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with + * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right- + * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sing-extended and written into + * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated + * to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB + * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT + * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult[31:0] = aop * bop; + * res[15:0] = Mresult[30:15]; + * } else { + * res[15:0] = 0x7FFF; + * OV = 1; + * } + * Rd = SE32(res[15:0]); // Rv32 + * Rd = SE64(res[15:0]); // RV64 + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KHMTT(unsigned int a, unsigned int b) +{ + register long result; + __ASM volatile("khmtt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.37.3. KHMTT ===== */ + +/* ===== Inline Function Start for 3.38.1. KMABB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMABB (SIMD Saturating Signed Multiply Bottom Halfs & Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMABB Rd, Rs1, Rs2 + * KMABT Rd, Rs1, Rs2 + * KMATT Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content + * of 32-bit elements in another register and add the result to the content of 32-bit elements in the + * third register. The addition result may be saturated and is written to the third register. + * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element) + * * KMABT rd.W[x] + bottom*top (per 32-bit element) + * * KMATT rd.W[x] + top*top (per 32-bit element) + * + * **Description**:\n + * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the bottom 16-bit content of 32-bit elements in Rs2. + * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the top 16-bit content of 32-bit elements in Rs2. + * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the + * top 16-bit content of 32-bit elements in Rs2. + * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is + * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to + * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as + * signed integers. + * + * **Operations**:\n + * ~~~ + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMABB(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmabb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.38.1. KMABB ===== */ + +/* ===== Inline Function Start for 3.38.2. KMABT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMABT (SIMD Saturating Signed Multiply Bottom & Top Halfs & Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMABB Rd, Rs1, Rs2 + * KMABT Rd, Rs1, Rs2 + * KMATT Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content + * of 32-bit elements in another register and add the result to the content of 32-bit elements in the + * third register. The addition result may be saturated and is written to the third register. + * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element) + * * KMABT rd.W[x] + bottom*top (per 32-bit element) + * * KMATT rd.W[x] + top*top (per 32-bit element) + * + * **Description**:\n + * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the bottom 16-bit content of 32-bit elements in Rs2. + * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the top 16-bit content of 32-bit elements in Rs2. + * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the + * top 16-bit content of 32-bit elements in Rs2. + * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is + * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to + * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as + * signed integers. + * + * **Operations**:\n + * ~~~ + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMABT(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmabt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.38.2. KMABT ===== */ + +/* ===== Inline Function Start for 3.38.3. KMATT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMATT (SIMD Saturating Signed Multiply Top Halfs & Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMABB Rd, Rs1, Rs2 + * KMABT Rd, Rs1, Rs2 + * KMATT Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content + * of 32-bit elements in another register and add the result to the content of 32-bit elements in the + * third register. The addition result may be saturated and is written to the third register. + * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element) + * * KMABT rd.W[x] + bottom*top (per 32-bit element) + * * KMATT rd.W[x] + top*top (per 32-bit element) + * + * **Description**:\n + * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the bottom 16-bit content of 32-bit elements in Rs2. + * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the top 16-bit content of 32-bit elements in Rs2. + * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the + * top 16-bit content of 32-bit elements in Rs2. + * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is + * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to + * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as + * signed integers. + * + * **Operations**:\n + * ~~~ + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMATT(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmatt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.38.3. KMATT ===== */ + +/* ===== Inline Function Start for 3.39.1. KMADA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMADA (SIMD Saturating Signed Multiply Two Halfs and Two Adds) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMADA Rd, Rs1, Rs2 + * KMAXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then adds + * the two 32-bit results and 32-bit elements in a third register together. The addition result may be + * saturated. + * * KMADA: rd.W[x] + top*top + bottom*bottom (per 32-bit element) + * * KMAXDA: rd.W[x] + top*bottom + bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `KMADA instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of + * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit + * elements in Rs2. + * For the `KMAXDA` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the + * bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of multiplying + * the bottom 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in + * Rs2. + * The result is added to the content of 32-bit elements in Rd. If the addition result is beyond the Q31 + * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The 32-bit + * results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * // KMADA + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * // KMAXDA + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMADA(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmada %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.39.1. KMADA ===== */ + +/* ===== Inline Function Start for 3.39.2. KMAXDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMAXDA (SIMD Saturating Signed Crossed Multiply Two Halfs and Two Adds) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMADA Rd, Rs1, Rs2 + * KMAXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then adds + * the two 32-bit results and 32-bit elements in a third register together. The addition result may be + * saturated. + * * KMADA: rd.W[x] + top*top + bottom*bottom (per 32-bit element) + * * KMAXDA: rd.W[x] + top*bottom + bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `KMADA instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of + * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit + * elements in Rs2. + * For the `KMAXDA` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the + * bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of multiplying + * the bottom 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in + * Rs2. + * The result is added to the content of 32-bit elements in Rd. If the addition result is beyond the Q31 + * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The 32-bit + * results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * // KMADA + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * // KMAXDA + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMAXDA(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmaxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.39.2. KMAXDA ===== */ + +/* ===== Inline Function Start for 3.40.1. KMADS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMADS (SIMD Saturating Signed Multiply Two Halfs & Subtract & Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMADS Rd, Rs1, Rs2 + * KMADRS Rd, Rs1, Rs2 + * KMAXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then + * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to + * the corresponding 32-bit elements in a third register. The addition result may be saturated. + * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element) + * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element) + * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element) + * + * **Description**:\n + * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit + * elements in Rs2. + * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the + * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32- + * bit elements in Rs2. + * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit + * elements in Rs2. + * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the + * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and + * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1 + * and Rs2 are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * // KMADS + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * // KMADRS + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]); + * // KMAXDS + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMADS(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmads %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.40.1. KMADS ===== */ + +/* ===== Inline Function Start for 3.40.2. KMADRS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMADRS (SIMD Saturating Signed Multiply Two Halfs & Reverse Subtract & Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMADS Rd, Rs1, Rs2 + * KMADRS Rd, Rs1, Rs2 + * KMAXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then + * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to + * the corresponding 32-bit elements in a third register. The addition result may be saturated. + * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element) + * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element) + * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element) + * + * **Description**:\n + * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit + * elements in Rs2. + * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the + * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32- + * bit elements in Rs2. + * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit + * elements in Rs2. + * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the + * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and + * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1 + * and Rs2 are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * // KMADS + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * // KMADRS + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]); + * // KMAXDS + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMADRS(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmadrs %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.40.2. KMADRS ===== */ + +/* ===== Inline Function Start for 3.40.3. KMAXDS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMAXDS (SIMD Saturating Signed Crossed Multiply Two Halfs & Subtract & Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMADS Rd, Rs1, Rs2 + * KMADRS Rd, Rs1, Rs2 + * KMAXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then + * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to + * the corresponding 32-bit elements in a third register. The addition result may be saturated. + * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element) + * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element) + * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element) + * + * **Description**:\n + * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit + * elements in Rs2. + * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the + * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32- + * bit elements in Rs2. + * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with + * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of + * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit + * elements in Rs2. + * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the + * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and + * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1 + * and Rs2 are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * // KMADS + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * // KMADRS + * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]); + * // KMAXDS + * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMAXDS(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmaxds %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.40.3. KMAXDS ===== */ + +/* ===== Inline Function Start for 3.41. KMAR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB + * \brief KMAR64 (Signed Multiply and Saturating Add to 64-Bit Data) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * KMAR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit signed elements in two registers and add the 64-bit multiplication + * results to the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is + * saturated to the Q63 range and written back to the pair of registers (RV32) or the register (RV64). + * + * **RV32 Description**:\n + * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It adds + * the 64-bit multiplication result to the 64-bit signed data of an even/odd pair of registers specified by + * Rd(4,1) with unlimited precision. If the 64-bit addition result is beyond the Q63 number range (-2^63 <= + * Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The saturated result is written back + * to the even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register + * pair includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It + * adds the 64-bit multiplication results to the 64-bit signed data of Rd with unlimited precision. If the + * 64-bit addition result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range + * and the OV bit is set to 1. The saturated result is written back to Rd. + * + * **Operations**:\n + * ~~~ + * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * result = R[t_H].R[t_L] + (Rs1 * Rs2); + * if (result > (2^63)-1) { + * result = (2^63)-1; OV = 1; + * } else if (result < -2^63) { + * result = -2^63; OV = 1; + * } + * R[t_H].R[t_L] = result; + * RV64: + * // `result` has unlimited precision + * result = Rd + (Rs1.W[0] * Rs2.W[0]) + (Rs1.W[1] * Rs2.W[1]); + * if (result > (2^63)-1) { + * result = (2^63)-1; OV = 1; + * } else if (result < -2^63) { + * result = -2^63; OV = 1; + * } + * Rd = result; + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_KMAR64(long long t, long a, long b) +{ + __ASM volatile("kmar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.41. KMAR64 ===== */ + +/* ===== Inline Function Start for 3.42.1. KMDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMDA (SIMD Signed Multiply Two Halfs and Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMDA Rd, Rs1, Rs2 + * KMXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * adds the two 32-bit results together. The addition result may be saturated. + * * KMDA: top*top + bottom*bottom (per 32-bit element) + * * KMXDA: top*bottom + bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `KMDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of + * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32- + * bit elements of Rs2. + * For the `KMXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of + * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the + * 32-bit elements of Rs2. + * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1. + * The final results are written to Rd. The 16-bit contents are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * if Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000 { // KMDA Rd.W[x] = Rs1.W[x].H[1] * + * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]; // KMXDA Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[0]) + * + (Rs1.W[x].H[0] * Rs2.W[x].H[1]; } else { Rd.W[x] = 0x7fffffff; OV = 1; } for RV32: x=0 for RV64: + * x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMDA(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("kmda %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.42.1. KMDA ===== */ + +/* ===== Inline Function Start for 3.42.2. KMXDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMXDA (SIMD Signed Crossed Multiply Two Halfs and Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMDA Rd, Rs1, Rs2 + * KMXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * adds the two 32-bit results together. The addition result may be saturated. + * * KMDA: top*top + bottom*bottom (per 32-bit element) + * * KMXDA: top*bottom + bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `KMDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of + * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32- + * bit elements of Rs2. + * For the `KMXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of + * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the + * 32-bit elements of Rs2. + * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1. + * The final results are written to Rd. The 16-bit contents are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * if Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000 { // KMDA Rd.W[x] = Rs1.W[x].H[1] * + * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]; // KMXDA Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[0]) + * + (Rs1.W[x].H[0] * Rs2.W[x].H[1]; } else { Rd.W[x] = 0x7fffffff; OV = 1; } for RV32: x=0 for RV64: + * x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMXDA(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("kmxda %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.42.2. KMXDA ===== */ + +/* ===== Inline Function Start for 3.43.1. KMMAC ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC + * \brief KMMAC (SIMD Saturating MSW Signed Multiply Word and Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAC Rd, Rs1, Rs2 + * KMMAC.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of two registers and add the most significant + * 32-bit results with the signed 32-bit integer elements of a third register. The addition results are + * saturated first and then written back to the third register. The `.u` form performs an additional + * rounding up operation on the multiplication results before adding the most significant 32-bit part + * of the results. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2 + * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If + * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range + * and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the + * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by + * adding a 1 to bit 31 of the results. + * + * **Operations**:\n + * ~~~ + * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][63:31] + 1; + * res[x] = Rd.W[x] + Round[x][32:1]; + * } else { + * res[x] = Rd.W[x] + Mres[x][63:32]; + * } + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAC(long t, long a, long b) +{ + __ASM volatile("kmmac %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.43.1. KMMAC ===== */ + +/* ===== Inline Function Start for 3.43.2. KMMAC.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC + * \brief KMMAC.u (SIMD Saturating MSW Signed Multiply Word and Add with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAC Rd, Rs1, Rs2 + * KMMAC.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of two registers and add the most significant + * 32-bit results with the signed 32-bit integer elements of a third register. The addition results are + * saturated first and then written back to the third register. The `.u` form performs an additional + * rounding up operation on the multiplication results before adding the most significant 32-bit part + * of the results. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2 + * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If + * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range + * and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the + * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by + * adding a 1 to bit 31 of the results. + * + * **Operations**:\n + * ~~~ + * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][63:31] + 1; + * res[x] = Rd.W[x] + Round[x][32:1]; + * } else { + * res[x] = Rd.W[x] + Mres[x][63:32]; + * } + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAC_U(long t, long a, long b) +{ + __ASM volatile("kmmac.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.43.2. KMMAC.u ===== */ + +/* ===== Inline Function Start for 3.44.1. KMMAWB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMAWB (SIMD Saturating MSW Signed Multiply Word and Bottom Half and Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAWB Rd, Rs1, Rs2 + * KMMAWB.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the + * corresponding 32-bit elements of another register and add the most significant 32-bit results with + * the corresponding signed 32-bit elements of a third register. The addition result is written to the + * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication + * results from the most significant discarded bit before the addition operations. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content + * of the corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication + * results with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31 + * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results + * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the + * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to + * bit 15 of the result before the addition operations. + * + * **Operations**:\n + * ~~~ + * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][47:15] + 1; + * res[x] = Rd.W[x] + Round[x][32:1]; + * } else { + * res[x] = Rd.W[x] + Mres[x][47:16]; + * } + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAWB(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmmawb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.44.1. KMMAWB ===== */ + +/* ===== Inline Function Start for 3.44.2. KMMAWB.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMAWB.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half and Add with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAWB Rd, Rs1, Rs2 + * KMMAWB.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the + * corresponding 32-bit elements of another register and add the most significant 32-bit results with + * the corresponding signed 32-bit elements of a third register. The addition result is written to the + * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication + * results from the most significant discarded bit before the addition operations. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content + * of the corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication + * results with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31 + * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results + * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the + * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to + * bit 15 of the result before the addition operations. + * + * **Operations**:\n + * ~~~ + * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][47:15] + 1; + * res[x] = Rd.W[x] + Round[x][32:1]; + * } else { + * res[x] = Rd.W[x] + Mres[x][47:16]; + * } + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAWB_U(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmmawb.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.44.2. KMMAWB.u ===== */ + +/* ===== Inline Function Start for 3.45.1. KMMAWB2 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMAWB2 (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 and Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAWB2 Rd, Rs1, Rs2 + * KMMAWB2.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit elements of one register and the bottom 16-bit of the + * corresponding 32-bit elements of another register, double the multiplication results and add the + * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third + * register. The saturated addition result is written to the corresponding 32-bit elements of the third + * register. The `.u` form rounds up the multiplication results from the most significant discarded bit + * before the addition operations. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15 + * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and + * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed + * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is + * saturated to the range and the OV bit is set to 1. The results after saturation are written to the + * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant + * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of + * the result before the addition operations. + * + * **Operations**:\n + * ~~~ + * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) { + * addop.W[x] = 0x7fffffff; + * OV = 1; + * } else { + * Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0]; + * if (`.u` form) { + * Mres[x][47:14] = Mres[x][47:14] + 1; + * } + * addop.W[x] = Mres[x][46:15]; // doubling + * } + * res[x] = Rd.W[x] + addop.W[x]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAWB2(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmmawb2 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.45.1. KMMAWB2 ===== */ + +/* ===== Inline Function Start for 3.45.2. KMMAWB2.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMAWB2.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 and Add with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAWB2 Rd, Rs1, Rs2 + * KMMAWB2.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit elements of one register and the bottom 16-bit of the + * corresponding 32-bit elements of another register, double the multiplication results and add the + * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third + * register. The saturated addition result is written to the corresponding 32-bit elements of the third + * register. The `.u` form rounds up the multiplication results from the most significant discarded bit + * before the addition operations. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15 + * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and + * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed + * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is + * saturated to the range and the OV bit is set to 1. The results after saturation are written to the + * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant + * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of + * the result before the addition operations. + * + * **Operations**:\n + * ~~~ + * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) { + * addop.W[x] = 0x7fffffff; + * OV = 1; + * } else { + * Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0]; + * if (`.u` form) { + * Mres[x][47:14] = Mres[x][47:14] + 1; + * } + * addop.W[x] = Mres[x][46:15]; // doubling + * } + * res[x] = Rd.W[x] + addop.W[x]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAWB2_U(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmmawb2.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.45.2. KMMAWB2.u ===== */ + +/* ===== Inline Function Start for 3.46.1. KMMAWT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMAWT (SIMD Saturating MSW Signed Multiply Word and Top Half and Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAWT Rd, Rs1, Rs2 + * KMMAWT.u Rd Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the signed top 16-bit of the + * corresponding 32-bit elements of another register and add the most significant 32-bit results with + * the corresponding signed 32-bit elements of a third register. The addition results are written to the + * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication + * results from the most significant discarded bit before the addition operations. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed top 16-bit of the + * corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication results + * with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31 + * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results + * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the + * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to + * bit 15 of the result before the addition operations. + * + * **Operations**:\n + * ~~~ + * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][47:15] + 1; + * res[x] = Rd.W[x] + Round[x][32:1]; + * } else { + * res[x] = Rd.W[x] + Mres[x][47:16]; + * } + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAWT(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmmawt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.46.1. KMMAWT ===== */ + +/* ===== Inline Function Start for 3.46.2. KMMAWT.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMAWT.u (SIMD Saturating MSW Signed Multiply Word and Top Half and Add with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAWT Rd, Rs1, Rs2 + * KMMAWT.u Rd Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the signed top 16-bit of the + * corresponding 32-bit elements of another register and add the most significant 32-bit results with + * the corresponding signed 32-bit elements of a third register. The addition results are written to the + * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication + * results from the most significant discarded bit before the addition operations. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed top 16-bit of the + * corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication results + * with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31 + * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results + * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the + * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to + * bit 15 of the result before the addition operations. + * + * **Operations**:\n + * ~~~ + * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][47:15] + 1; + * res[x] = Rd.W[x] + Round[x][32:1]; + * } else { + * res[x] = Rd.W[x] + Mres[x][47:16]; + * } + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAWT_U(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmmawt.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.46.2. KMMAWT.u ===== */ + +/* ===== Inline Function Start for 3.47.1. KMMAWT2 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMAWT2 (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 and Add) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAWT2 Rd, Rs1, Rs2 + * KMMAWT2.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit elements of one register and the top 16-bit of the + * corresponding 32-bit elements of another register, double the multiplication results and add the + * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third + * register. The saturated addition result is written to the corresponding 32-bit elements of the third + * register. The `.u` form rounds up the multiplication results from the most significant discarded bit + * before the addition operations. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15 + * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and + * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed + * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is + * saturated to the range and the OV bit is set to 1. The results after saturation are written to the + * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant + * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of + * the result before the addition operations. + * + * **Operations**:\n + * ~~~ + * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) { + * addop.W[x] = 0x7fffffff; + * OV = 1; + * } else { + * Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1]; + * if (`.u` form) { + * Mres[x][47:14] = Mres[x][47:14] + 1; + * } + * addop.W[x] = Mres[x][46:15]; // doubling + * } + * res[x] = Rd.W[x] + addop.W[x]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAWT2(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmmawt2 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.47.1. KMMAWT2 ===== */ + +/* ===== Inline Function Start for 3.47.2. KMMAWT2.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMAWT2.u (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 and Add with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMAWT2 Rd, Rs1, Rs2 + * KMMAWT2.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit elements of one register and the top 16-bit of the + * corresponding 32-bit elements of another register, double the multiplication results and add the + * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third + * register. The saturated addition result is written to the corresponding 32-bit elements of the third + * register. The `.u` form rounds up the multiplication results from the most significant discarded bit + * before the addition operations. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15 + * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and + * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed + * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is + * saturated to the range and the OV bit is set to 1. The results after saturation are written to the + * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant + * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of + * the result before the addition operations. + * + * **Operations**:\n + * ~~~ + * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) { + * addop.W[x] = 0x7fffffff; + * OV = 1; + * } else { + * Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1]; + * if (`.u` form) { + * Mres[x][47:14] = Mres[x][47:14] + 1; + * } + * addop.W[x] = Mres[x][46:15]; // doubling + * } + * res[x] = Rd.W[x] + addop.W[x]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMAWT2_U(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmmawt2.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.47.2. KMMAWT2.u ===== */ + +/* ===== Inline Function Start for 3.48.1. KMMSB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC + * \brief KMMSB (SIMD Saturating MSW Signed Multiply Word and Subtract) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMSB Rd, Rs1, Rs2 + * KMMSB.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of two registers and subtract the most + * significant 32-bit results from the signed 32-bit elements of a third register. The subtraction results + * are written to the third register. The `.u` form performs an additional rounding up operation on + * the multiplication results before subtracting the most significant 32-bit part of the results. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2 + * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of + * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the + * range and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the + * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by + * adding a 1 to bit 31 of the results. + * + * **Operations**:\n + * ~~~ + * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][63:31] + 1; + * res[x] = Rd.W[x] - Round[x][32:1]; + * } else { + * res[x] = Rd.W[x] - Mres[x][63:32]; + * } + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMSB(long t, long a, long b) +{ + __ASM volatile("kmmsb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.48.1. KMMSB ===== */ + +/* ===== Inline Function Start for 3.48.2. KMMSB.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC + * \brief KMMSB.u (SIMD Saturating MSW Signed Multiply Word and Subtraction with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMSB Rd, Rs1, Rs2 + * KMMSB.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of two registers and subtract the most + * significant 32-bit results from the signed 32-bit elements of a third register. The subtraction results + * are written to the third register. The `.u` form performs an additional rounding up operation on + * the multiplication results before subtracting the most significant 32-bit part of the results. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2 + * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of + * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the + * range and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the + * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by + * adding a 1 to bit 31 of the results. + * + * **Operations**:\n + * ~~~ + * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][63:31] + 1; + * res[x] = Rd.W[x] - Round[x][32:1]; + * } else { + * res[x] = Rd.W[x] - Mres[x][63:32]; + * } + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMSB_U(long t, long a, long b) +{ + __ASM volatile("kmmsb.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.48.2. KMMSB.u ===== */ + +/* ===== Inline Function Start for 3.49.1. KMMWB2 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMWB2 (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMWB2 Rd, Rs1, Rs2 + * KMMWB2.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the + * corresponding 32-bit elements of another register, double the multiplication results and write the + * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u` + * form rounds up the results from the most significant discarded bit. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15 + * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and + * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit + * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit + * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results. + * + * **Operations**:\n + * ~~~ + * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) { + * Rd.W[x] = 0x7fffffff; + * OV = 1; + * } else { + * Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][46:14] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][46:15]; + * } + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMWB2(long a, unsigned long b) +{ + register long result; + __ASM volatile("kmmwb2 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.49.1. KMMWB2 ===== */ + +/* ===== Inline Function Start for 3.49.2. KMMWB2.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMWB2.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMWB2 Rd, Rs1, Rs2 + * KMMWB2.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the + * corresponding 32-bit elements of another register, double the multiplication results and write the + * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u` + * form rounds up the results from the most significant discarded bit. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15 + * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and + * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit + * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit + * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results. + * + * **Operations**:\n + * ~~~ + * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) { + * Rd.W[x] = 0x7fffffff; + * OV = 1; + * } else { + * Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][46:14] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][46:15]; + * } + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMWB2_U(long a, unsigned long b) +{ + register long result; + __ASM volatile("kmmwb2.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.49.2. KMMWB2.u ===== */ + +/* ===== Inline Function Start for 3.50.1. KMMWT2 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMWT2 (SIMD Saturating MSW Signed Multiply Word and Top Half & 2) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMWT2 Rd, Rs1, Rs2 + * KMMWT2.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the + * corresponding 32-bit elements of another register, double the multiplication results and write the + * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u` + * form rounds up the results from the most significant discarded bit. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15 + * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and + * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit + * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit + * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results. + * + * **Operations**:\n + * ~~~ + * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) { + * Rd.W[x] = 0x7fffffff; + * OV = 1; + * } else { + * Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][46:14] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][46:15]; + * } + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMWT2(long a, unsigned long b) +{ + register long result; + __ASM volatile("kmmwt2 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.50.1. KMMWT2 ===== */ + +/* ===== Inline Function Start for 3.50.2. KMMWT2.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief KMMWT2.u (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMMWT2 Rd, Rs1, Rs2 + * KMMWT2.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the + * corresponding 32-bit elements of another register, double the multiplication results and write the + * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u` + * form rounds up the results from the most significant discarded bit. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15 + * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and + * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit + * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit + * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results. + * + * **Operations**:\n + * ~~~ + * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) { + * Rd.W[x] = 0x7fffffff; + * OV = 1; + * } else { + * Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][46:14] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][46:15]; + * } + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMMWT2_U(long a, unsigned long b) +{ + register long result; + __ASM volatile("kmmwt2.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.50.2. KMMWT2.u ===== */ + +/* ===== Inline Function Start for 3.51.1. KMSDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMSDA (SIMD Saturating Signed Multiply Two Halfs & Add & Subtract) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMSDA Rd, Rs1, Rs2 + * KMSXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * subtracts the two 32-bit results from the corresponding 32-bit elements of a third register. The + * subtraction result may be saturated. + * * KMSDA: rd.W[x] - top*top - bottom*bottom (per 32-bit element) + * * KMSXDA: rd.W[x] - top*bottom - bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `KMSDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of + * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2. + * For the `KMSXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the + * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2. + * The two 32-bit multiplication results are then subtracted from the content of the corresponding 32- + * bit elements of Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is + * saturated to the range and the OV bit is set to 1. The results after saturation are written to Rd. The + * 16-bit contents are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * // KMSDA + * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * // KMSXDA + * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMSDA(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmsda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.51.1. KMSDA ===== */ + +/* ===== Inline Function Start for 3.51.2. KMSXDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief KMSXDA (SIMD Saturating Signed Crossed Multiply Two Halfs & Add & Subtract) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KMSDA Rd, Rs1, Rs2 + * KMSXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * subtracts the two 32-bit results from the corresponding 32-bit elements of a third register. The + * subtraction result may be saturated. + * * KMSDA: rd.W[x] - top*top - bottom*bottom (per 32-bit element) + * * KMSXDA: rd.W[x] - top*bottom - bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `KMSDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of + * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2. + * For the `KMSXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the + * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2. + * The two 32-bit multiplication results are then subtracted from the content of the corresponding 32- + * bit elements of Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is + * saturated to the range and the OV bit is set to 1. The results after saturation are written to Rd. The + * 16-bit contents are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * // KMSDA + * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * // KMSXDA + * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMSXDA(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmsxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.51.2. KMSXDA ===== */ + +/* ===== Inline Function Start for 3.52. KMSR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB + * \brief KMSR64 (Signed Multiply and Saturating Subtract from 64-Bit Data) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * KMSR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit signed elements in two registers and subtract the 64-bit multiplication + * results from the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is + * saturated to the Q63 range and written back to the pair of registers (RV32) or the register (RV64). + * + * **RV32 Description**:\n + * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It + * subtracts the 64-bit multiplication result from the 64-bit signed data of an even/odd pair of registers + * specified by Rd(4,1) with unlimited precision. If the 64-bit subtraction result is beyond the Q63 + * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The saturated + * result is written back to the even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It + * subtracts the 64-bit multiplication results from the 64-bit signed data in Rd with unlimited + * precision. If the 64-bit subtraction result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is + * saturated to the range and the OV bit is set to 1. The saturated result is written back to Rd. + * + * **Operations**:\n + * ~~~ + * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * result = R[t_H].R[t_L] - (Rs1 * Rs2); + * if (result > (2^63)-1) { + * result = (2^63)-1; OV = 1; + * } else if (result < -2^63) { + * result = -2^63; OV = 1; + * } + * R[t_H].R[t_L] = result; + * RV64: + * // `result` has unlimited precision + * result = Rd - (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); + * if (result > (2^63)-1) { + * result = (2^63)-1; OV = 1; + * } else if (result < -2^63) { + * result = -2^63; OV = 1; + * } + * Rd = result; + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_KMSR64(long long t, long a, long b) +{ + __ASM volatile("kmsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.52. KMSR64 ===== */ + +/* ===== Inline Function Start for 3.53. KSLLW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KSLLW (Saturating Shift Left Logical for Word) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KSLLW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do logical left shift operation with saturation on a 32-bit word. The shift amount is a + * variable from a GPR. + * + * **Description**:\n + * The first word data in Rs1 is left-shifted logically. The shifted out bits are filled with + * zero and the shift amount is specified by the low-order 5-bits of the value in the Rs2 register. Any + * shifted value greater than 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated + * to -2^31. And the saturated result is sign-extended and written to Rd. If any saturation is performed, + * set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[4:0]; + * res[(31+sa):0] = Rs1.W[0] << sa; + * if (res > (2^31)-1) { + * res = 0x7fffffff; OV = 1; + * } else if (res < -2^31) { + * res = 0x80000000; OV = 1; + * } + * Rd[31:0] = res[31:0]; // RV32 + * Rd[63:0] = SE(res[31:0]); // RV64 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KSLLW(long a, unsigned int b) +{ + register long result; + __ASM volatile("ksllw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.53. KSLLW ===== */ + +/* ===== Inline Function Start for 3.54. KSLLIW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KSLLIW (Saturating Shift Left Logical Immediate for Word) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KSLLIW Rd, Rs1, imm5u + * ~~~ + * + * **Purpose**:\n + * Do logical left shift operation with saturation on a 32-bit word. The shift amount is an + * immediate value. + * + * **Description**:\n + * The first word data in Rs1 is left-shifted logically. The shifted out bits are filled with + * zero and the shift amount is specified by the imm5u constant. Any shifted value greater than 2^31-1 is + * saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated to -2^31. And the saturated result is + * sign-extended and written to Rd. If any saturation is performed, set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * sa = imm5u; + * res[(31+sa):0] = Rs1.W[0] << sa; + * if (res > (2^31)-1) { + * res = 0x7fffffff; OV = 1; + * } else if (res < -2^31) { + * res = 0x80000000; OV = 1; + * } + * Rd[31:0] = res[31:0]; // RV32 + * Rd[63:0] = SE(res[31:0]); // RV64 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +#define __RV_KSLLIW(a, b) \ + ({ \ + register long result; \ + register long __a = (long)(a); \ + __ASM volatile("kslliw %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.54. KSLLIW ===== */ + +/* ===== Inline Function Start for 3.55. KSLL8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief KSLL8 (SIMD 8-bit Saturating Shift Left Logical) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSLL8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical left shift operations with saturation simultaneously. The shift + * amount is a variable from a GPR. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled + * with zero and the shift amount is specified by the low-order 3-bits of the value in the Rs2 register. + * Any shifted value greater than 2^7-1 is saturated to 2^7-1. Any shifted value smaller than -2^7 is + * saturated to -2^7. And the saturated results are written to Rd. If any saturation is performed, set OV + * bit to 1. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[2:0]; + * if (sa != 0) { + * res[(7+sa):0] = Rs1.B[x] << sa; + * if (res > (2^7)-1) { + * res = 0x7f; OV = 1; + * } else if (res < -2^7) { + * res = 0x80; OV = 1; + * } + * Rd.B[x] = res[7:0]; + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLL8(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("ksll8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.55. KSLL8 ===== */ + +/* ===== Inline Function Start for 3.56. KSLLI8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief KSLLI8 (SIMD 8-bit Saturating Shift Left Logical Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSLLI8 Rd, Rs1, imm3u + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical left shift operations with saturation simultaneously. The shift + * amount is an immediate value. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled + * with zero and the shift amount is specified by the imm3u constant. Any shifted value greater than + * 2^7-1 is saturated to 2^7-1. Any shifted value smaller than -2^7 is saturated to -2^7. And the saturated + * results are written to Rd. If any saturation is performed, set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * sa = imm3u[2:0]; + * if (sa != 0) { + * res[(7+sa):0] = Rs1.B[x] << sa; + * if (res > (2^7)-1) { + * res = 0x7f; OV = 1; + * } else if (res < -2^7) { + * res = 0x80; OV = 1; + * } + * Rd.B[x] = res[7:0]; + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_KSLLI8(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("kslli8 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.56. KSLLI8 ===== */ + +/* ===== Inline Function Start for 3.57. KSLL16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief KSLL16 (SIMD 16-bit Saturating Shift Left Logical) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSLL16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical left shift operations with saturation simultaneously. The shift + * amount is a variable from a GPR. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled + * with zero and the shift amount is specified by the low-order 4-bits of the value in the Rs2 register. + * Any shifted value greater than 2^15-1 is saturated to 2^15-1. Any shifted value smaller than -2^15 is + * saturated to -2^15. And the saturated results are written to Rd. If any saturation is performed, set OV + * bit to 1. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[3:0]; + * if (sa != 0) { + * res[(15+sa):0] = Rs1.H[x] << sa; + * if (res > (2^15)-1) { + * res = 0x7fff; OV = 1; + * } else if (res < -2^15) { + * res = 0x8000; OV = 1; + * } + * Rd.H[x] = res[15:0]; + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLL16(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("ksll16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.57. KSLL16 ===== */ + +/* ===== Inline Function Start for 3.58. KSLLI16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief KSLLI16 (SIMD 16-bit Saturating Shift Left Logical Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSLLI16 Rd, Rs1, imm4u + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical left shift operations with saturation simultaneously. The shift + * amount is an immediate value. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled + * with zero and the shift amount is specified by the imm4u constant. Any shifted value greater than + * 2^15-1 is saturated to 2^15-1. Any shifted value smaller than -2^15 is saturated to -2^15. And the saturated + * results are written to Rd. If any saturation is performed, set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * sa = imm4u[3:0]; + * if (sa != 0) { + * res[(15+sa):0] = Rs1.H[x] << sa; + * if (res > (2^15)-1) { + * res = 0x7fff; OV = 1; + * } else if (res < -2^15) { + * res = 0x8000; OV = 1; + * } + * Rd.H[x] = res[15:0]; + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_KSLLI16(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("kslli16 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.58. KSLLI16 ===== */ + +/* ===== Inline Function Start for 3.59.1. KSLRA8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief KSLRA8 (SIMD 8-bit Shift Left Logical with Saturation or Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSLRA8 Rd, Rs1, Rs2 + * KSLRA8.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with + * Q7 saturation for the left shift. The `.u` form performs additional rounding up operations for the + * right shift. + * + * **Description**:\n + * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means + * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be + * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`. + * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. For the `.u` form + * of the instruction, the right-shifted results are added a 1 to the most significant discarded bit + * position for rounding effect. After the shift, saturation, or rounding, the final results are written to + * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect + * this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[3:0] < 0) { + * sa = -Rs2[3:0]; + * sa = (sa == 8)? 7 : sa; + * if (`.u` form) { + * res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[7:0]; + * } else { + * Rd.B[x] = SE8(Rs1.B[x][7:sa]); + * } + * } else { + * sa = Rs2[2:0]; + * res[(7+sa):0] = Rs1.B[x] <<(logic) sa; + * if (res > (2^7)-1) { + * res[7:0] = 0x7f; OV = 1; + * } else if (res < -2^7) { + * res[7:0] = 0x80; OV = 1; + * } + * Rd.B[x] = res[7:0]; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLRA8(unsigned long a, int b) +{ + register unsigned long result; + __ASM volatile("kslra8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.59.1. KSLRA8 ===== */ + +/* ===== Inline Function Start for 3.59.2. KSLRA8.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief KSLRA8.u (SIMD 8-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSLRA8 Rd, Rs1, Rs2 + * KSLRA8.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with + * Q7 saturation for the left shift. The `.u` form performs additional rounding up operations for the + * right shift. + * + * **Description**:\n + * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means + * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be + * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`. + * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. For the `.u` form + * of the instruction, the right-shifted results are added a 1 to the most significant discarded bit + * position for rounding effect. After the shift, saturation, or rounding, the final results are written to + * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect + * this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[3:0] < 0) { + * sa = -Rs2[3:0]; + * sa = (sa == 8)? 7 : sa; + * if (`.u` form) { + * res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[7:0]; + * } else { + * Rd.B[x] = SE8(Rs1.B[x][7:sa]); + * } + * } else { + * sa = Rs2[2:0]; + * res[(7+sa):0] = Rs1.B[x] <<(logic) sa; + * if (res > (2^7)-1) { + * res[7:0] = 0x7f; OV = 1; + * } else if (res < -2^7) { + * res[7:0] = 0x80; OV = 1; + * } + * Rd.B[x] = res[7:0]; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLRA8_U(unsigned long a, int b) +{ + register unsigned long result; + __ASM volatile("kslra8.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.59.2. KSLRA8.u ===== */ + +/* ===== Inline Function Start for 3.60.1. KSLRA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief KSLRA16 (SIMD 16-bit Shift Left Logical with Saturation or Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSLRA16 Rd, Rs1, Rs2 + * KSLRA16.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with + * Q15 saturation for the left shift. The `.u` form performs additional rounding up operations for the + * right shift. + * + * **Description**:\n + * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means + * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be + * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`. + * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. For the `.u` + * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit + * position for rounding effect. After the shift, saturation, or rounding, the final results are written to + * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect + * this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[4:0] < 0) { + * sa = -Rs2[4:0]; + * sa = (sa == 16)? 15 : sa; + * if (`.u` form) { + * res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[15:0]; + * } else { + * Rd.H[x] = SE16(Rs1.H[x][15:sa]); + * } + * } else { + * sa = Rs2[3:0]; + * res[(15+sa):0] = Rs1.H[x] <<(logic) sa; + * if (res > (2^15)-1) { + * res[15:0] = 0x7fff; OV = 1; + * } else if (res < -2^15) { + * res[15:0] = 0x8000; OV = 1; + * } + * d.H[x] = res[15:0]; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLRA16(unsigned long a, int b) +{ + register unsigned long result; + __ASM volatile("kslra16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.60.1. KSLRA16 ===== */ + +/* ===== Inline Function Start for 3.60.2. KSLRA16.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief KSLRA16.u (SIMD 16-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSLRA16 Rd, Rs1, Rs2 + * KSLRA16.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with + * Q15 saturation for the left shift. The `.u` form performs additional rounding up operations for the + * right shift. + * + * **Description**:\n + * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means + * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be + * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`. + * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. For the `.u` + * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit + * position for rounding effect. After the shift, saturation, or rounding, the final results are written to + * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect + * this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[4:0] < 0) { + * sa = -Rs2[4:0]; + * sa = (sa == 16)? 15 : sa; + * if (`.u` form) { + * res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[15:0]; + * } else { + * Rd.H[x] = SE16(Rs1.H[x][15:sa]); + * } + * } else { + * sa = Rs2[3:0]; + * res[(15+sa):0] = Rs1.H[x] <<(logic) sa; + * if (res > (2^15)-1) { + * res[15:0] = 0x7fff; OV = 1; + * } else if (res < -2^15) { + * res[15:0] = 0x8000; OV = 1; + * } + * d.H[x] = res[15:0]; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLRA16_U(unsigned long a, int b) +{ + register unsigned long result; + __ASM volatile("kslra16.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.60.2. KSLRA16.u ===== */ + +/* ===== Inline Function Start for 3.61. KSLRAW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KSLRAW (Shift Left Logical with Q31 Saturation or Shift Right Arithmetic) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KSLRAW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Perform a logical left (positive) or arithmetic right (negative) shift operation with Q31 + * saturation for the left shift on a 32-bit data. + * + * **Description**:\n + * The lower 32-bit content of Rs1 is left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-25, 25-1]. A positive Rs2[5:0] means + * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[5:0] clamped to the actual shift range of [0, 31]. + * The left-shifted result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. After the shift + * operation, the final result is bit-31 sign-extended and written to Rd. If any saturation happens, this + * instruction sets the OV flag. The value of Rs2[31:6] will not affected the operation of this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[5:0] < 0) { + * sa = -Rs2[5:0]; + * sa = (sa == 32)? 31 : sa; + * res[31:0] = Rs1.W[0] >>(arith) sa; + * } else { + * sa = Rs2[5:0]; + * tmp = Rs1.W[0] <<(logic) sa; + * if (tmp > (2^31)-1) { + * res[31:0] = (2^31)-1; + * OV = 1; + * } else if (tmp < -2^31) { + * res[31:0] = -2^31; + * OV = 1 + * } else { + * res[31:0] = tmp[31:0]; + * } + * } + * Rd = res[31:0]; // RV32 + * Rd = SE64(res[31:0]); // RV64 + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KSLRAW(int a, int b) +{ + register long result; + __ASM volatile("kslraw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.61. KSLRAW ===== */ + +/* ===== Inline Function Start for 3.62. KSLRAW.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KSLRAW.u (Shift Left Logical with Q31 Saturation or Rounding Shift Right Arithmetic) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KSLRAW.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Perform a logical left (positive) or arithmetic right (negative) shift operation with Q31 + * saturation for the left shift and a rounding up operation for the right shift on a 32-bit data. + * + * **Description**:\n + * The lower 32-bit content of Rs1 is left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-25, 25-1]. A positive Rs2[5:0] means + * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[5:0] clamped to the actual shift range of [0, 31]. + * The left-shifted result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. The right-shifted + * result is added a 1 to the most significant discarded bit position for rounding effect. After the shift, + * saturation, or rounding, the final result is bit-31 sign-extended and written to Rd. If any saturation + * happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect the operation of this + * instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[5:0] < 0) { + * sa = -Rs2[5:0]; + * sa = (sa == 32)? 31 : sa; + * res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1; + * rst[31:0] = res[31:0]; + * } else { + * sa = Rs2[5:0]; + * tmp = Rs1.W[0] <<(logic) sa; + * if (tmp > (2^31)-1) { + * rst[31:0] = (2^31)-1; + * OV = 1; + * } else if (tmp < -2^31) { + * rst[31:0] = -2^31; + * OV = 1 + * } else { + * rst[31:0] = tmp[31:0]; + * } + * } + * Rd = rst[31:0]; // RV32 + * Rd = SE64(rst[31:0]); // RV64 + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KSLRAW_U(int a, int b) +{ + register long result; + __ASM volatile("kslraw.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.62. KSLRAW.u ===== */ + +/* ===== Inline Function Start for 3.63. KSTAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief KSTAS16 (SIMD 16-bit Signed Saturating Straight Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSTAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element saturating addition and 16-bit signed integer element + * saturating subtraction in a 32-bit chunk simultaneously. Operands are from corresponding + * positions in 32-bit chunks. + * + * **Description**:\n + * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in + * Rs1 with the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2; at the same time, it + * subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed + * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number + * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for + * subtraction. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16]; + * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0]; + * for (res in [res1, res2]) { + * if (res > (2^15)-1) { + * res = (2^15)-1; + * OV = 1; + * } else if (res < -2^15) { + * res = -2^15; + * OV = 1; + * } + * } + * Rd.W[x][31:16] = res1; + * Rd.W[x][15:0] = res2; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSTAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kstas16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.63. KSTAS16 ===== */ + +/* ===== Inline Function Start for 3.64. KSTSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief KSTSA16 (SIMD 16-bit Signed Saturating Straight Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSTSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element + * saturating addition in a 32-bit chunk simultaneously. Operands are from corresponding positions in + * 32-bit chunks. + * + * **Description**:\n + * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks + * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1; at the same time, it + * adds the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 with the 16-bit signed integer + * element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number range (-2^15 + * <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are + * written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd for + * addition. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16]; + * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0]; + * for (res in [res1, res2]) { + * if (res > (2^15)-1) { + * res = (2^15)-1; + * OV = 1; + * } else if (res < -2^15) { + * res = -2^15; + * OV = 1; + * } + * } + * Rd.W[x][31:16] = res1; + * Rd.W[x][15:0] = res2; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSTSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kstsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.64. KSTSA16 ===== */ + +/* ===== Inline Function Start for 3.65. KSUB8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB + * \brief KSUB8 (SIMD 8-bit Signed Saturating Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSUB8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed elements saturating subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit + * signed integer elements in Rs1. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 27 + * -1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.B[x] - Rs2.B[x]; + * if (res[x] > (2^7)-1) { + * res[x] = (2^7)-1; + * OV = 1; + * } else if (res[x] < -2^7) { + * res[x] = -2^7; + * OV = 1; + * } + * Rd.B[x] = res[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSUB8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ksub8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.65. KSUB8 ===== */ + +/* ===== Inline Function Start for 3.66. KSUB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief KSUB16 (SIMD 16-bit Signed Saturating Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KSUB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer elements saturating subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit + * signed integer elements in Rs1. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= + * 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.H[x] - Rs2.H[x]; + * if (res[x] > (2^15)-1) { + * res[x] = (2^15)-1; + * OV = 1; + * } else if (res[x] < -2^15) { + * res[x] = -2^15; + * OV = 1; + * } + * Rd.H[x] = res[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSUB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ksub16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.66. KSUB16 ===== */ + +/* ===== Inline Function Start for 3.67. KSUB64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief KSUB64 (64-bit Signed Saturating Subtraction) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * KSUB64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Perform a 64-bit signed integer subtraction. The result is saturated to the Q63 range. + * + * **RV32 Description**:\n + * This instruction subtracts the 64-bit signed integer of an even/odd pair of + * registers specified by Rs2(4,1) from the 64-bit signed integer of an even/odd pair of registers + * specified by Rs1(4,1). If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is + * saturated to the range and the OV bit is set to 1. The saturated result is then written to an even/odd + * pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * This instruction subtracts the 64-bit signed integer of Rs2 from the 64-bit signed + * integer of Rs1. If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated + * to the range and the OV bit is set to 1. The saturated result is then written to Rd. + * + * **Operations**:\n + * ~~~ + * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1); + * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1); + * result = R[a_H].R[a_L] - R[b_H].R[b_L]; + * if (result > (2^63)-1) { + * result = (2^63)-1; OV = 1; + * } else if (result < -2^63) { + * result = -2^63; OV = 1; + * } + * R[t_H].R[t_L] = result; + * RV64: + * result = Rs1 - Rs2; + * if (result > (2^63)-1) { + * result = (2^63)-1; OV = 1; + * } else if (result < -2^63) { + * result = -2^63; OV = 1; + * } + * Rd = result; + * ~~~ + * + * \param [in] a long long type of value stored in a + * \param [in] b long long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_KSUB64(long long a, long long b) +{ + register long long result; + __ASM volatile("ksub64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.67. KSUB64 ===== */ + +/* ===== Inline Function Start for 3.68. KSUBH ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU + * \brief KSUBH (Signed Subtraction with Q15 Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KSUBH Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Subtract the signed lower 32-bit content of two registers with Q15 saturation. + * + * **Description**:\n + * The signed lower 32-bit content of Rs2 is subtracted from the signed lower 32-bit + * content of Rs1. And the result is saturated to the 16-bit signed integer range of [-2^15, 2^15-1] and then + * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag. + * + * **Operations**:\n + * ~~~ + * tmp = Rs1.W[0] - Rs2.W[0]; + * if (tmp > (2^15)-1) { + * res = (2^15)-1; + * OV = 1; + * } else if (tmp < -2^15) { + * res = -2^15; + * OV = 1 + * } else { + * res = tmp; + * } + * Rd = SE(res[15:0]); + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KSUBH(int a, int b) +{ + register long result; + __ASM volatile("ksubh %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.68. KSUBH ===== */ + +/* ===== Inline Function Start for 3.69. KSUBW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief KSUBW (Signed Subtraction with Q31 Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * KSUBW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Subtract the signed lower 32-bit content of two registers with Q31 saturation. + * + * **Description**:\n + * The signed lower 32-bit content of Rs2 is subtracted from the signed lower 32-bit + * content of Rs1. And the result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1] and then + * sign-extened and written to Rd. If saturation happens, this instruction sets the OV flag. + * + * **Operations**:\n + * ~~~ + * tmp = Rs1.W[0] - Rs2.W[0]; + * if (tmp > (2^31)-1) { + * res = (2^31)-1; + * OV = 1; + * } else if (tmp < -2^31) { + * res = -2^31; + * OV = 1 + * } else { + * res = tmp; + * } + * Rd = res[31:0]; // RV32 + * Rd = SE(res[31:0]); // RV64 + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KSUBW(int a, int b) +{ + register long result; + __ASM volatile("ksubw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.69. KSUBW ===== */ + +/* ===== Inline Function Start for 3.70.1. KWMMUL ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC + * \brief KWMMUL (SIMD Saturating MSW Signed Multiply Word & Double) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KWMMUL Rd, Rs1, Rs2 + * KWMMUL.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of two registers, shift the results left 1-bit, + * saturate, and write the most significant 32-bit results to a register. The `.u` form additionally + * rounds up the multiplication results from the most signification discarded bit. + * + * **Description**:\n + * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts + * the multiplication results one bit to the left and takes the most significant 32-bit results. If the + * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element + * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The `.u` + * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit + * 30 before the shift and saturation operations. + * + * **Operations**:\n + * ~~~ + * if ((0x80000000 != Rs1.W[x]) | (0x80000000 != Rs2.W[x])) { + * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x]; + * if (`.u` form) { + * Round[x][33:0] = Mres[x][63:30] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][62:31]; + * } + * } else { + * Rd.W[x] = 0x7fffffff; + * OV = 1; + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KWMMUL(long a, long b) +{ + register long result; + __ASM volatile("kwmmul %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.70.1. KWMMUL ===== */ + +/* ===== Inline Function Start for 3.70.2. KWMMUL.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC + * \brief KWMMUL.u (SIMD Saturating MSW Signed Multiply Word & Double with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * KWMMUL Rd, Rs1, Rs2 + * KWMMUL.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of two registers, shift the results left 1-bit, + * saturate, and write the most significant 32-bit results to a register. The `.u` form additionally + * rounds up the multiplication results from the most signification discarded bit. + * + * **Description**:\n + * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts + * the multiplication results one bit to the left and takes the most significant 32-bit results. If the + * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element + * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The `.u` + * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit + * 30 before the shift and saturation operations. + * + * **Operations**:\n + * ~~~ + * if ((0x80000000 != Rs1.W[x]) | (0x80000000 != Rs2.W[x])) { + * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x]; + * if (`.u` form) { + * Round[x][33:0] = Mres[x][63:30] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][62:31]; + * } + * } else { + * Rd.W[x] = 0x7fffffff; + * OV = 1; + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KWMMUL_U(long a, long b) +{ + register long result; + __ASM volatile("kwmmul.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.70.2. KWMMUL.u ===== */ + +/* ===== Inline Function Start for 3.71. MADDR32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief MADDR32 (Multiply and Add to 32-Bit Word) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * MADDR32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit contents of two registers and add the lower 32-bit multiplication result + * to the 32-bit content of a destination register. Write the final result back to the destination register. + * + * **Description**:\n + * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2. It adds the + * lower 32-bit multiplication result to the lower 32-bit content of Rd and writes the final result (RV32) + * or sign-extended result (RV64) back to Rd. The contents of Rs1 and Rs2 can be either signed or + * unsigned integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * Mresult = Rs1 * Rs2; + * Rd = Rd + Mresult.W[0]; + * RV64: + * Mresult = Rs1.W[0] * Rs2.W[0]; + * tres[31:0] = Rd.W[0] + Mresult.W[0]; + * Rd = SE64(tres[31:0]); + * ~~~ + * + * \param [in] t unsigned long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_MADDR32(unsigned long t, unsigned long a, unsigned long b) +{ + __ASM volatile("maddr32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.71. MADDR32 ===== */ + +/* ===== Inline Function Start for 3.72. MAXW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION + * \brief MAXW (32-bit Signed Word Maximum) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * MAXW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Get the larger value from the 32-bit contents of two general registers. + * + * **Description**:\n + * This instruction compares two signed 32-bit integers stored in Rs1 and Rs2, picks the + * larger value as the result, and writes the result to Rd. + * + * **Operations**:\n + * ~~~ + * if (Rs1.W[0] >= Rs2.W[0]) { + * Rd = SE(Rs1.W[0]); + * } else { + * Rd = SE(Rs2.W[0]); + * } + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_MAXW(int a, int b) +{ + register long result; + __ASM volatile("maxw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.72. MAXW ===== */ + +/* ===== Inline Function Start for 3.73. MINW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION + * \brief MINW (32-bit Signed Word Minimum) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * MINW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Get the smaller value from the 32-bit contents of two general registers. + * + * **Description**:\n + * This instruction compares two signed 32-bit integers stored in Rs1 and Rs2, picks the + * smaller value as the result, and writes the result to Rd. + * + * **Operations**:\n + * ~~~ + * if (Rs1.W[0] >= Rs2.W[0]) { Rd = SE(Rs2.W[0]); } else { Rd = SE(Rs1.W[0]); } + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_MINW(int a, int b) +{ + register long result; + __ASM volatile("minw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.73. MINW ===== */ + +/* ===== Inline Function Start for 3.74. MSUBR32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief MSUBR32 (Multiply and Subtract from 32-Bit Word) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * MSUBR32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit contents of two registers and subtract the lower 32-bit multiplication + * result from the 32-bit content of a destination register. Write the final result back to the destination + * register. + * + * **Description**:\n + * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2, subtracts + * the lower 32-bit multiplication result from the lower 32-bit content of Rd, then writes the final + * result (RV32) or sign-extended result (RV64) back to Rd. The contents of Rs1 and Rs2 can be either + * signed or unsigned integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * Mresult = Rs1 * Rs2; + * Rd = Rd - Mresult.W[0]; + * RV64: + * Mresult = Rs1.W[0] * Rs2.W[0]; + * tres[31:0] = Rd.W[0] - Mresult.W[0]; + * Rd = SE64(tres[31:0]); + * ~~~ + * + * \param [in] t unsigned long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_MSUBR32(unsigned long t, unsigned long a, unsigned long b) +{ + __ASM volatile("msubr32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.74. MSUBR32 ===== */ + +/* ===== Inline Function Start for 3.75. MULR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION + * \brief MULR64 (Multiply Word Unsigned to 64-bit Data) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * MULR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit unsigned integer contents of two registers and write the 64-bit result. + * + * **RV32 Description**:\n + * This instruction multiplies the 32-bit content of Rs1 with that of Rs2 and writes the 64-bit + * multiplication result to an even/odd pair of registers containing Rd. Rd(4,1) index d determines the + * even/odd pair group of the two registers. Specifically, the register pair includes register 2d and + * 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * The lower 32-bit contents of Rs1 and Rs2 are treated as unsigned integers. + * + * **RV64 Description**:\n + * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2 and writes the 64-bit + * multiplication result to Rd. + * The lower 32-bit contents of Rs1 and Rs2 are treated as unsigned integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * Mresult = CONCAT(1`b0,Rs1) u* CONCAT(1`b0,Rs2); + * R[Rd(4,1).1(0)][31:0] = Mresult[63:32]; + * R[Rd(4,1).0(0)][31:0] = Mresult[31:0]; + * RV64: + * Rd = Mresult[63:0]; + * Mresult = CONCAT(1`b0,Rs1.W[0]) u* CONCAT(1`b0,Rs2.W[0]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_MULR64(unsigned long a, unsigned long b) +{ + register unsigned long long result; + __ASM volatile("mulr64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.75. MULR64 ===== */ + +/* ===== Inline Function Start for 3.76. MULSR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION + * \brief MULSR64 (Multiply Word Signed to 64-bit Data) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * MULSR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit signed integer contents of two registers and write the 64-bit result. + * + * **RV32 Description**:\n + * This instruction multiplies the lower 32-bit content of Rs1 with the lower 32-bit content of Rs2 and + * writes the 64-bit multiplication result to an even/odd pair of registers containing Rd. Rd(4,1) index d + * determines the even/odd pair group of the two registers. Specifically, the register pair includes + * register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * The lower 32-bit contents of Rs1 and Rs2 are treated as signed integers. + * + * **RV64 Description**:\n + * This instruction multiplies the lower 32-bit content of Rs1 with the lower 32-bit content of Rs2 and + * writes the 64-bit multiplication result to Rd. + * The lower 32-bit contents of Rs1 and Rs2 are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * Mresult = Ra s* Rb; + * R[Rd(4,1).1(0)][31:0] = Mresult[63:32]; + * R[Rd(4,1).0(0)][31:0] = Mresult[31:0]; + * RV64: + * Mresult = Ra.W[0] s* Rb.W[0]; + * Rd = Mresult[63:0]; + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_MULSR64(long a, long b) +{ + register long long result; + __ASM volatile("mulsr64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.76. MULSR64 ===== */ + +/* ===== Inline Function Start for 3.77. PBSAD ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC + * \brief PBSAD (Parallel Byte Sum of Absolute Difference) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * PBSAD Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Calculate the sum of absolute difference of unsigned 8-bit data elements. + * + * **Description**:\n + * This instruction subtracts the un-signed 8-bit elements of Rs2 from those of Rs1. Then + * it adds the absolute value of each difference together and writes the result to Rd. + * + * **Operations**:\n + * ~~~ + * absdiff[x] = ABS(Rs1.B[x] - Rs2.B[x]); + * Rd = SUM(absdiff[x]); + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PBSAD(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pbsad %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.77. PBSAD ===== */ + +/* ===== Inline Function Start for 3.78. PBSADA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC + * \brief PBSADA (Parallel Byte Sum of Absolute Difference Accum) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * PBSADA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Calculate the sum of absolute difference of four unsigned 8-bit data elements and + * accumulate it into a register. + * + * **Description**:\n + * This instruction subtracts the un-signed 8-bit elements of Rs2 from those of Rs1. It + * then adds the absolute value of each difference together along with the content of Rd and writes the + * accumulated result back to Rd. + * + * **Operations**:\n + * ~~~ + * absdiff[x] = ABS(Rs1.B[x] - Rs2.B[x]); + * Rd = Rd + SUM(absdiff[x]); + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] t unsigned long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PBSADA(unsigned long t, unsigned long a, unsigned long b) +{ + __ASM volatile("pbsada %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.78. PBSADA ===== */ + +/* ===== Inline Function Start for 3.79.1. PKBB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK + * \brief PKBB16 (Pack Two 16-bit Data from Both Bottom Half) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * PKBB16 Rd, Rs1, Rs2 + * PKBT16 Rd, Rs1, Rs2 + * PKTT16 Rd, Rs1, Rs2 + * PKTB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Pack 16-bit data from 32-bit chunks in two registers. + * * PKBB16: bottom.bottom + * * PKBT16 bottom.top + * * PKTT16 top.top + * * PKTB16 top.bottom + * + * **Description**:\n + * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to + * Rd.W[x] [15:0]. + * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0]. + * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0]. + * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0]. + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16 + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PKBB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pkbb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.79.1. PKBB16 ===== */ + +/* ===== Inline Function Start for 3.79.2. PKBT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK + * \brief PKBT16 (Pack Two 16-bit Data from Bottom and Top Half) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * PKBB16 Rd, Rs1, Rs2 + * PKBT16 Rd, Rs1, Rs2 + * PKTT16 Rd, Rs1, Rs2 + * PKTB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Pack 16-bit data from 32-bit chunks in two registers. + * * PKBB16: bottom.bottom + * * PKBT16 bottom.top + * * PKTT16 top.top + * * PKTB16 top.bottom + * + * **Description**:\n + * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to + * Rd.W[x] [15:0]. + * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0]. + * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0]. + * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0]. + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16 + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PKBT16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pkbt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.79.2. PKBT16 ===== */ + +/* ===== Inline Function Start for 3.79.3. PKTT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK + * \brief PKTT16 (Pack Two 16-bit Data from Both Top Half) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * PKBB16 Rd, Rs1, Rs2 + * PKBT16 Rd, Rs1, Rs2 + * PKTT16 Rd, Rs1, Rs2 + * PKTB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Pack 16-bit data from 32-bit chunks in two registers. + * * PKBB16: bottom.bottom + * * PKBT16 bottom.top + * * PKTT16 top.top + * * PKTB16 top.bottom + * + * **Description**:\n + * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to + * Rd.W[x] [15:0]. + * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0]. + * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0]. + * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0]. + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16 + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PKTT16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pktt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.79.3. PKTT16 ===== */ + +/* ===== Inline Function Start for 3.79.4. PKTB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK + * \brief PKTB16 (Pack Two 16-bit Data from Top and Bottom Half) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * PKBB16 Rd, Rs1, Rs2 + * PKBT16 Rd, Rs1, Rs2 + * PKTT16 Rd, Rs1, Rs2 + * PKTB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Pack 16-bit data from 32-bit chunks in two registers. + * * PKBB16: bottom.bottom + * * PKBT16 bottom.top + * * PKTT16 top.top + * * PKTB16 top.bottom + * + * **Description**:\n + * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to + * Rd.W[x] [15:0]. + * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0]. + * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0]. + * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0]. + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16 + * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16 + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PKTB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pktb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.79.4. PKTB16 ===== */ + +/* ===== Inline Function Start for 3.80. RADD8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB + * \brief RADD8 (SIMD 8-bit Signed Halving Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * RADD8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed integer element additions simultaneously. The element results are halved + * to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed + * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to + * Rd. + * + * **Examples**:\n + * ~~~ + * * Rs1 = 0x7F, Rs2 = 0x7F, Rd = 0x7F + * * Rs1 = 0x80, Rs2 = 0x80, Rd = 0x80 + * * Rs1 = 0x40, Rs2 = 0x80, Rd = 0xE0 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] + Rs2.B[x]) s>> 1; for RV32: x=3...0, for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RADD8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("radd8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.80. RADD8 ===== */ + +/* ===== Inline Function Start for 3.81. RADD16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief RADD16 (SIMD 16-bit Signed Halving Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * RADD16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element additions simultaneously. The results are halved to avoid + * overflow or saturation. + * + * **Description**:\n + * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed + * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to + * Rd. + * + * **Examples**:\n + * ~~~ + * * Rs1 = 0x7FFF, Rs2 = 0x7FFF, Rd = 0x7FFF + * * Rs1 = 0x8000, Rs2 = 0x8000, Rd = 0x8000 + * * Rs1 = 0x4000, Rs2 = 0x8000, Rd = 0xE000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] + Rs2.H[x]) s>> 1; for RV32: x=1...0, for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RADD16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("radd16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.81. RADD16 ===== */ + +/* ===== Inline Function Start for 3.82. RADD64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief RADD64 (64-bit Signed Halving Addition) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * RADD64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add two 64-bit signed integers. The result is halved to avoid overflow or saturation. + * + * **RV32 Description**:\n + * This instruction adds the 64-bit signed integer of an even/odd pair of registers + * specified by Rs1(4,1) with the 64-bit signed integer of an even/odd pair of registers specified by + * Rs2(4,1). The 64-bit addition result is first arithmetically right-shifted by 1 bit and then written to an + * even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register + * pair includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction adds the 64-bit signed integer in Rs1 with the 64-bit signed + * integer in Rs2. The 64-bit addition result is first arithmetically right-shifted by 1 bit and then + * written to Rd. + * + * **Operations**:\n + * ~~~ + * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1); + * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1); + * R[t_H].R[t_L] = (R[a_H].R[a_L] + R[b_H].R[b_L]) s>> 1; + * RV64: + * Rd = (Rs1 + Rs2) s>> 1; + * ~~~ + * + * \param [in] a long long type of value stored in a + * \param [in] b long long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_RADD64(long long a, long long b) +{ + register long long result; + __ASM volatile("radd64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.82. RADD64 ===== */ + +/* ===== Inline Function Start for 3.83. RADDW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION + * \brief RADDW (32-bit Signed Halving Addition) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * RADDW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add 32-bit signed integers and the results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the first 32-bit signed integer in Rs1 with the first 32-bit signed + * integer in Rs2. The result is first arithmetically right-shifted by 1 bit and then sign-extended and + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF, Rd = 0x7FFFFFFF + * * Rs1 = 0x80000000, Rs2 = 0x80000000, Rd = 0x80000000 + * * Rs1 = 0x40000000, Rs2 = 0x80000000, Rd = 0xE0000000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * RV32: + * Rd[31:0] = (Rs1[31:0] + Rs2[31:0]) s>> 1; + * RV64: + * resw[31:0] = (Rs1[31:0] + Rs2[31:0]) s>> 1; + * Rd[63:0] = SE(resw[31:0]); + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_RADDW(int a, int b) +{ + register long result; + __ASM volatile("raddw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.83. RADDW ===== */ + +/* ===== Inline Function Start for 3.84. RCRAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief RCRAS16 (SIMD 16-bit Signed Halving Cross Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * RCRAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in + * a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. The results + * are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in + * Rs1 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit + * signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed integer element in + * [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and + * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd. + * + * **Examples**:\n + * ~~~ + * Please see `RADD16` and `RSUB16` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) s>> 1; + * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) s>> 1; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RCRAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rcras16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.84. RCRAS16 ===== */ + +/* ===== Inline Function Start for 3.85. RCRSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief RCRSA16 (SIMD 16-bit Signed Halving Cross Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * RCRSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in + * a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. The results + * are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks + * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit + * signed element integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit signed integer element in + * [31:16] of 32-bit chunks in Rs2. The two results are first arithmetically right-shifted by 1 bit and + * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd. + * + * **Examples**:\n + * ~~~ + * Please see `RADD16` and `RSUB16` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) s>> 1; + * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) s>> 1; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RCRSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rcrsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.85. RCRSA16 ===== */ + +/* ===== Inline Function Start for 3.86. RDOV ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC + * \brief RDOV (Read OV flag) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * RDOV Rd # pseudo mnemonic + * ~~~ + * + * **Purpose**:\n + * This pseudo instruction is an alias to `CSRR Rd, ucode` instruction which maps to the real + * instruction of `CSRRS Rd, ucode, x0`. + * + * + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RDOV(void) +{ + register unsigned long result; + __ASM volatile("rdov %0" : "=r"(result)); + return result; +} +/* ===== Inline Function End for 3.86. RDOV ===== */ + +/* ===== Inline Function Start for 3.87. RSTAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief RSTAS16 (SIMD 16-bit Signed Halving Straight Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * RSTAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in + * a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. The + * results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in + * Rs1 with the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2, and subtracts the 16-bit + * signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed integer element in + * [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and + * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd. + * + * **Examples**:\n + * ~~~ + * Please see `RADD16` and `RSUB16` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][31:16]) s>> 1; + * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][15:0]) s>> 1; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RSTAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rstas16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.87. RSTAS16 ===== */ + +/* ===== Inline Function Start for 3.88. RSTSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief RSTSA16 (SIMD 16-bit Signed Halving Straight Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * RSTSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in + * a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. The + * results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks + * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit + * signed element integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit signed integer element in + * [15:0] of 32-bit chunks in Rs2. The two results are first arithmetically right-shifted by 1 bit and then + * written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd. + * + * **Examples**:\n + * ~~~ + * Please see `RADD16` and `RSUB16` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][31:16]) s>> 1; + * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][15:0]) s>> 1; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RSTSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rstsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.88. RSTSA16 ===== */ + +/* ===== Inline Function Start for 3.89. RSUB8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB + * \brief RSUB8 (SIMD 8-bit Signed Halving Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * RSUB8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed integer element subtractions simultaneously. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit + * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Rs1 = 0x7F, Rs2 = 0x80, Rd = 0x7F + * * Rs1 = 0x80, Rs2 = 0x7F, Rd = 0x80 + * * Rs1= 0x80, Rs2 = 0x40, Rd = 0xA0 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] - Rs2.B[x]) s>> 1; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RSUB8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rsub8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.89. RSUB8 ===== */ + +/* ===== Inline Function Start for 3.90. RSUB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief RSUB16 (SIMD 16-bit Signed Halving Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * RSUB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element subtractions simultaneously. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit + * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Ra = 0x7FFF, Rb = 0x8000, Rt = 0x7FFF + * * Ra = 0x8000, Rb = 0x7FFF, Rt = 0x8000 + * * Ra = 0x8000, Rb = 0x4000, Rt = 0xA000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) s>> 1; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RSUB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rsub16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.90. RSUB16 ===== */ + +/* ===== Inline Function Start for 3.91. RSUB64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief RSUB64 (64-bit Signed Halving Subtraction) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * RSUB64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Perform a 64-bit signed integer subtraction. The result is halved to avoid overflow or + * saturation. + * + * **RV32 Description**:\n + * This instruction subtracts the 64-bit signed integer of an even/odd pair of + * registers specified by Rb(4,1) from the 64-bit signed integer of an even/odd pair of registers + * specified by Ra(4,1). The subtraction result is first arithmetically right-shifted by 1 bit and then + * written to an even/odd pair of registers specified by Rt(4,1). + * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register + * pair includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction subtracts the 64-bit signed integer in Rs2 from the 64-bit signed + * integer in Rs1. The 64-bit subtraction result is first arithmetically right-shifted by 1 bit and then + * written to Rd. + * + * **Operations**:\n + * ~~~ + * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1); + * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1); + * R[t_H].R[t_L] = (R[a_H].R[a_L] - R[b_H].R[b_L]) s>> 1; + * RV64: + * Rd = (Rs1 - Rs2) s>> 1; + * ~~~ + * + * \param [in] a long long type of value stored in a + * \param [in] b long long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_RSUB64(long long a, long long b) +{ + register long long result; + __ASM volatile("rsub64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.91. RSUB64 ===== */ + +/* ===== Inline Function Start for 3.92. RSUBW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION + * \brief RSUBW (32-bit Signed Halving Subtraction) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * RSUBW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Subtract 32-bit signed integers and the result is halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the first 32-bit signed integer in Rs2 from the first 32-bit + * signed integer in Rs1. The result is first arithmetically right-shifted by 1 bit and then sign-extended + * and written to Rd. + * + * **Examples**:\n + * ~~~ + * * Rs1 = 0x7FFFFFFF, Rs2 = 0x80000000, Rd = 0x7FFFFFFF + * * Rs1 = 0x80000000, Rs2 = 0x7FFFFFFF, Rd = 0x80000000 + * * Rs1 = 0x80000000, Rs2 = 0x40000000, Rd = 0xA0000000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * RV32: + * Rd[31:0] = (Rs1[31:0] - Rs2[31:0]) s>> 1; + * RV64: + * resw[31:0] = (Rs1[31:0] - Rs2[31:0]) s>> 1; + * Rd[63:0] = SE(resw[31:0]); + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_RSUBW(int a, int b) +{ + register long result; + __ASM volatile("rsubw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.92. RSUBW ===== */ + +/* ===== Inline Function Start for 3.93. SCLIP8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief SCLIP8 (SIMD 8-bit Signed Clip Value) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SCLIP8 Rd, Rs1, imm3u[2:0] + * ~~~ + * + * **Purpose**:\n + * Limit the 8-bit signed integer elements of a register into a signed range simultaneously. + * + * **Description**:\n + * This instruction limits the 8-bit signed integer elements stored in Rs1 into a signed + * integer range between 2^imm3u-1 and -2^imm3u, and writes the limited results to Rd. For example, if + * imm3u is 3, the 8-bit input values should be saturated between 7 and -8. If saturation is performed, + * set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.B[x]; + * if (src > (2^imm3u)-1) { + * src = (2^imm3u)-1; + * OV = 1; + * } else if (src < -2^imm3u) { + * src = -2^imm3u; + * OV = 1; + * } + * Rd.B[x] = src + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SCLIP8(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("sclip8 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.93. SCLIP8 ===== */ + +/* ===== Inline Function Start for 3.94. SCLIP16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief SCLIP16 (SIMD 16-bit Signed Clip Value) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SCLIP16 Rd, Rs1, imm4u[3:0] + * ~~~ + * + * **Purpose**:\n + * Limit the 16-bit signed integer elements of a register into a signed range simultaneously. + * + * **Description**:\n + * This instruction limits the 16-bit signed integer elements stored in Rs1 into a signed + * integer range between 2imm4u-1 and -2imm4u, and writes the limited results to Rd. For example, if + * imm4u is 3, the 16-bit input values should be saturated between 7 and -8. If saturation is performed, + * set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.H[x]; + * if (src > (2^imm4u)-1) { + * src = (2^imm4u)-1; + * OV = 1; + * } else if (src < -2^imm4u) { + * src = -2^imm4u; + * OV = 1; + * } + * Rd.H[x] = src + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SCLIP16(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("sclip16 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.94. SCLIP16 ===== */ + +/* ===== Inline Function Start for 3.95. SCLIP32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC + * \brief SCLIP32 (SIMD 32-bit Signed Clip Value) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SCLIP32 Rd, Rs1, imm5u[4:0] + * ~~~ + * + * **Purpose**:\n + * Limit the 32-bit signed integer elements of a register into a signed range simultaneously. + * + * **Description**:\n + * This instruction limits the 32-bit signed integer elements stored in Rs1 into a signed + * integer range between 2imm5u-1 and -2imm5u, and writes the limited results to Rd. For example, if + * imm5u is 3, the 32-bit input values should be saturated between 7 and -8. If saturation is performed, + * set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.W[x]; + * if (src > (2^imm5u)-1) { + * src = (2^imm5u)-1; + * OV = 1; + * } else if (src < -2^imm5u) { + * src = -2^imm5u; + * OV = 1; + * } + * Rd.W[x] = src + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +#define __RV_SCLIP32(a, b) \ + ({ \ + register long result; \ + register long __a = (long)(a); \ + __ASM volatile("sclip32 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.95. SCLIP32 ===== */ + +/* ===== Inline Function Start for 3.96. SCMPLE8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP + * \brief SCMPLE8 (SIMD 8-bit Signed Compare Less Than & Equal) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SCMPLE8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed integer elements less than & equal comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit + * signed integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it is + * true, the result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to + * Rd + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] {le} Rs2.B[x])? 0xff : 0x0; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SCMPLE8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("scmple8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.96. SCMPLE8 ===== */ + +/* ===== Inline Function Start for 3.97. SCMPLE16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP + * \brief SCMPLE16 (SIMD 16-bit Signed Compare Less Than & Equal) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SCMPLE16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer elements less than & equal comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit + * signed integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it is + * true, the result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written + * to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] {le} Rs2.H[x])? 0xffff : 0x0; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SCMPLE16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("scmple16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.97. SCMPLE16 ===== */ + +/* ===== Inline Function Start for 3.98. SCMPLT8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP + * \brief SCMPLT8 (SIMD 8-bit Signed Compare Less Than) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SCMPLT8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed integer elements less than comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit + * signed integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the + * result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] < Rs2.B[x])? 0xff : 0x0; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SCMPLT8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("scmplt8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.98. SCMPLT8 ===== */ + +/* ===== Inline Function Start for 3.99. SCMPLT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP + * \brief SCMPLT16 (SIMD 16-bit Signed Compare Less Than) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SCMPLT16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer elements less than comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 16-bit signed integer elements in Rs1 with the two 16- + * bit signed integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the + * result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] < Rs2.H[x])? 0xffff : 0x0; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SCMPLT16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("scmplt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.99. SCMPLT16 ===== */ + +/* ===== Inline Function Start for 3.100. SLL8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SLL8 (SIMD 8-bit Shift Left Logical) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SLL8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical left shift operations simultaneously. The shift amount is a + * variable from a GPR. + * + * **Description**:\n + * The 8-bit elements in Rs1 are left-shifted logically. And the results are written to Rd. + * The shifted out bits are filled with zero and the shift amount is specified by the low-order 3-bits of + * the value in the Rs2 register. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[2:0]; + * Rd.B[x] = Rs1.B[x] << sa; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SLL8(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("sll8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.100. SLL8 ===== */ + +/* ===== Inline Function Start for 3.101. SLLI8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SLLI8 (SIMD 8-bit Shift Left Logical Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SLLI8 Rd, Rs1, imm3u + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical left shift operations simultaneously. The shift amount is an + * immediate value. + * + * **Description**:\n + * The 8-bit elements in Rs1 are left-shifted logically. And the results are written to Rd. + * The shifted out bits are filled with zero and the shift amount is specified by the imm3u constant. + * + * **Operations**:\n + * ~~~ + * sa = imm3u[2:0]; + * Rd.B[x] = Rs1.B[x] << sa; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SLLI8(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("slli8 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.101. SLLI8 ===== */ + +/* ===== Inline Function Start for 3.102. SLL16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SLL16 (SIMD 16-bit Shift Left Logical) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SLL16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical left shift operations simultaneously. The shift amount is a + * variable from a GPR. + * + * **Description**:\n + * The 16-bit elements in Rs1 are left-shifted logically. And the results are written to Rd. + * The shifted out bits are filled with zero and the shift amount is specified by the low-order 4-bits of + * the value in the Rs2 register. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[3:0]; + * Rd.H[x] = Rs1.H[x] << sa; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SLL16(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("sll16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.102. SLL16 ===== */ + +/* ===== Inline Function Start for 3.103. SLLI16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SLLI16 (SIMD 16-bit Shift Left Logical Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SLLI16 Rd, Rs1, imm4[3:0] + * ~~~ + * + * **Purpose**:\n + * Do 16-bit element logical left shift operations simultaneously. The shift amount is an + * immediate value. + * + * **Description**:\n + * The 16-bit elements in Rs1 are left-shifted logically. The shifted out bits are filled with + * zero and the shift amount is specified by the imm4[3:0] constant. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm4[3:0]; + * Rd.H[x] = Rs1.H[x] << sa; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SLLI16(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("slli16 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.103. SLLI16 ===== */ + +/* ===== Inline Function Start for 3.104. SMAL ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMAL (Signed Multiply Halfs & Add 64-bit) + * \details + * **Type**: Partial-SIMD + * + * **Syntax**:\n + * ~~~ + * SMAL Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed bottom 16-bit content of the 32-bit elements of a register with the top + * 16-bit content of the same 32-bit elements of the same register, and add the results with a 64-bit + * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back + * to another even/odd pair of registers (RV32) or a register (RV64). + * + * **RV32 Description**:\n + * This instruction multiplies the bottom 16-bit content of the lower 32-bit of Rs2 with the top 16-bit + * content of the lower 32-bit of Rs2 and adds the result with the 64-bit value of an even/odd pair of + * registers specified by Rs1(4,1). The 64-bit addition result is written back to an even/odd pair of + * registers specified by Rd(4,1). The 16-bit values of Rs2, and the 64-bit value of the Rs1(4,1) register- + * pair are treated as signed integers. + * Rx(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * This instruction multiplies the bottom 16-bit content of the 32-bit elements of Rs2 with the top 16-bit + * content of the same 32-bit elements of Rs2 and adds the results with the 64-bit value of Rs1. The 64- + * bit addition result is written back to Rd. The 16-bit values of Rs2, and the 64-bit value of Rs1 are + * treated as signed integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * Mres[31:0] = Rs2.H[1] * Rs2.H[0]; + * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1); + + * Idx2 = CONCAT(Rd(4,1),1'b0); Idx3 = CONCAT(Rd(4,1),1'b1); + * R[Idx3].R[Idx2] = R[Idx1].R[Idx0] + SE64(Mres[31:0]); + * RV64: + * Mres[0][31:0] = Rs2.W[0].H[1] * Rs2.W[0].H[0]; + * Mres[1][31:0] = Rs2.W[1].H[1] * Rs2.W[1].H[0]; + * Rd = Rs1 + SE64(Mres[1][31:0]) + SE64(Mres[0][31:0]); + * ~~~ + * + * \param [in] a long long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMAL(long long a, unsigned long b) +{ + register long long result; + __ASM volatile("smal %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.104. SMAL ===== */ + +/* ===== Inline Function Start for 3.105.1. SMALBB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMALBB (Signed Multiply Bottom Halfs & Add 64-bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMALBB Rd, Rs1, Rs2 + * SMALBT Rd, Rs1, Rs2 + * SMALTT Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit + * content of the corresponding 32-bit elements of another register and add the results with a 64-bit + * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back + * to the register-pair (RV32) or the register (RV64). + * * SMALBB: rt pair + bottom*bottom (all 32-bit elements) + * * SMALBT rt pair + bottom*top (all 32-bit elements) + * * SMALTT rt pair + top*top (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2. + * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit + * content of Rs2. + * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content + * of Rs2. + * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by + * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and + * Rs2, and the 64-bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2. + * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2. + * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2. + * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written + * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB + * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT + * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]); + * RV64: + * // SMALBB + * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0]; + * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0]; + * // SMALBT + * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1]; + * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1]; + * // SMALTT + * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1]; + * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1]; + * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMALBB(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smalbb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.105.1. SMALBB ===== */ + +/* ===== Inline Function Start for 3.105.2. SMALBT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMALBT (Signed Multiply Bottom Half & Top Half & Add 64-bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMALBB Rd, Rs1, Rs2 + * SMALBT Rd, Rs1, Rs2 + * SMALTT Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit + * content of the corresponding 32-bit elements of another register and add the results with a 64-bit + * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back + * to the register-pair (RV32) or the register (RV64). + * * SMALBB: rt pair + bottom*bottom (all 32-bit elements) + * * SMALBT rt pair + bottom*top (all 32-bit elements) + * * SMALTT rt pair + top*top (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2. + * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit + * content of Rs2. + * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content + * of Rs2. + * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by + * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and + * Rs2, and the 64-bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2. + * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2. + * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2. + * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written + * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB + * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT + * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]); + * RV64: + * // SMALBB + * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0]; + * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0]; + * // SMALBT + * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1]; + * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1]; + * // SMALTT + * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1]; + * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1]; + * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMALBT(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smalbt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.105.2. SMALBT ===== */ + +/* ===== Inline Function Start for 3.105.3. SMALTT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMALTT (Signed Multiply Top Halfs & Add 64-bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMALBB Rd, Rs1, Rs2 + * SMALBT Rd, Rs1, Rs2 + * SMALTT Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit + * content of the corresponding 32-bit elements of another register and add the results with a 64-bit + * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back + * to the register-pair (RV32) or the register (RV64). + * * SMALBB: rt pair + bottom*bottom (all 32-bit elements) + * * SMALBT rt pair + bottom*top (all 32-bit elements) + * * SMALTT rt pair + top*top (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2. + * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit + * content of Rs2. + * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content + * of Rs2. + * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by + * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and + * Rs2, and the 64-bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2. + * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2. + * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2. + * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written + * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB + * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT + * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]); + * RV64: + * // SMALBB + * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0]; + * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0]; + * // SMALBT + * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1]; + * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1]; + * // SMALTT + * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1]; + * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1]; + * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMALTT(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smaltt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.105.3. SMALTT ===== */ + +/* ===== Inline Function Start for 3.106.1. SMALDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMALDA (Signed Multiply Two Halfs and Two Adds 64-bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMALDA Rd, Rs1, Rs2 + * SMALXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * adds the two 32-bit results and the 64-bit value of an even/odd pair of registers together. + * * SMALDA: rt pair+ top*top + bottom*bottom (all 32-bit elements) + * * SMALXDA: rt pair+ top*bottom + bottom*top (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with + * the top 16-bit content of Rs2 with unlimited precision. + * For the `SMALXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1 + * with the top 16-bit content of Rs2 with unlimited precision. + * The result is added to the 64-bit value of an even/odd pair of registers specified by Rd(4,1). The 64- + * bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 64- + * bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of + * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32- + * bit elements of Rs2 with unlimited precision. + * For the `SMALXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of + * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the + * 32-bit elements of Rs2 with unlimited precision. + * The results are added to the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The + * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * // SMALDA + * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]); + * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]); + * // SMALXDA + * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]); + * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]); + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres0[31:0]) + SE64(Mres1[31:0]); + * RV64: + * // SMALDA + * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]); + * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]); + * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]); + * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]); + * // SMALXDA + * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]); + * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]); + * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]); + * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]); + * Rd = Rd + SE64(Mres0[0][31:0]) + SE64(Mres1[0][31:0]) + SE64(Mres0[1][31:0]) + + * SE64(Mres1[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMALDA(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smalda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.106.1. SMALDA ===== */ + +/* ===== Inline Function Start for 3.106.2. SMALXDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMALXDA (Signed Crossed Multiply Two Halfs and Two Adds 64-bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMALDA Rd, Rs1, Rs2 + * SMALXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * adds the two 32-bit results and the 64-bit value of an even/odd pair of registers together. + * * SMALDA: rt pair+ top*top + bottom*bottom (all 32-bit elements) + * * SMALXDA: rt pair+ top*bottom + bottom*top (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with + * the top 16-bit content of Rs2 with unlimited precision. + * For the `SMALXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1 + * with the top 16-bit content of Rs2 with unlimited precision. + * The result is added to the 64-bit value of an even/odd pair of registers specified by Rd(4,1). The 64- + * bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 64- + * bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of + * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32- + * bit elements of Rs2 with unlimited precision. + * For the `SMALXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of + * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the + * 32-bit elements of Rs2 with unlimited precision. + * The results are added to the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The + * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * RV32: + * // SMALDA + * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]); + * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]); + * // SMALXDA + * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]); + * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]); + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres0[31:0]) + SE64(Mres1[31:0]); + * RV64: + * // SMALDA + * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]); + * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]); + * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]); + * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]); + * // SMALXDA + * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]); + * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]); + * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]); + * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]); + * Rd = Rd + SE64(Mres0[0][31:0]) + SE64(Mres1[0][31:0]) + SE64(Mres0[1][31:0]) + + * SE64(Mres1[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMALXDA(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smalxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.106.2. SMALXDA ===== */ + +/* ===== Inline Function Start for 3.107.1. SMALDS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMALDS (Signed Multiply Two Halfs & Subtract & Add 64-bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMALDS Rd, Rs1, Rs2 + * SMALDRS Rd, Rs1, Rs2 + * SMALXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to + * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is + * written back to the register-pair. + * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements) + * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements) + * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of + * Rs1 with the top 16-bit content of Rs2. + * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content + * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1 + * with the bottom 16-bit content of Rs2. + * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit + * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of + * Rs1 with the bottom 16-bit content of Rs2. + * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by + * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and + * Rs2, and the 64-bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content + * of the 32-bit elements of Rs2. + * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of + * the 32-bit elements of Rs2. + * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit + * content of the 32-bit elements of Rs2. + * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written + * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * * RV32: + * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS + * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS + * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]); + * * RV64: + * // SMALDS + * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]); + * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[0].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]); + * // SMALDRS + * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]); + * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[0].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]); + * // SMALXDS + * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]); + * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[0].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]); + * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMALDS(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smalds %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.107.1. SMALDS ===== */ + +/* ===== Inline Function Start for 3.107.2. SMALDRS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMALDRS (Signed Multiply Two Halfs & Reverse Subtract & Add 64- bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMALDS Rd, Rs1, Rs2 + * SMALDRS Rd, Rs1, Rs2 + * SMALXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to + * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is + * written back to the register-pair. + * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements) + * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements) + * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of + * Rs1 with the top 16-bit content of Rs2. + * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content + * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1 + * with the bottom 16-bit content of Rs2. + * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit + * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of + * Rs1 with the bottom 16-bit content of Rs2. + * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by + * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and + * Rs2, and the 64-bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content + * of the 32-bit elements of Rs2. + * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of + * the 32-bit elements of Rs2. + * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit + * content of the 32-bit elements of Rs2. + * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written + * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * * RV32: + * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS + * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS + * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]); + * * RV64: + * // SMALDS + * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]); + * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[0].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]); + * // SMALDRS + * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]); + * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[0].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]); + * // SMALXDS + * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]); + * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[0].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]); + * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMALDRS(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smaldrs %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.107.2. SMALDRS ===== */ + +/* ===== Inline Function Start for 3.107.3. SMALXDS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMALXDS (Signed Crossed Multiply Two Halfs & Subtract & Add 64- bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMALDS Rd, Rs1, Rs2 + * SMALDRS Rd, Rs1, Rs2 + * SMALXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to + * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is + * written back to the register-pair. + * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements) + * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements) + * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of + * Rs1 with the top 16-bit content of Rs2. + * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content + * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1 + * with the bottom 16-bit content of Rs2. + * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit + * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of + * Rs1 with the bottom 16-bit content of Rs2. + * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by + * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and + * Rs2, and the 64-bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content + * of the 32-bit elements of Rs2. + * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of + * the 32-bit elements of Rs2. + * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit + * content of the 32-bit elements of Rs2. + * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written + * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * * RV32: + * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS + * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS + * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]); + * * RV64: + * // SMALDS + * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]); + * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[0].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]); + * // SMALDRS + * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]); + * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[0].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]); + * // SMALXDS + * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]); + * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[0].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]); + * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMALXDS(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smalxds %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.107.3. SMALXDS ===== */ + +/* ===== Inline Function Start for 3.108. SMAR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB + * \brief SMAR64 (Signed Multiply and Add to 64-Bit Data) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMAR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit signed elements in two registers and add the 64-bit multiplication + * result to the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is written + * back to the pair of registers (RV32) or a register (RV64). + * + * **RV32 Description**:\n + * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It adds + * the 64-bit multiplication result to the 64-bit signed data of an even/odd pair of registers specified by + * Rd(4,1). The addition result is written back to the even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It + * adds the 64-bit multiplication results to the 64-bit signed data of Rd. The addition result is written + * back to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * R[t_H].R[t_L] = R[t_H].R[t_L] + (Rs1 * Rs2); + * * RV64: + * Rd = Rd + (Rs1.W[0] * Rs2.W[0]) + (Rs1.W[1] * Rs2.W[1]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMAR64(long long t, long a, long b) +{ + __ASM volatile("smar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.108. SMAR64 ===== */ + +/* ===== Inline Function Start for 3.109. SMAQA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD + * \brief SMAQA (Signed Multiply Four Bytes with 32-bit Adds) + * \details + * **Type**: Partial-SIMD (Reduction) + * + * **Syntax**:\n + * ~~~ + * SMAQA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do four signed 8-bit multiplications from 32-bit chunks of two registers; and then adds + * the four 16-bit results and the content of corresponding 32-bit chunks of a third register together. + * + * **Description**:\n + * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four + * signed 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the signed + * content of the corresponding 32-bit chunks of Rd. The final results are written back to the + * corresponding 32-bit chunks in Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rd.W[x] + + * (Rs1.W[x].B[3] s* Rs2.W[x].B[3]) + (Rs1.W[x].B[2] s* Rs2.W[x].B[2]) + + * (Rs1.W[x].B[1] s* Rs2.W[x].B[1]) + (Rs1.W[x].B[0] s* Rs2.W[x].B[0]); + * Rd.W[x] = res[x]; + * for RV32: x=0, + * for RV64: x=1,0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMAQA(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smaqa %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.109. SMAQA ===== */ + +/* ===== Inline Function Start for 3.110. SMAQA.SU ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD + * \brief SMAQA.SU (Signed and Unsigned Multiply Four Bytes with 32-bit Adds) + * \details + * **Type**: Partial-SIMD (Reduction) + * + * **Syntax**:\n + * ~~~ + * SMAQA.SU Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do four `signed x unsigned` 8-bit multiplications from 32-bit chunks of two registers; and + * then adds the four 16-bit results and the content of corresponding 32-bit chunks of a third register + * together. + * + * **Description**:\n + * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four + * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the + * signed content of the corresponding 32-bit chunks of Rd. The final results are written back to the + * corresponding 32-bit chunks in Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rd.W[x] + + * (Rs1.W[x].B[3] su* Rs2.W[x].B[3]) + (Rs1.W[x].B[2] su* Rs2.W[x].B[2]) + + * (Rs1.W[x].B[1] su* Rs2.W[x].B[1]) + (Rs1.W[x].B[0] su* Rs2.W[x].B[0]); + * Rd.W[x] = res[x]; + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMAQA_SU(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smaqa.su %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.110. SMAQA.SU ===== */ + +/* ===== Inline Function Start for 3.111. SMAX8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief SMAX8 (SIMD 8-bit Signed Maximum) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMAX8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed integer elements finding maximum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit + * signed integer elements in Rs2 and selects the numbers that is greater than the other one. The + * selected results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] > Rs2.B[x])? Rs1.B[x] : Rs2.B[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SMAX8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("smax8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.111. SMAX8 ===== */ + +/* ===== Inline Function Start for 3.112. SMAX16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief SMAX16 (SIMD 16-bit Signed Maximum) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMAX16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer elements finding maximum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit + * signed integer elements in Rs2 and selects the numbers that is greater than the other one. The + * selected results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] > Rs2.H[x])? Rs1.H[x] : Rs2.H[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SMAX16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("smax16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.112. SMAX16 ===== */ + +/* ===== Inline Function Start for 3.113.1. SMBB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief SMBB16 (SIMD Signed Multiply Bottom Half & Bottom Half) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMBB16 Rd, Rs1, Rs2 + * SMBT16 Rd, Rs1, Rs2 + * SMTT16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed 16- + * bit content of the 32-bit elements of another register and write the result to a third register. + * * SMBB16: W[x].bottom*W[x].bottom + * * SMBT16: W[x].bottom *W[x].top + * * SMTT16: W[x].top * W[x].top + * + * **Description**:\n + * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2. + * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2. + * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2. + * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16 + * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16 + * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16 + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMBB16(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smbb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.113.1. SMBB16 ===== */ + +/* ===== Inline Function Start for 3.113.2. SMBT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief SMBT16 (SIMD Signed Multiply Bottom Half & Top Half) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMBB16 Rd, Rs1, Rs2 + * SMBT16 Rd, Rs1, Rs2 + * SMTT16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed 16- + * bit content of the 32-bit elements of another register and write the result to a third register. + * * SMBB16: W[x].bottom*W[x].bottom + * * SMBT16: W[x].bottom *W[x].top + * * SMTT16: W[x].top * W[x].top + * + * **Description**:\n + * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2. + * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2. + * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2. + * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16 + * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16 + * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16 + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMBT16(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smbt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.113.2. SMBT16 ===== */ + +/* ===== Inline Function Start for 3.113.3. SMTT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief SMTT16 (SIMD Signed Multiply Top Half & Top Half) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMBB16 Rd, Rs1, Rs2 + * SMBT16 Rd, Rs1, Rs2 + * SMTT16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed 16- + * bit content of the 32-bit elements of another register and write the result to a third register. + * * SMBB16: W[x].bottom*W[x].bottom + * * SMBT16: W[x].bottom *W[x].top + * * SMTT16: W[x].top * W[x].top + * + * **Description**:\n + * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2. + * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2. + * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2. + * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16 + * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16 + * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16 + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMTT16(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smtt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.113.3. SMTT16 ===== */ + +/* ===== Inline Function Start for 3.114.1. SMDS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief SMDS (SIMD Signed Multiply Two Halfs and Subtract) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMDS Rd, Rs1, Rs2 + * SMDRS Rd, Rs1, Rs2 + * SMXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * perform a subtraction operation between the two 32-bit results. + * * SMDS: top*top - bottom*bottom (per 32-bit element) + * * SMDRS: bottom*bottom - top*top (per 32-bit element) + * * SMXDS: top*bottom - bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with + * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result + * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the + * 32-bit elements of Rs2. + * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of + * the 32-bit elements of Rs2. + * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit + * content of the 32-bit elements of Rs2. + * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of + * multiplication are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * * SMDS: + * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * * SMDRS: + * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]); + * * SMXDS: + * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMDS(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smds %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.114.1. SMDS ===== */ + +/* ===== Inline Function Start for 3.114.2. SMDRS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief SMDRS (SIMD Signed Multiply Two Halfs and Reverse Subtract) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMDS Rd, Rs1, Rs2 + * SMDRS Rd, Rs1, Rs2 + * SMXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * perform a subtraction operation between the two 32-bit results. + * * SMDS: top*top - bottom*bottom (per 32-bit element) + * * SMDRS: bottom*bottom - top*top (per 32-bit element) + * * SMXDS: top*bottom - bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with + * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result + * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the + * 32-bit elements of Rs2. + * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of + * the 32-bit elements of Rs2. + * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit + * content of the 32-bit elements of Rs2. + * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of + * multiplication are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * * SMDS: + * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * * SMDRS: + * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]); + * * SMXDS: + * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMDRS(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smdrs %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.114.2. SMDRS ===== */ + +/* ===== Inline Function Start for 3.114.3. SMXDS ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB + * \brief SMXDS (SIMD Signed Crossed Multiply Two Halfs and Subtract) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMDS Rd, Rs1, Rs2 + * SMDRS Rd, Rs1, Rs2 + * SMXDS Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * perform a subtraction operation between the two 32-bit results. + * * SMDS: top*top - bottom*bottom (per 32-bit element) + * * SMDRS: bottom*bottom - top*top (per 32-bit element) + * * SMXDS: top*bottom - bottom*top (per 32-bit element) + * + * **Description**:\n + * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with + * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result + * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the + * 32-bit elements of Rs2. + * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of + * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of + * the 32-bit elements of Rs2. + * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the + * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit + * content of the 32-bit elements of Rs2. + * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of + * multiplication are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * * SMDS: + * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]); + * * SMDRS: + * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]); + * * SMXDS: + * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMXDS(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smxds %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.114.3. SMXDS ===== */ + +/* ===== Inline Function Start for 3.115. SMIN8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief SMIN8 (SIMD 8-bit Signed Minimum) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMIN8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed integer elements finding minimum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit + * signed integer elements in Rs2 and selects the numbers that is less than the other one. The selected + * results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] < Rs2.B[x])? Rs1.B[x] : Rs2.B[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SMIN8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("smin8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.115. SMIN8 ===== */ + +/* ===== Inline Function Start for 3.116. SMIN16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief SMIN16 (SIMD 16-bit Signed Minimum) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMIN16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer elements finding minimum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit + * signed integer elements in Rs2 and selects the numbers that is less than the other one. The selected + * results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] < Rs2.H[x])? Rs1.H[x] : Rs2.H[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SMIN16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("smin16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.116. SMIN16 ===== */ + +/* ===== Inline Function Start for 3.117.1. SMMUL ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC + * \brief SMMUL (SIMD MSW Signed Multiply Word) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMMUL Rd, Rs1, Rs2 + * SMMUL.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit signed integer elements of two registers and write the most significant + * 32-bit results to the corresponding 32-bit elements of a register. The `.u` form performs an + * additional rounding up operation on the multiplication results before taking the most significant + * 32-bit part of the results. + * + * **Description**:\n + * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the + * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit + * elements of Rs1 and Rs2 are treated as signed integers. The `.u` form of the instruction rounds up + * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results. + * * For `smmul/RV32` instruction, it is an alias to `mulh/RV32` instruction. + * + * **Operations**:\n + * ~~~ + * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][63:31] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][63:32]; + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMMUL(long a, long b) +{ + register long result; + __ASM volatile("smmul %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.117.1. SMMUL ===== */ + +/* ===== Inline Function Start for 3.117.2. SMMUL.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC + * \brief SMMUL.u (SIMD MSW Signed Multiply Word with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMMUL Rd, Rs1, Rs2 + * SMMUL.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit signed integer elements of two registers and write the most significant + * 32-bit results to the corresponding 32-bit elements of a register. The `.u` form performs an + * additional rounding up operation on the multiplication results before taking the most significant + * 32-bit part of the results. + * + * **Description**:\n + * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the + * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit + * elements of Rs1 and Rs2 are treated as signed integers. The `.u` form of the instruction rounds up + * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results. + * * For `smmul/RV32` instruction, it is an alias to `mulh/RV32` instruction. + * + * **Operations**:\n + * ~~~ + * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][63:31] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][63:32]; + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMMUL_U(long a, long b) +{ + register long result; + __ASM volatile("smmul.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.117.2. SMMUL.u ===== */ + +/* ===== Inline Function Start for 3.118.1. SMMWB ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief SMMWB (SIMD MSW Signed Multiply Word and Bottom Half) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMMWB Rd, Rs1, Rs2 + * SMMWB.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the + * corresponding 32-bit elements of another register, and write the most significant 32-bit results to + * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most + * significant discarded bit. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content + * of the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication + * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the + * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results. + * + * **Operations**:\n + * ~~~ + * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][47:15] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][47:16]; + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMMWB(long a, unsigned long b) +{ + register long result; + __ASM volatile("smmwb %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.118.1. SMMWB ===== */ + +/* ===== Inline Function Start for 3.118.2. SMMWB.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief SMMWB.u (SIMD MSW Signed Multiply Word and Bottom Half with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMMWB Rd, Rs1, Rs2 + * SMMWB.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the + * corresponding 32-bit elements of another register, and write the most significant 32-bit results to + * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most + * significant discarded bit. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content + * of the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication + * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the + * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results. + * + * **Operations**:\n + * ~~~ + * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][47:15] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][47:16]; + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMMWB_U(long a, unsigned long b) +{ + register long result; + __ASM volatile("smmwb.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.118.2. SMMWB.u ===== */ + +/* ===== Inline Function Start for 3.119.1. SMMWT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief SMMWT (SIMD MSW Signed Multiply Word and Top Half) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMMWT Rd, Rs1, Rs2 + * SMMWT.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the + * corresponding 32-bit elements of another register, and write the most significant 32-bit results to + * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most + * significant discarded bit. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the top signed 16-bit content of + * the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication + * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the + * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results. + * + * **Operations**:\n + * ~~~ + * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][47:15] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][47:16]; + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMMWT(long a, unsigned long b) +{ + register long result; + __ASM volatile("smmwt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.119.1. SMMWT ===== */ + +/* ===== Inline Function Start for 3.119.2. SMMWT.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC + * \brief SMMWT.u (SIMD MSW Signed Multiply Word and Top Half with Rounding) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMMWT Rd, Rs1, Rs2 + * SMMWT.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the + * corresponding 32-bit elements of another register, and write the most significant 32-bit results to + * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most + * significant discarded bit. + * + * **Description**:\n + * This instruction multiplies the signed 32-bit elements of Rs1 with the top signed 16-bit content of + * the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication + * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the + * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results. + * + * **Operations**:\n + * ~~~ + * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1]; + * if (`.u` form) { + * Round[x][32:0] = Mres[x][47:15] + 1; + * Rd.W[x] = Round[x][32:1]; + * } else { + * Rd.W[x] = Mres[x][47:16]; + * } + * for RV32: x=0 + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMMWT_U(long a, unsigned long b) +{ + register long result; + __ASM volatile("smmwt.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.119.2. SMMWT.u ===== */ + +/* ===== Inline Function Start for 3.120.1. SMSLDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMSLDA (Signed Multiply Two Halfs & Add & Subtract 64-bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMSLDA Rd, Rs1, Rs2 + * SMSLXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * subtracts the two 32-bit results from the 64-bit value of an even/odd pair of registers (RV32) or a + * register (RV64). The subtraction result is written back to the register-pair. + * * SMSLDA: rd pair - top*top - bottom*bottom (all 32-bit elements) + * * SMSLXDA: rd pair - top*bottom - bottom*top (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2. + * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2. + * The two multiplication results are subtracted from the 64-bit value of an even/odd pair of registers + * specified by Rd(4,1). The 64-bit subtraction result is written back to the register-pair. The 16-bit + * values of Rs1 and Rs2, and the 64-bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of + * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2. + * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the bottom 16-bit content of + * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2. + * The four multiplication results are subtracted from the 64-bit value of Rd. The 64-bit subtraction + * result is written back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated + * as signed integers. + * + * **Operations**:\n + * ~~~ + * * RV32: + * // SMSLDA + * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]); + * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]); + * // SMSLXDA + * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]); + * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]); + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] - SE64(Mres0[31:0]) - SE64(Mres1[31:0]); + * * RV64: + * // SMSLDA + * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]); + * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]); + * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]); + * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]); + * // SMSLXDA + * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]); + * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]); + * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]); + * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]); + * Rd = Rd - SE64(Mres0[0][31:0]) - SE64(Mres1[0][31:0]) - SE64(Mres0[1][31:0]) - + * SE64(Mres1[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMSLDA(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smslda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.120.1. SMSLDA ===== */ + +/* ===== Inline Function Start for 3.120.2. SMSLXDA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB + * \brief SMSLXDA (Signed Crossed Multiply Two Halfs & Add & Subtract 64- bit) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMSLDA Rd, Rs1, Rs2 + * SMSLXDA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then + * subtracts the two 32-bit results from the 64-bit value of an even/odd pair of registers (RV32) or a + * register (RV64). The subtraction result is written back to the register-pair. + * * SMSLDA: rd pair - top*top - bottom*bottom (all 32-bit elements) + * * SMSLXDA: rd pair - top*bottom - bottom*top (all 32-bit elements) + * + * **RV32 Description**:\n + * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit + * content Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2. + * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit + * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2. + * The two multiplication results are subtracted from the 64-bit value of an even/odd pair of registers + * specified by Rd(4,1). The 64-bit subtraction result is written back to the register-pair. The 16-bit + * values of Rs1 and Rs2, and the 64-bit value of the register-pair are treated as signed integers. + * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 + * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of + * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2. + * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with + * the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the bottom 16-bit content of + * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2. + * The four multiplication results are subtracted from the 64-bit value of Rd. The 64-bit subtraction + * result is written back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated + * as signed integers. + * + * **Operations**:\n + * ~~~ + * * RV32: + * // SMSLDA + * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]); + * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]); + * // SMSLXDA + * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]); + * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]); + * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1); + * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] - SE64(Mres0[31:0]) - SE64(Mres1[31:0]); + * * RV64: + * // SMSLDA + * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]); + * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]); + * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]); + * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]); + * // SMSLXDA + * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]); + * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]); + * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]); + * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]); + * Rd = Rd - SE64(Mres0[0][31:0]) - SE64(Mres1[0][31:0]) - SE64(Mres0[1][31:0]) - + * SE64(Mres1[1][31:0]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMSLXDA(long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("smslxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.120.2. SMSLXDA ===== */ + +/* ===== Inline Function Start for 3.121. SMSR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB + * \brief SMSR64 (Signed Multiply and Subtract from 64- Bit Data) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SMSR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit signed elements in two registers and subtract the 64-bit multiplication + * results from the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is + * written back to the pair of registers (RV32) or a register (RV64). + * + * **RV32 Description**:\n + * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It + * subtracts the 64-bit multiplication result from the 64-bit signed data of an even/odd pair of registers + * specified by Rd(4,1). The subtraction result is written back to the even/odd pair of registers + * specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It + * subtracts the 64-bit multiplication results from the 64-bit signed data of Rd. The subtraction result is + * written back to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * R[t_H].R[t_L] = R[t_H].R[t_L] - (Rs1 * Rs2); + * * RV64: + * Rd = Rd - (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); + * ~~~ + * + * \param [in] t long long type of value stored in t + * \param [in] a long type of value stored in a + * \param [in] b long type of value stored in b + * \return value stored in long long type + */ +__STATIC_FORCEINLINE long long __RV_SMSR64(long long t, long a, long b) +{ + __ASM volatile("smsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.121. SMSR64 ===== */ + +/* ===== Inline Function Start for 3.122.1. SMUL8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY + * \brief SMUL8 (SIMD Signed 8-bit Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMUL8 Rd, Rs1, Rs2 + * SMULX8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do signed 8-bit multiplications and generate four 16-bit results simultaneously. + * + * **RV32 Description**:\n + * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the + * corresponding 8-bit data elements of Rs2. + * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the + * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data + * elements of Rs1 with the fourth and third 8-bit data elements of Rs2. + * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1). + * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of + * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom + * part of Rs1. + * + * **RV64 Description**:\n + * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the + * corresponding 8-bit data elements of Rs2. + * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the + * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data + * elements of Rs1 with the fourth and third 8-bit data elements of Rs2. + * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results + * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from + * the bottom part of Rs1. + * + * **Operations**:\n + * ~~~ + * * RV32: + * if (is `SMUL8`) { + * op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top + * op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom + * } else if (is `SMULX8`) { + * op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top + * op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom + * } + * rest[x/2] = op1t[x/2] s* op2t[x/2]; + * resb[x/2] = op1b[x/2] s* op2b[x/2]; + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1]; + * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0]; + * x = 0 and 2 + * * RV64: + * if (is `SMUL8`) { + * op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top + * op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom + * } else if (is `SMULX8`) { + * op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top + * op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom + * } + * rest[x/2] = op1t[x/2] s* op2t[x/2]; + * resb[x/2] = op1b[x/2] s* op2b[x/2]; + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1]; + * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0]; + * x = 0 and 2 + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_SMUL8(unsigned int a, unsigned int b) +{ + register unsigned long long result; + __ASM volatile("smul8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.122.1. SMUL8 ===== */ + +/* ===== Inline Function Start for 3.122.2. SMULX8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY + * \brief SMULX8 (SIMD Signed Crossed 8-bit Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMUL8 Rd, Rs1, Rs2 + * SMULX8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do signed 8-bit multiplications and generate four 16-bit results simultaneously. + * + * **RV32 Description**:\n + * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the + * corresponding 8-bit data elements of Rs2. + * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the + * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data + * elements of Rs1 with the fourth and third 8-bit data elements of Rs2. + * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1). + * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of + * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom + * part of Rs1. + * + * **RV64 Description**:\n + * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the + * corresponding 8-bit data elements of Rs2. + * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the + * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data + * elements of Rs1 with the fourth and third 8-bit data elements of Rs2. + * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results + * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from + * the bottom part of Rs1. + * + * **Operations**:\n + * ~~~ + * * RV32: + * if (is `SMUL8`) { + * op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top + * op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom + * } else if (is `SMULX8`) { + * op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top + * op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom + * } + * rest[x/2] = op1t[x/2] s* op2t[x/2]; + * resb[x/2] = op1b[x/2] s* op2b[x/2]; + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1]; + * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0]; + * x = 0 and 2 + * * RV64: + * if (is `SMUL8`) { + * op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top + * op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom + * } else if (is `SMULX8`) { + * op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top + * op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom + * } + * rest[x/2] = op1t[x/2] s* op2t[x/2]; + * resb[x/2] = op1b[x/2] s* op2b[x/2]; + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1]; + * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0]; + * x = 0 and 2 + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_SMULX8(unsigned int a, unsigned int b) +{ + register unsigned long long result; + __ASM volatile("smulx8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.122.2. SMULX8 ===== */ + +/* ===== Inline Function Start for 3.123.1. SMUL16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY + * \brief SMUL16 (SIMD Signed 16-bit Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMUL16 Rd, Rs1, Rs2 + * SMULX16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do signed 16-bit multiplications and generate two 32-bit results simultaneously. + * + * **RV32 Description**:\n + * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of Rs1 with + * the top 16-bit Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 + * with the bottom 16-bit Q15 content of Rs2. + * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of Rs1 with the bottom 16-bit + * Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 with the top 16- + * bit Q15 content of Rs2. + * The two Q30 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1), + * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes + * register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and + * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1. + * + * **RV64 Description**:\n + * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of the lower + * 32-bit word in Rs1 with the top 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, + * multiply the bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the bottom 16-bit Q15 + * content of the lower 32-bit word in Rs2. + * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of the lower 32-bit word in Rs1 + * with the bottom 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, multiply the + * bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the top 16-bit Q15 content of the + * lower 32-bit word in Rs2. + * The two 32-bit Q30 results are then written into Rd. The result calculated from the top 16-bit of the + * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of + * the lower 32-bit word in Rs1 is written to Rd.W[0] + * + * **Operations**:\n + * ~~~ + * * RV32: + * if (is `SMUL16`) { + * op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top + * op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom + * } else if (is `SMULX16`) { + * op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top + * op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom + * } + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * res = aop s* bop; + * } + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * R[t_H] = rest; + * R[t_L] = resb; + * * RV64: + * if (is `SMUL16`) { + * op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top + * op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom + * } else if (is `SMULX16`) { + * op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top + * op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom + * } + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * res = aop s* bop; + * } + * Rd.W[1] = rest; + * Rd.W[0] = resb; + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_SMUL16(unsigned int a, unsigned int b) +{ + register unsigned long long result; + __ASM volatile("smul16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.123.1. SMUL16 ===== */ + +/* ===== Inline Function Start for 3.123.2. SMULX16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY + * \brief SMULX16 (SIMD Signed Crossed 16-bit Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SMUL16 Rd, Rs1, Rs2 + * SMULX16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do signed 16-bit multiplications and generate two 32-bit results simultaneously. + * + * **RV32 Description**:\n + * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of Rs1 with + * the top 16-bit Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 + * with the bottom 16-bit Q15 content of Rs2. + * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of Rs1 with the bottom 16-bit + * Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 with the top 16- + * bit Q15 content of Rs2. + * The two Q30 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1), + * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes + * register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and + * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1. + * + * **RV64 Description**:\n + * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of the lower + * 32-bit word in Rs1 with the top 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, + * multiply the bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the bottom 16-bit Q15 + * content of the lower 32-bit word in Rs2. + * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of the lower 32-bit word in Rs1 + * with the bottom 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, multiply the + * bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the top 16-bit Q15 content of the + * lower 32-bit word in Rs2. + * The two 32-bit Q30 results are then written into Rd. The result calculated from the top 16-bit of the + * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of + * the lower 32-bit word in Rs1 is written to Rd.W[0] + * + * **Operations**:\n + * ~~~ + * * RV32: + * if (is `SMUL16`) { + * op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top + * op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom + * } else if (is `SMULX16`) { + * op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top + * op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom + * } + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * res = aop s* bop; + * } + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * R[t_H] = rest; + * R[t_L] = resb; + * * RV64: + * if (is `SMUL16`) { + * op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top + * op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom + * } else if (is `SMULX16`) { + * op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top + * op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom + * } + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * res = aop s* bop; + * } + * Rd.W[1] = rest; + * Rd.W[0] = resb; + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_SMULX16(unsigned int a, unsigned int b) +{ + register unsigned long long result; + __ASM volatile("smulx16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.123.2. SMULX16 ===== */ + +/* ===== Inline Function Start for 3.124. SRA.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief SRA.u (Rounding Shift Right Arithmetic) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SRA.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Perform an arithmetic right shift operation with rounding. The shift amount is a variable + * from a GPR. + * + * **Description**:\n + * This instruction right-shifts the content of Rs1 arithmetically. The shifted out bits are + * filled with the sign-bit and the shift amount is specified by the low-order 5-bits (RV32) or 6-bits + * (RV64) of the Rs2 register. For the rounding operation, a value of 1 is added to the most significant + * discarded bit of the data to calculate the final result. And the result is written to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * sa = Rs2[4:0]; + * if (sa > 0) { + * res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1; + * Rd = res[31:0]; + * } else { + * Rd = Rs1; + * } + * * RV64: + * sa = Rs2[5:0]; + * if (sa > 0) { + * res[63:-1] = SE65(Rs1[63:(sa-1)]) + 1; + * Rd = res[63:0]; + * } else { + * Rd = Rs1; + * } + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SRA_U(long a, unsigned int b) +{ + register long result; + __ASM volatile("sra.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.124. SRA.u ===== */ + +/* ===== Inline Function Start for 3.125. SRAI.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief SRAI.u (Rounding Shift Right Arithmetic Immediate) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SRAI.u Rd, Rs1, imm6u[4:0] (RV32) + * SRAI.u Rd, Rs1, imm6u[5:0] (RV64) + * ~~~ + * + * **Purpose**:\n + * Perform an arithmetic right shift operation with rounding. The shift amount is an + * immediate value. + * + * **Description**:\n + * This instruction right-shifts the content of Rs1 arithmetically. The shifted out bits are + * filled with the sign-bit and the shift amount is specified by the imm6u[4:0] (RV32) or imm6u[5:0] + * (RV64) constant . For the rounding operation, a value of 1 is added to the most significant discarded + * bit of the data to calculate the final result. And the result is written to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * sa = imm6u[4:0]; + * if (sa > 0) { + * res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1; + * Rd = res[31:0]; + * } else { + * Rd = Rs1; + * } + * * RV64: + * sa = imm6u[5:0]; + * if (sa > 0) { + * res[63:-1] = SE65(Rs1[63:(sa-1)]) + 1; + * Rd = res[63:0]; + * } else { + * Rd = Rs1; + * } + * ~~~ + * + * \param [in] a long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +#define __RV_SRAI_U(a, b) \ + ({ \ + register long result; \ + register long __a = (long)(a); \ + __ASM volatile("srai.u %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.125. SRAI.u ===== */ + +/* ===== Inline Function Start for 3.126.1. SRA8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SRA8 (SIMD 8-bit Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRA8 Rd, Rs1, Rs2 + * SRA8.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order + * 3-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is + * added to the most significant discarded bit of each 8-bit data element to calculate the final results. + * And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[2:0]; + * if (sa > 0) { + * if (`.u` form) { // SRA8.u + * res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[7:0]; + * } else { // SRA8 + * Rd.B[x] = SE8(Rd.B[x][7:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRA8(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("sra8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.126.1. SRA8 ===== */ + +/* ===== Inline Function Start for 3.126.2. SRA8.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SRA8.u (SIMD 8-bit Rounding Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRA8 Rd, Rs1, Rs2 + * SRA8.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order + * 3-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is + * added to the most significant discarded bit of each 8-bit data element to calculate the final results. + * And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[2:0]; + * if (sa > 0) { + * if (`.u` form) { // SRA8.u + * res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[7:0]; + * } else { // SRA8 + * Rd.B[x] = SE8(Rd.B[x][7:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRA8_U(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("sra8.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.126.2. SRA8.u ===== */ + +/* ===== Inline Function Start for 3.127.1. SRAI8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SRAI8 (SIMD 8-bit Shift Right Arithmetic Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRAI8 Rd, Rs1, imm3u + * SRAI8.u Rd, Rs1, imm3u + * ~~~ + * + * **Purpose**:\n + * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is an + * immediate value. The `.u` form performs additional rounding up operations on the shifted results. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the data elements. The shift amount is specified by the imm3u + * constant. For the rounding operation of the `.u` form, a value of 1 is added to the most significant + * discarded bit of each 8-bit data element to calculate the final results. And the results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm3u[2:0]; + * if (sa > 0) { + * if (`.u` form) { // SRA8.u + * res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[7:0]; + * } else { // SRA8 + * Rd.B[x] = SE8(Rd.B[x][7:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SRAI8(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("srai8 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.127.1. SRAI8 ===== */ + +/* ===== Inline Function Start for 3.127.2. SRAI8.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SRAI8.u (SIMD 8-bit Rounding Shift Right Arithmetic Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRAI8 Rd, Rs1, imm3u + * SRAI8.u Rd, Rs1, imm3u + * ~~~ + * + * **Purpose**:\n + * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is an + * immediate value. The `.u` form performs additional rounding up operations on the shifted results. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the data elements. The shift amount is specified by the imm3u + * constant. For the rounding operation of the `.u` form, a value of 1 is added to the most significant + * discarded bit of each 8-bit data element to calculate the final results. And the results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm3u[2:0]; + * if (sa > 0) { + * if (`.u` form) { // SRA8.u + * res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[7:0]; + * } else { // SRA8 + * Rd.B[x] = SE8(Rd.B[x][7:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SRAI8_U(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("srai8.u %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.127.2. SRAI8.u ===== */ + +/* ===== Inline Function Start for 3.128.1. SRA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SRA16 (SIMD 16-bit Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRA16 Rd, Rs1, Rs2 + * SRA16.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order + * 4-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is + * added to the most significant discarded bit of each 16-bit data element to calculate the final results. + * And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[3:0]; + * if (sa != 0) { + * if (`.u` form) { // SRA16.u + * res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[15:0]; + * } else { // SRA16 + * Rd.H[x] = SE16(Rs1.H[x][15:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("sra16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.128.1. SRA16 ===== */ + +/* ===== Inline Function Start for 3.128.2. SRA16.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SRA16.u (SIMD 16-bit Rounding Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRA16 Rd, Rs1, Rs2 + * SRA16.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order + * 4-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is + * added to the most significant discarded bit of each 16-bit data element to calculate the final results. + * And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[3:0]; + * if (sa != 0) { + * if (`.u` form) { // SRA16.u + * res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[15:0]; + * } else { // SRA16 + * Rd.H[x] = SE16(Rs1.H[x][15:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRA16_U(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("sra16.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.128.2. SRA16.u ===== */ + +/* ===== Inline Function Start for 3.129.1. SRAI16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SRAI16 (SIMD 16-bit Shift Right Arithmetic Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRAI16 Rd, Rs1, imm4u + * SRAI16.u Rd, Rs1, imm4u + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements arithmetic right shift operations simultaneously. The shift amount is + * an immediate value. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the 16-bit data elements. The shift amount is specified by the + * imm4u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most + * significant discarded bit of each 16-bit data to calculate the final results. And the results are written + * to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm4u[3:0]; + * if (sa > 0) { + * if (`.u` form) { // SRAI16.u + * res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[15:0]; + * } else { // SRAI16 + * Rd.H[x] = SE16(Rs1.H[x][15:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SRAI16(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("srai16 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.129.1. SRAI16 ===== */ + +/* ===== Inline Function Start for 3.129.2. SRAI16.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SRAI16.u (SIMD 16-bit Rounding Shift Right Arithmetic Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRAI16 Rd, Rs1, imm4u + * SRAI16.u Rd, Rs1, imm4u + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements arithmetic right shift operations simultaneously. The shift amount is + * an immediate value. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the 16-bit data elements. The shift amount is specified by the + * imm4u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most + * significant discarded bit of each 16-bit data to calculate the final results. And the results are written + * to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm4u[3:0]; + * if (sa > 0) { + * if (`.u` form) { // SRAI16.u + * res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[15:0]; + * } else { // SRAI16 + * Rd.H[x] = SE16(Rs1.H[x][15:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SRAI16_U(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("srai16.u %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.129.2. SRAI16.u ===== */ + +/* ===== Inline Function Start for 3.130.1. SRL8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SRL8 (SIMD 8-bit Shift Right Logical) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRL8 Rt, Ra, Rb + * SRL8.u Rt, Ra, Rb + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are + * filled with zero. The shift amount is specified by the low-order 3-bits of the value in the Rs2 register. + * For the rounding operation of the `.u` form, a value of 1 is added to the most significant discarded + * bit of each 8-bit data element to calculate the final results. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[2:0]; + * if (sa > 0) { + * if (`.u` form) { // SRL8.u + * res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[8:1]; + * } else { // SRL8 + * Rd.B[x] = ZE8(Rs1.B[x][7:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRL8(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srl8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.130.1. SRL8 ===== */ + +/* ===== Inline Function Start for 3.130.2. SRL8.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SRL8.u (SIMD 8-bit Rounding Shift Right Logical) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRL8 Rt, Ra, Rb + * SRL8.u Rt, Ra, Rb + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are + * filled with zero. The shift amount is specified by the low-order 3-bits of the value in the Rs2 register. + * For the rounding operation of the `.u` form, a value of 1 is added to the most significant discarded + * bit of each 8-bit data element to calculate the final results. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[2:0]; + * if (sa > 0) { + * if (`.u` form) { // SRL8.u + * res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[8:1]; + * } else { // SRL8 + * Rd.B[x] = ZE8(Rs1.B[x][7:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRL8_U(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srl8.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.130.2. SRL8.u ===== */ + +/* ===== Inline Function Start for 3.131.1. SRLI8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SRLI8 (SIMD 8-bit Shift Right Logical Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRLI8 Rt, Ra, imm3u + * SRLI8.u Rt, Ra, imm3u + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical right shift operations simultaneously. The shift amount is an + * immediate value. The `.u` form performs additional rounding up operations on the shifted results. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are + * filled with zero. The shift amount is specified by the imm3u constant. For the rounding operation of + * the `.u` form, a value of 1 is added to the most significant discarded bit of each 8-bit data element to + * calculate the final results. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm3u[2:0]; + * if (sa > 0) { + * if (`.u` form) { // SRLI8.u + * res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[8:1]; + * } else { // SRLI8 + * Rd.B[x] = ZE8(Rs1.B[x][7:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SRLI8(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("srli8 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.131.1. SRLI8 ===== */ + +/* ===== Inline Function Start for 3.131.2. SRLI8.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT + * \brief SRLI8.u (SIMD 8-bit Rounding Shift Right Logical Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRLI8 Rt, Ra, imm3u + * SRLI8.u Rt, Ra, imm3u + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical right shift operations simultaneously. The shift amount is an + * immediate value. The `.u` form performs additional rounding up operations on the shifted results. + * + * **Description**:\n + * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are + * filled with zero. The shift amount is specified by the imm3u constant. For the rounding operation of + * the `.u` form, a value of 1 is added to the most significant discarded bit of each 8-bit data element to + * calculate the final results. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm3u[2:0]; + * if (sa > 0) { + * if (`.u` form) { // SRLI8.u + * res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1; + * Rd.B[x] = res[8:1]; + * } else { // SRLI8 + * Rd.B[x] = ZE8(Rs1.B[x][7:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SRLI8_U(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("srli8.u %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.131.2. SRLI8.u ===== */ + +/* ===== Inline Function Start for 3.132.1. SRL16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SRL16 (SIMD 16-bit Shift Right Logical) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRL16 Rt, Ra, Rb + * SRL16.u Rt, Ra, Rb + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical right shift operations simultaneously. The shift amount is a variable from a GPR. The `.u` form performs additional rounding upoperations on the shifted results. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits + * are filled with zero. The shift amount is specified by the low-order 4-bits of the value in the Rs2 + * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant + * discarded bit of each 16-bit data element to calculate the final results. And the results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[3:0]; + * if (sa > 0) { + * if (`.u` form) { // SRL16.u + * res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[16:1]; + * } else { // SRL16 + * Rd.H[x] = ZE16(Rs1.H[x][15:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRL16(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srl16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.132.1. SRL16 ===== */ + +/* ===== Inline Function Start for 3.132.2. SRL16.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SRL16.u (SIMD 16-bit Rounding Shift Right Logical) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRL16 Rt, Ra, Rb + * SRL16.u Rt, Ra, Rb + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical right shift operations simultaneously. The shift amount is a variable from a GPR. The `.u` form performs additional rounding upoperations on the shifted results. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits + * are filled with zero. The shift amount is specified by the low-order 4-bits of the value in the Rs2 + * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant + * discarded bit of each 16-bit data element to calculate the final results. And the results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[3:0]; + * if (sa > 0) { + * if (`.u` form) { // SRL16.u + * res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[16:1]; + * } else { // SRL16 + * Rd.H[x] = ZE16(Rs1.H[x][15:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRL16_U(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srl16.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.132.2. SRL16.u ===== */ + +/* ===== Inline Function Start for 3.133.1. SRLI16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SRLI16 (SIMD 16-bit Shift Right Logical Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRLI16 Rt, Ra, imm4u + * SRLI16.u Rt, Ra, imm4u + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical right shift operations simultaneously. The shift amount is an + * immediate value. The `.u` form performs additional rounding up operations on the shifted results. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits + * are filled with zero. The shift amount is specified by the imm4u constant. For the rounding + * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 16-bit + * data element to calculate the final results. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm4u; + * if (sa > 0) { + * if (`.u` form) { // SRLI16.u + * res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[16:1]; + * } else { // SRLI16 + * Rd.H[x] = ZE16(Rs1.H[x][15:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SRLI16(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("srli16 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.133.1. SRLI16 ===== */ + +/* ===== Inline Function Start for 3.133.2. SRLI16.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT + * \brief SRLI16.u (SIMD 16-bit Rounding Shift Right Logical Immediate) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SRLI16 Rt, Ra, imm4u + * SRLI16.u Rt, Ra, imm4u + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical right shift operations simultaneously. The shift amount is an + * immediate value. The `.u` form performs additional rounding up operations on the shifted results. + * + * **Description**:\n + * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits + * are filled with zero. The shift amount is specified by the imm4u constant. For the rounding + * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 16-bit + * data element to calculate the final results. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm4u; + * if (sa > 0) { + * if (`.u` form) { // SRLI16.u + * res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1; + * Rd.H[x] = res[16:1]; + * } else { // SRLI16 + * Rd.H[x] = ZE16(Rs1.H[x][15:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_SRLI16_U(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("srli16.u %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.133.2. SRLI16.u ===== */ + +/* ===== Inline Function Start for 3.134. STAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief STAS16 (SIMD 16-bit Straight Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * STAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit integer element addition and 16-bit integer element subtraction in a 32-bit + * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. + * + * **Description**:\n + * This instruction adds the 16-bit integer element in [31:16] of 32-bit chunks in Rs1 with + * the 16-bit integer element in [31:16] of 32-bit chunks in Rs2, and writes the result to [31:16] of 32-bit + * chunks in Rd; at the same time, it subtracts the 16-bit integer element in [15:0] of 32-bit chunks in + * Rs2 from the 16-bit integer element in [15:0] of 32-bit chunks, and writes the result to [15:0] of 32- + * bit chunks in Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned operations. + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = Rs1.W[x][31:16] + Rs2.W[x][31:16]; + * Rd.W[x][15:0] = Rs1.W[x][15:0] - Rs2.W[x][15:0]; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_STAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("stas16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.134. STAS16 ===== */ + +/* ===== Inline Function Start for 3.135. STSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief STSA16 (SIMD 16-bit Straight Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * STSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit integer element subtraction and 16-bit integer element addition in a 32-bit + * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. + * + * **Description**:\n + * This instruction subtracts the 16-bit integer element in [31:16] of 32-bit chunks in Rs2 + * from the 16-bit integer element in [31:16] of 32-bit chunks in Rs1, and writes the result to [31:16] of + * 32-bit chunks in Rd; at the same time, it adds the 16-bit integer element in [15:0] of 32-bit chunks in + * Rs2 with the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of + * 32-bit chunks in Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned operations. + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = Rs1.W[x][31:16] - Rs2.W[x][31:16]; + * Rd.W[x][15:0] = Rs1.W[x][15:0] + Rs2.W[x][15:0]; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_STSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("stsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.135. STSA16 ===== */ + +/* ===== Inline Function Start for 3.136. SUB8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB + * \brief SUB8 (SIMD 8-bit Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SUB8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit integer element subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 8-bit integer elements in Rs2 from the 8-bit integer + * elements in Rs1, and then writes the result to Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned subtraction. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = Rs1.B[x] - Rs2.B[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SUB8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("sub8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.136. SUB8 ===== */ + +/* ===== Inline Function Start for 3.137. SUB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief SUB16 (SIMD 16-bit Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * SUB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit integer element subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 16-bit integer elements in Rs2 from the 16-bit integer + * elements in Rs1, and then writes the result to Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned subtraction. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = Rs1.H[x] - Rs2.H[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SUB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("sub16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.137. SUB16 ===== */ + +/* ===== Inline Function Start for 3.138. SUB64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief SUB64 (64-bit Subtraction) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * SUB64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Perform a 64-bit signed or unsigned integer subtraction. + * + * **RV32 Description**:\n + * This instruction subtracts the 64-bit integer of an even/odd pair of registers + * specified by Rs2(4,1) from the 64-bit integer of an even/odd pair of registers specified by Rs1(4,1), + * and then writes the 64-bit result to an even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * This instruction subtracts the 64-bit integer of Rs2 from the 64-bit integer of Rs1, + * and then writes the 64-bit result to Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned subtraction. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1); + * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1); + * R[t_H].R[t_L] = R[a_H].R[a_L] - R[b_H].R[b_L]; + * * RV64: + * Rd = Rs1 - Rs2; + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_SUB64(unsigned long long a, unsigned long long b) +{ + register unsigned long long result; + __ASM volatile("sub64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.138. SUB64 ===== */ + +/* ===== Inline Function Start for 3.139.1. SUNPKD810 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief SUNPKD810 (Signed Unpacking Bytes 1 & 0) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords + * of 32-bit chunks in a register. + * + * **Description**:\n + * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y]) + * // SUNPKD810, x=1,y=0 + * // SUNPKD820, x=2,y=0 + * // SUNPKD830, x=3,y=0 + * // SUNPKD831, x=3,y=1 + * // SUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SUNPKD810(unsigned long a) +{ + register unsigned long result; + __ASM volatile("sunpkd810 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.139.1. SUNPKD810 ===== */ + +/* ===== Inline Function Start for 3.139.2. SUNPKD820 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief SUNPKD820 (Signed Unpacking Bytes 2 & 0) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords + * of 32-bit chunks in a register. + * + * **Description**:\n + * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y]) + * // SUNPKD810, x=1,y=0 + * // SUNPKD820, x=2,y=0 + * // SUNPKD830, x=3,y=0 + * // SUNPKD831, x=3,y=1 + * // SUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SUNPKD820(unsigned long a) +{ + register unsigned long result; + __ASM volatile("sunpkd820 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.139.2. SUNPKD820 ===== */ + +/* ===== Inline Function Start for 3.139.3. SUNPKD830 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief SUNPKD830 (Signed Unpacking Bytes 3 & 0) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords + * of 32-bit chunks in a register. + * + * **Description**:\n + * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y]) + * // SUNPKD810, x=1,y=0 + * // SUNPKD820, x=2,y=0 + * // SUNPKD830, x=3,y=0 + * // SUNPKD831, x=3,y=1 + * // SUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SUNPKD830(unsigned long a) +{ + register unsigned long result; + __ASM volatile("sunpkd830 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.139.3. SUNPKD830 ===== */ + +/* ===== Inline Function Start for 3.139.4. SUNPKD831 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief SUNPKD831 (Signed Unpacking Bytes 3 & 1) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords + * of 32-bit chunks in a register. + * + * **Description**:\n + * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y]) + * // SUNPKD810, x=1,y=0 + * // SUNPKD820, x=2,y=0 + * // SUNPKD830, x=3,y=0 + * // SUNPKD831, x=3,y=1 + * // SUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SUNPKD831(unsigned long a) +{ + register unsigned long result; + __ASM volatile("sunpkd831 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.139.4. SUNPKD831 ===== */ + +/* ===== Inline Function Start for 3.139.5. SUNPKD832 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief SUNPKD832 (Signed Unpacking Bytes 3 & 2) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords + * of 32-bit chunks in a register. + * + * **Description**:\n + * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y]) + * // SUNPKD810, x=1,y=0 + * // SUNPKD820, x=2,y=0 + * // SUNPKD830, x=3,y=0 + * // SUNPKD831, x=3,y=1 + * // SUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SUNPKD832(unsigned long a) +{ + register unsigned long result; + __ASM volatile("sunpkd832 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.139.5. SUNPKD832 ===== */ + +/* ===== Inline Function Start for 3.140. SWAP8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief SWAP8 (Swap Byte within Halfword) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SWAP8 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Swap the bytes within each halfword of a register. + * + * **Description**:\n + * This instruction swaps the bytes within each halfword of Rs1 and writes the result to + * Rd. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = CONCAT(Rs1.H[x][7:0],Rs1.H[x][15:8]); + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SWAP8(unsigned long a) +{ + register unsigned long result; + __ASM volatile("swap8 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.140. SWAP8 ===== */ + +/* ===== Inline Function Start for 3.141. SWAP16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief SWAP16 (Swap Halfword within Word) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * SWAP16 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Swap the 16-bit halfwords within each word of a register. + * + * **Description**:\n + * This instruction swaps the 16-bit halfwords within each word of Rs1 and writes the + * result to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = CONCAT(Rs1.W[x][15:0],Rs1.H[x][31:16]); + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SWAP16(unsigned long a) +{ + register unsigned long result; + __ASM volatile("swap16 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.141. SWAP16 ===== */ + +/* ===== Inline Function Start for 3.142. UCLIP8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief UCLIP8 (SIMD 8-bit Unsigned Clip Value) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UCLIP8 Rt, Ra, imm3u + * ~~~ + * + * **Purpose**:\n + * Limit the 8-bit signed elements of a register into an unsigned range simultaneously. + * + * **Description**:\n + * This instruction limits the 8-bit signed elements stored in Rs1 into an unsigned integer + * range between 2^imm3u-1 and 0, and writes the limited results to Rd. For example, if imm3u is 3, the 8- + * bit input values should be saturated between 7 and 0. If saturation is performed, set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.H[x]; + * if (src > (2^imm3u)-1) { + * src = (2^imm3u)-1; + * OV = 1; + * } else if (src < 0) { + * src = 0; + * OV = 1; + * } + * Rd.H[x] = src; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_UCLIP8(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("uclip8 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.142. UCLIP8 ===== */ + +/* ===== Inline Function Start for 3.143. UCLIP16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief UCLIP16 (SIMD 16-bit Unsigned Clip Value) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UCLIP16 Rt, Ra, imm4u + * ~~~ + * + * **Purpose**:\n + * Limit the 16-bit signed elements of a register into an unsigned range simultaneously. + * + * **Description**:\n + * This instruction limits the 16-bit signed elements stored in Rs1 into an unsigned + * integer range between 2imm4u-1 and 0, and writes the limited results to Rd. For example, if imm4u is + * 3, the 16-bit input values should be saturated between 7 and 0. If saturation is performed, set OV bit + * to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.H[x]; + * if (src > (2^imm4u)-1) { + * src = (2^imm4u)-1; + * OV = 1; + * } else if (src < 0) { + * src = 0; + * OV = 1; + * } + * Rd.H[x] = src; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_UCLIP16(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("uclip16 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.143. UCLIP16 ===== */ + +/* ===== Inline Function Start for 3.144. UCLIP32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC + * \brief UCLIP32 (SIMD 32-bit Unsigned Clip Value) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UCLIP32 Rd, Rs1, imm5u[4:0] + * ~~~ + * + * **Purpose**:\n + * Limit the 32-bit signed integer elements of a register into an unsigned range + * simultaneously. + * + * **Description**:\n + * This instruction limits the 32-bit signed integer elements stored in Rs1 into an + * unsigned integer range between 2imm5u-1 and 0, and writes the limited results to Rd. For example, if + * imm5u is 3, the 32-bit input values should be saturated between 7 and 0. If saturation is performed, + * set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.W[x]; + * if (src > (2^imm5u)-1) { + * src = (2^imm5u)-1; + * OV = 1; + * } else if (src < 0) { + * src = 0; + * OV = 1; + * } + * Rd.W[x] = src + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_UCLIP32(a, b) \ + ({ \ + register unsigned long result; \ + register unsigned long __a = (unsigned long)(a); \ + __ASM volatile("uclip32 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.144. UCLIP32 ===== */ + +/* ===== Inline Function Start for 3.145. UCMPLE8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP + * \brief UCMPLE8 (SIMD 8-bit Unsigned Compare Less Than & Equal) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UCMPLE8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit unsigned integer elements less than & equal comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit + * unsigned integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it + * is true, the result is 0xFF; otherwise, the result is 0x0. The four comparison results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] <=u Rs2.B[x])? 0xff : 0x0; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UCMPLE8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ucmple8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.145. UCMPLE8 ===== */ + +/* ===== Inline Function Start for 3.146. UCMPLE16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP + * \brief UCMPLE16 (SIMD 16-bit Unsigned Compare Less Than & Equal) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UCMPLE16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer elements less than & equal comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit + * unsigned integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it + * is true, the result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are + * written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] <=u Rs2.H[x])? 0xffff : 0x0; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UCMPLE16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ucmple16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.146. UCMPLE16 ===== */ + +/* ===== Inline Function Start for 3.147. UCMPLT8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP + * \brief UCMPLT8 (SIMD 8-bit Unsigned Compare Less Than) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UCMPLT8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit unsigned integer elements less than comparisons simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit + * unsigned integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the + * result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] (2^8)-1) { + * res[x] = (2^8)-1; + * OV = 1; + * } + * Rd.B[x] = res[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKADD8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukadd8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.149. UKADD8 ===== */ + +/* ===== Inline Function Start for 3.150. UKADD16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief UKADD16 (SIMD 16-bit Unsigned Saturating Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UKADD16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer element saturating additions simultaneously. + * + * **Description**:\n + * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit + * unsigned integer elements in Rs2. If any of the results are beyond the 16-bit unsigned number + * range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.H[x] + Rs2.H[x]; + * if (res[x] > (2^16)-1) { + * res[x] = (2^16)-1; + * OV = 1; + * } + * Rd.H[x] = res[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKADD16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukadd16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.150. UKADD16 ===== */ + +/* ===== Inline Function Start for 3.151. UKADD64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief UKADD64 (64-bit Unsigned Saturating Addition) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * UKADD64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add two 64-bit unsigned integers. The result is saturated to the U64 range. + * + * **RV32 Description**:\n + * This instruction adds the 64-bit unsigned integer of an even/odd pair of registers + * specified by Rs1(4,1) with the 64-bit unsigned integer of an even/odd pair of registers specified by + * Rs2(4,1). If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to the + * range and the OV bit is set to 1. The saturated result is written to an even/odd pair of registers + * specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction adds the 64-bit unsigned integer in Rs1 with the 64-bit unsigned + * integer in Rs2. If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to + * the range and the OV bit is set to 1. The saturated result is written to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rt(4,1),1'b0); t_H = CONCAT(Rt(4,1),1'b1); + * a_L = CONCAT(Ra(4,1),1'b0); a_H = CONCAT(Ra(4,1),1'b1); + * b_L = CONCAT(Rb(4,1),1'b0); b_H = CONCAT(Rb(4,1),1'b1); + * result = R[a_H].R[a_L] + R[b_H].R[b_L]; + * if (result > (2^64)-1) { + * result = (2^64)-1; OV = 1; + * } + * R[t_H].R[t_L] = result; + * * RV64: + * result = Rs1 + Rs2; + * if (result > (2^64)-1) { + * result = (2^64)-1; OV = 1; + * } + * Rd = result; + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_UKADD64(unsigned long long a, unsigned long long b) +{ + register unsigned long long result; + __ASM volatile("ukadd64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.151. UKADD64 ===== */ + +/* ===== Inline Function Start for 3.152. UKADDH ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU + * \brief UKADDH (Unsigned Addition with U16 Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * UKADDH Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add the unsigned lower 32-bit content of two registers with U16 saturation. + * + * **Description**:\n + * The unsigned lower 32-bit content of Rs1 is added with the unsigned lower 32-bit + * content of Rs2. And the result is saturated to the 16-bit unsigned integer range of [0, 2^16-1] and then + * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag. + * + * **Operations**:\n + * ~~~ + * tmp = Rs1.W[0] + Rs2.W[0]; + * if (tmp > (2^16)-1) { + * tmp = (2^16)-1; + * OV = 1; + * } + * Rd = SE(tmp[15:0]); + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKADDH(unsigned int a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("ukaddh %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.152. UKADDH ===== */ + +/* ===== Inline Function Start for 3.153. UKADDW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief UKADDW (Unsigned Addition with U32 Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * UKADDW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add the unsigned lower 32-bit content of two registers with U32 saturation. + * + * **Description**:\n + * The unsigned lower 32-bit content of Rs1 is added with the unsigned lower 32-bit + * content of Rs2. And the result is saturated to the 32-bit unsigned integer range of [0, 2^32-1] and then + * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag. + * + * **Operations**:\n + * ~~~ + * tmp = Rs1.W[0] + Rs2.W[0]; + * if (tmp > (2^32)-1) { + * tmp[31:0] = (2^32)-1; + * OV = 1; + * } + * Rd = tmp[31:0]; // RV32 + * Rd = SE(tmp[31:0]); // RV64 + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKADDW(unsigned int a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("ukaddw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.153. UKADDW ===== */ + +/* ===== Inline Function Start for 3.154. UKCRAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief UKCRAS16 (SIMD 16-bit Unsigned Saturating Cross Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UKCRAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do one 16-bit unsigned integer element saturating addition and one 16-bit unsigned + * integer element saturating subtraction in a 32-bit chunk simultaneously. Operands are from crossed + * positions in 32-bit chunks. + * + * **Description**:\n + * This instruction adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in + * Rs1 with the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2; at the same time, it + * subtracts the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit + * unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the 16-bit + * unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. + * The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit + * chunks in Rd for subtraction. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0]; + * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16]; + * if (res1 > (2^16)-1) { + * res1 = (2^16)-1; + * OV = 1; + * } + * if (res2 < 0) { + * res2 = 0; + * OV = 1; + * } + * Rd.W[x][31:16] = res1; + * Rd.W[x][15:0] = res2; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKCRAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukcras16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.154. UKCRAS16 ===== */ + +/* ===== Inline Function Start for 3.155. UKCRSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief UKCRSA16 (SIMD 16-bit Unsigned Saturating Cross Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UKCRSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do one 16-bit unsigned integer element saturating subtraction and one 16-bit unsigned + * integer element saturating addition in a 32-bit chunk simultaneously. Operands are from crossed + * positions in 32-bit chunks. + * + * **Description**:\n + * This instruction subtracts the 16-bit unsigned integer element in [15:0] of 32-bit + * chunks in Rs2 from the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs1; at the + * same time, it adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2 with the 16- + * bit unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the + * 16-bit unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set + * to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of + * 32-bit chunks in Rd for addition. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0]; + * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16]; + * if (res1 < 0) { + * res1 = 0; + * OV = 1; + * } else if (res2 > (2^16)-1) { + * res2 = (2^16)-1; + * OV = 1; + * } + * Rd.W[x][31:16] = res1; + * Rd.W[x][15:0] = res2; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKCRSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukcrsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.155. UKCRSA16 ===== */ + +/* ===== Inline Function Start for 3.156. UKMAR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB + * \brief UKMAR64 (Unsigned Multiply and Saturating Add to 64-Bit Data) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * UKMAR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit unsigned elements in two registers and add the 64-bit multiplication + * results to the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). The result is + * saturated to the U64 range and written back to the pair of registers (RV32) or the register (RV64). + * + * **RV32 Description**:\n + * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It + * adds the 64-bit multiplication result to the 64-bit unsigned data of an even/odd pair of registers + * specified by Rd(4,1) with unlimited precision. If the 64-bit addition result is beyond the U64 number + * range (0 <= U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The saturated result is + * written back to the even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2. + * It adds the 64-bit multiplication results to the 64-bit unsigned data in Rd with unlimited precision. If + * the 64-bit addition result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to the + * range and the OV bit is set to 1. The saturated result is written back to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * result = R[t_H].R[t_L] + (Rs1 * Rs2); + * if (result > (2^64)-1) { + * result = (2^64)-1; OV = 1; + * } + * R[t_H].R[t_L] = result; + * * RV64: + * // `result` has unlimited precision + * result = Rd + (Rs1.W[0] u* Rs2.W[0]) + (Rs1.W[1] u* Rs2.W[1]); + * if (result > (2^64)-1) { + * result = (2^64)-1; OV = 1; + * } + * Rd = result; + * ~~~ + * + * \param [in] t unsigned long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_UKMAR64(unsigned long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("ukmar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.156. UKMAR64 ===== */ + +/* ===== Inline Function Start for 3.157. UKMSR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB + * \brief UKMSR64 (Unsigned Multiply and Saturating Subtract from 64-Bit Data) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * UKMSR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit unsigned elements in two registers and subtract the 64-bit + * multiplication results from the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). + * The result is saturated to the U64 range and written back to the pair of registers (RV32) or a register + * (RV64). + * + * **RV32 Description**:\n + * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It + * subtracts the 64-bit multiplication result from the 64-bit unsigned data of an even/odd pair of + * registers specified by Rd(4,1) with unlimited precision. If the 64-bit subtraction result is beyond the + * U64 number range (0 <= U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The + * saturated result is written back to the even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2. + * It subtracts the 64-bit multiplication results from the 64-bit unsigned data of Rd with unlimited + * precision. If the 64-bit subtraction result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is + * saturated to the range and the OV bit is set to 1. The saturated result is written back to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * result = R[t_H].R[t_L] - (Rs1 u* Rs2); + * if (result < 0) { + * result = 0; OV = 1; + * } + * R[t_H].R[t_L] = result; + * * RV64: + * // `result` has unlimited precision + * result = Rd - (Rs1.W[0] u* Rs2.W[0]) - (Rs1.W[1] u* Rs2.W[1]); + * if (result < 0) { + * result = 0; OV = 1; + * } + * Rd = result; + * ~~~ + * + * \param [in] t unsigned long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_UKMSR64(unsigned long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("ukmsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.157. UKMSR64 ===== */ + +/* ===== Inline Function Start for 3.158. UKSTAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief UKSTAS16 (SIMD 16-bit Unsigned Saturating Straight Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UKSTAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do one 16-bit unsigned integer element saturating addition and one 16-bit unsigned + * integer element saturating subtraction in a 32-bit chunk simultaneously. Operands are from + * corresponding positions in 32-bit chunks. + * + * **Description**:\n + * This instruction adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in + * Rs1 with the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2; at the same time, it + * subtracts the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit + * unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the 16-bit + * unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. + * The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit + * chunks in Rd for subtraction. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16]; + * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0]; + * if (res1 > (2^16)-1) { + * res1 = (2^16)-1; + * OV = 1; + * } + * if (res2 < 0) { + * res2 = 0; + * OV = 1; + * } + * Rd.W[x][31:16] = res1; + * Rd.W[x][15:0] = res2; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSTAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukstas16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.158. UKSTAS16 ===== */ + +/* ===== Inline Function Start for 3.159. UKSTSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief UKSTSA16 (SIMD 16-bit Unsigned Saturating Straight Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UKSTSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do one 16-bit unsigned integer element saturating subtraction and one 16-bit unsigned + * integer element saturating addition in a 32-bit chunk simultaneously. Operands are from + * corresponding positions in 32-bit chunks. + * + * **Description**:\n + * This instruction subtracts the 16-bit unsigned integer element in [31:16] of 32-bit + * chunks in Rs2 from the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs1; at the + * same time, it adds the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2 with the 16- + * bit unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the + * 16-bit unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set + * to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of + * 32-bit chunks in Rd for addition. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16]; + * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0]; + * if (res1 < 0) { + * res1 = 0; + * OV = 1; + * } else if (res2 > (2^16)-1) { + * res2 = (2^16)-1; + * OV = 1; + * } + * Rd.W[x][31:16] = res1; + * Rd.W[x][15:0] = res2; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSTSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukstsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.159. UKSTSA16 ===== */ + +/* ===== Inline Function Start for 3.160. UKSUB8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB + * \brief UKSUB8 (SIMD 8-bit Unsigned Saturating Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UKSUB8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit unsigned integer elements saturating subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 8-bit unsigned integer elements in Rs2 from the 8-bit + * unsigned integer elements in Rs1. If any of the results are beyond the 8-bit unsigned number range + * (0 <= RES <= 28-1), they are saturated to the range and the OV bit is set to 1. The saturated results are + * written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.B[x] - Rs2.B[x]; + * if (res[x] < 0) { + * res[x] = 0; + * OV = 1; + * } + * Rd.B[x] = res[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSUB8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("uksub8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.160. UKSUB8 ===== */ + +/* ===== Inline Function Start for 3.161. UKSUB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief UKSUB16 (SIMD 16-bit Unsigned Saturating Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UKSUB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer elements saturating subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 16-bit unsigned integer elements in Rs2 from the 16-bit + * unsigned integer elements in Rs1. If any of the results are beyond the 16-bit unsigned number + * range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.H[x] - Rs2.H[x]; + * if (res[x] < 0) { + * res[x] = 0; + * OV = 1; + * } + * Rd.H[x] = res[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSUB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("uksub16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.161. UKSUB16 ===== */ + +/* ===== Inline Function Start for 3.162. UKSUB64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief UKSUB64 (64-bit Unsigned Saturating Subtraction) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * UKSUB64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Perform a 64-bit signed integer subtraction. The result is saturated to the U64 range. + * + * **RV32 Description**:\n + * This instruction subtracts the 64-bit unsigned integer of an even/odd pair of + * registers specified by Rs2(4,1) from the 64-bit unsigned integer of an even/odd pair of registers + * specified by Rs1(4,1). If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is + * saturated to the range and the OV bit is set to 1. The saturated result is then written to an even/odd + * pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d` + * register of the pair contains the low 32-bit of the operand. + * + * **RV64 Description**:\n + * This instruction subtracts the 64-bit unsigned integer of Rs2 from the 64-bit + * unsigned integer of an even/odd pair of Rs1. If the 64-bit result is beyond the U64 number range (0 <= + * U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The saturated result is then written + * to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1); + * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1); + * result = R[a_H].R[a_L] - R[b_H].R[b_L]; + * if (result < 0) { + * result = 0; OV = 1; + * } + * R[t_H].R[t_L] = result; + * * RV64 + * result = Rs1 - Rs2; + * if (result < 0) { + * result = 0; OV = 1; + * } + * Rd = result; + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_UKSUB64(unsigned long long a, unsigned long long b) +{ + register unsigned long long result; + __ASM volatile("uksub64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.162. UKSUB64 ===== */ + +/* ===== Inline Function Start for 3.163. UKSUBH ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU + * \brief UKSUBH (Unsigned Subtraction with U16 Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * UKSUBH Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Subtract the unsigned lower 32-bit content of two registers with U16 saturation. + * + * **Description**:\n + * The unsigned lower 32-bit content of Rs2 is subtracted from the unsigned lower 32-bit + * content of Rs1. And the result is saturated to the 16-bit unsigned integer range of [0, 2^16-1] and then + * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag. + * + * **Operations**:\n + * ~~~ + * tmp = Rs1.W[0] - Rs2.W[0]; + * if (tmp > (2^16)-1) { + * tmp = (2^16)-1; + * OV = 1; + * } + * else if (tmp < 0) { + * tmp = 0; + * OV = 1; + * } + * Rd = SE(tmp[15:0]); + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSUBH(unsigned int a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("uksubh %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.163. UKSUBH ===== */ + +/* ===== Inline Function Start for 3.164. UKSUBW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU + * \brief UKSUBW (Unsigned Subtraction with U32 Saturation) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * UKSUBW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Subtract the unsigned lower 32-bit content of two registers with unsigned 32-bit + * saturation. + * + * **Description**:\n + * The unsigned lower 32-bit content of Rs2 is subtracted from the unsigned lower 32-bit + * content of Rs1. And the result is saturated to the 32-bit unsigned integer range of [0, 2^32-1] and then + * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag. + * + * **Operations**:\n + * ~~~ + * tmp = Rs1.W[0] - Rs2.W[0]; + * if (tmp < 0) { + * tmp[31:0] = 0; + * OV = 1; + * } + * Rd = tmp[31:0]; // RV32 + * Rd = SE(tmp[31:0]); // RV64 + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSUBW(unsigned int a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("uksubw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.164. UKSUBW ===== */ + +/* ===== Inline Function Start for 3.165. UMAR64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB + * \brief UMAR64 (Unsigned Multiply and Add to 64-Bit Data) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * UMAR64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the 32-bit unsigned elements in two registers and add the 64-bit multiplication + * results to the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). The result is + * written back to the pair of registers (RV32) or a register (RV64). + * + * **RV32 Description**:\n + * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It + * adds the 64-bit multiplication result to the 64-bit unsigned data of an even/odd pair of registers + * specified by Rd(4,1). The addition result is written back to the even/odd pair of registers specified by + * Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2. + * It adds the 64-bit multiplication results to the 64-bit unsigned data of Rd. The addition result is + * written back to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1); + * R[t_H].R[t_L] = R[t_H].R[t_L] + (Rs1 * Rs2); + * * RV64: + * Rd = Rd + (Rs1.W[0] u* Rs2.W[0]) + (Rs1.W[1] u* Rs2.W[1]); + * ~~~ + * + * \param [in] t unsigned long long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_UMAR64(unsigned long long t, unsigned long a, unsigned long b) +{ + __ASM volatile("umar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.165. UMAR64 ===== */ + +/* ===== Inline Function Start for 3.166. UMAQA ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD + * \brief UMAQA (Unsigned Multiply Four Bytes with 32- bit Adds) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * UMAQA Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do four unsigned 8-bit multiplications from 32-bit chunks of two registers; and then adds + * the four 16-bit results and the content of corresponding 32-bit chunks of a third register together. + * + * **Description**:\n + * This instruction multiplies the four unsigned 8-bit elements of 32-bit chunks of Rs1 with the four + * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the + * unsigned content of the corresponding 32-bit chunks of Rd. The final results are written back to the + * corresponding 32-bit chunks in Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rd.W[x] + (Rs1.W[x].B[3] u* Rs2.W[x].B[3]) + + * (Rs1.W[x].B[2] u* Rs2.W[x].B[2]) + (Rs1.W[x].B[1] u* Rs2.W[x].B[1]) + + * (Rs1.W[x].B[0] u* Rs2.W[x].B[0]); + * Rd.W[x] = res[x]; + * for RV32: x=0, + * for RV64: x=1...0 + * ~~~ + * + * \param [in] t unsigned long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UMAQA(unsigned long t, unsigned long a, unsigned long b) +{ + __ASM volatile("umaqa %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 3.166. UMAQA ===== */ + +/* ===== Inline Function Start for 3.167. UMAX8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief UMAX8 (SIMD 8-bit Unsigned Maximum) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UMAX8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit unsigned integer elements finding maximum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit unsigned integer elements in Rs1 with the four 8- + * bit unsigned integer elements in Rs2 and selects the numbers that is greater than the other one. The + * two selected results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] >u Rs2.B[x])? Rs1.B[x] : Rs2.B[x]; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UMAX8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("umax8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.167. UMAX8 ===== */ + +/* ===== Inline Function Start for 3.168. UMAX16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC + * \brief UMAX16 (SIMD 16-bit Unsigned Maximum) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UMAX16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer elements finding maximum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit + * unsigned integer elements in Rs2 and selects the numbers that is greater than the other one. The + * selected results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] >u Rs2.H[x])? Rs1.H[x] : Rs2.H[x]; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UMAX16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("umax16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.168. UMAX16 ===== */ + +/* ===== Inline Function Start for 3.169. UMIN8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC + * \brief UMIN8 (SIMD 8-bit Unsigned Minimum) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * UMIN8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit unsigned integer elements finding minimum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit + * unsigned integer elements in Rs2 and selects the numbers that is less than the other one. The + * selected results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] > 1; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URADD8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("uradd8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.174. URADD8 ===== */ + +/* ===== Inline Function Start for 3.175. URADD16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief URADD16 (SIMD 16-bit Unsigned Halving Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * URADD16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer element additions simultaneously. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit + * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Ra = 0x7FFF, Rb = 0x7FFF Rt = 0x7FFF + * * Ra = 0x8000, Rb = 0x8000 Rt = 0x8000 + * * Ra = 0x4000, Rb = 0x8000 Rt = 0x6000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] + Rs2.H[x]) u>> 1; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URADD16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("uradd16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.175. URADD16 ===== */ + +/* ===== Inline Function Start for 3.176. URADD64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief URADD64 (64-bit Unsigned Halving Addition) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * URADD64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add two 64-bit unsigned integers. The result is halved to avoid overflow or saturation. + * + * **RV32 Description**:\n + * This instruction adds the 64-bit unsigned integer of an even/odd pair of registers + * specified by Rs1(4,1) with the 64-bit unsigned integer of an even/odd pair of registers specified by + * Rs2(4,1). The 64-bit addition result is first logically right-shifted by 1 bit and then written to an + * even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction adds the 64-bit unsigned integer in Rs1 with the 64-bit unsigned + * integer Rs2. The 64-bit addition result is first logically right-shifted by 1 bit and then written to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rt(4,1),1'b0); t_H = CONCAT(Rt(4,1),1'b1); + * a_L = CONCAT(Ra(4,1),1'b0); a_H = CONCAT(Ra(4,1),1'b1); + * b_L = CONCAT(Rb(4,1),1'b0); b_H = CONCAT(Rb(4,1),1'b1); + * R[t_H].R[t_L] = (R[a_H].R[a_L] + R[b_H].R[b_L]) u>> 1; + * * RV64: + * Rd = (Rs1 + Rs2) u>> 1; + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_URADD64(unsigned long long a, unsigned long long b) +{ + register unsigned long long result; + __ASM volatile("uradd64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.176. URADD64 ===== */ + +/* ===== Inline Function Start for 3.177. URADDW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION + * \brief URADDW (32-bit Unsigned Halving Addition) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * URADDW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Add 32-bit unsigned integers and the results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the first 32-bit unsigned integer in Rs1 with the first 32-bit + * unsigned integer in Rs2. The result is first logically right-shifted by 1 bit and then sign-extended and + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Ra = 0x7FFFFFFF, Rb = 0x7FFFFFFF Rt = 0x7FFFFFFF + * * Ra = 0x80000000, Rb = 0x80000000 Rt = 0x80000000 + * * Ra = 0x40000000, Rb = 0x80000000 Rt = 0x60000000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * * RV32: + * Rd[31:0] = (Rs1[31:0] + Rs2[31:0]) u>> 1; + * * RV64: + * resw[31:0] = (Rs1[31:0] + Rs2[31:0]) u>> 1; + * Rd[63:0] = SE(resw[31:0]); + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URADDW(unsigned int a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("uraddw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.177. URADDW ===== */ + +/* ===== Inline Function Start for 3.178. URCRAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief URCRAS16 (SIMD 16-bit Unsigned Halving Cross Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * URCRAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer element addition and 16-bit unsigned integer element + * subtraction in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. + * The results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1 + * with the 16-bit unsigned integer in [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit unsigned + * integer in [31:16] of 32-bit chunks in Rs2 from the 16-bit unsigned integer in [15:0] of 32-bit chunks + * in Rs1. The element results are first logically right-shifted by 1 bit and then written to [31:16] of 32- + * bit chunks in Rd and [15:0] of 32-bit chunks in Rd. + * + * **Examples**:\n + * ~~~ + * Please see `URADD16` and `URSUB16` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) u>> 1; + * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) u>> 1; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URCRAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("urcras16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.178. URCRAS16 ===== */ + +/* ===== Inline Function Start for 3.179. URCRSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief URCRSA16 (SIMD 16-bit Unsigned Halving Cross Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * URCRSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer element subtraction and 16-bit unsigned integer element + * addition in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. + * The results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 16-bit unsigned integer in [15:0] of 32-bit chunks in Rs2 + * from the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit unsigned + * integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit unsigned integer in [31:16] of 32-bit chunks + * in Rs2. The two results are first logically right-shifted by 1 bit and then written to [31:16] of 32-bit + * chunks in Rd and [15:0] of 32-bit chunks in Rd. + * + * **Examples**:\n + * ~~~ + * Please see `URADD16` and `URSUB16` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) u>> 1; + * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) u>> 1; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URCRSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("urcrsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.179. URCRSA16 ===== */ + +/* ===== Inline Function Start for 3.180. URSTAS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief URSTAS16 (SIMD 16-bit Unsigned Halving Straight Addition & Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * URSTAS16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer element addition and 16-bit unsigned integer element + * subtraction in a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit + * chunks. The results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1 + * with the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs2, and subtracts the 16-bit unsigned + * integer in [15:0] of 32-bit chunks in Rs2 from the 16-bit unsigned integer in [15:0] of 32-bit chunks + * in Rs1. The element results are first logically right-shifted by 1 bit and then written to [31:16] of 32- + * bit chunks in Rd and [15:0] of 32-bit chunks in Rd. + * + * **Examples**:\n + * ~~~ + * Please see `URADD16` and `URSUB16` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][31:16]) u>> 1; + * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][15:0]) u>> 1; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URSTAS16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("urstas16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.180. URSTAS16 ===== */ + +/* ===== Inline Function Start for 3.181. URSTSA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief URSTSA16 (SIMD 16-bit Unsigned Halving Straight Subtraction & Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * URCRSA16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer element subtraction and 16-bit unsigned integer element + * addition in a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit + * chunks. The results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs2 + * from the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit unsigned + * integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit unsigned integer in [15:0] of 32-bit chunks in + * Rs2. The two results are first logically right-shifted by 1 bit and then written to [31:16] of 32-bit + * chunks in Rd and [15:0] of 32-bit chunks in Rd. + * + * **Examples**:\n + * ~~~ + * Please see `URADD16` and `URSUB16` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][31:16]) u>> 1; + * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][15:0]) u>> 1; + * for RV32, x=0 + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URSTSA16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("urstsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.181. URSTSA16 ===== */ + +/* ===== Inline Function Start for 3.182. URSUB8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB + * \brief URSUB8 (SIMD 8-bit Unsigned Halving Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * URSUB8 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 8-bit unsigned integer element subtractions simultaneously. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 8-bit unsigned integer elements in Rs2 from the 8-bit + * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Ra = 0x7F, Rb = 0x80 Rt = 0xFF + * * Ra = 0x80, Rb = 0x7F Rt = 0x00 + * * Ra = 0x80, Rb = 0x40 Rt = 0x20 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.B[x] = (Rs1.B[x] - Rs2.B[x]) u>> 1; + * for RV32: x=3...0, + * for RV64: x=7...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URSUB8(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ursub8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.182. URSUB8 ===== */ + +/* ===== Inline Function Start for 3.183. URSUB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB + * \brief URSUB16 (SIMD 16-bit Unsigned Halving Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * URSUB16 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 16-bit unsigned integer element subtractions simultaneously. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 16-bit unsigned integer elements in Rs2 from the 16-bit + * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Ra = 0x7FFF, Rb = 0x8000 Rt = 0xFFFF + * * Ra = 0x8000, Rb = 0x7FFF Rt = 0x0000 + * * Ra = 0x8000, Rb = 0x4000 Rt = 0x2000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) u>> 1; + * for RV32: x=1...0, + * for RV64: x=3...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URSUB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ursub16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.183. URSUB16 ===== */ + +/* ===== Inline Function Start for 3.184. URSUB64 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB + * \brief URSUB64 (64-bit Unsigned Halving Subtraction) + * \details + * **Type**: DSP (64-bit Profile) + * + * **Syntax**:\n + * ~~~ + * URSUB64 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Perform a 64-bit unsigned integer subtraction. The result is halved to avoid overflow or + * saturation. + * + * **RV32 Description**:\n + * This instruction subtracts the 64-bit unsigned integer of an even/odd pair of + * registers specified by Rs2(4,1) from the 64-bit unsigned integer of an even/odd pair of registers + * specified by Rs1(4,1). The subtraction result is first logically right-shifted by 1 bit and then written + * to an even/odd pair of registers specified by Rd(4,1). + * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair + * includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register + * of the pair contains the low 32-bit of the result. + * + * **RV64 Description**:\n + * This instruction subtracts the 64-bit unsigned integer in Rs2 from the 64-bit + * unsigned integer in Rs1. The subtraction result is first logically right-shifted by 1 bit and then + * written to Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * t_L = CONCAT(Rt(4,1),1'b0); t_H = CONCAT(Rt(4,1),1'b1); + * a_L = CONCAT(Ra(4,1),1'b0); a_H = CONCAT(Ra(4,1),1'b1); + * b_L = CONCAT(Rb(4,1),1'b0); b_H = CONCAT(Rb(4,1),1'b1); + * R[t_H].R[t_L] = (R[a_H].R[a_L] - R[b_H].R[b_L]) u>> 1; + * * RV64: + * Rd = (Rs1 - Rs2) u>> 1; + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_URSUB64(unsigned long long a, unsigned long long b) +{ + register unsigned long long result; + __ASM volatile("ursub64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.184. URSUB64 ===== */ + +/* ===== Inline Function Start for 3.185. URSUBW ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION + * \brief URSUBW (32-bit Unsigned Halving Subtraction) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * URSUBW Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Subtract 32-bit unsigned integers and the result is halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the first 32-bit signed integer in Rs2 from the first 32-bit + * signed integer in Rs1. The result is first logically right-shifted by 1 bit and then sign-extended and + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Ra = 0x7FFFFFFF, Rb = 0x80000000 Rt = 0xFFFFFFFF + * * Ra = 0x80000000, Rb = 0x7FFFFFFF Rt = 0x00000000 + * * Ra = 0x80000000, Rb = 0x40000000 Rt = 0x20000000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * * RV32: + * Rd[31:0] = (Rs1[31:0] - Rs2[31:0]) u>> 1; + * * RV64: + * resw[31:0] = (Rs1[31:0] - Rs2[31:0]) u>> 1; + * Rd[63:0] = SE(resw[31:0]); + * ~~~ + * + * \param [in] a unsigned int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URSUBW(unsigned int a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("ursubw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.185. URSUBW ===== */ + +/* ===== Inline Function Start for 3.186. WEXTI ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief WEXTI (Extract Word from 64-bit Immediate) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * WEXTI Rd, Rs1, #LSBloc + * ~~~ + * + * **Purpose**:\n + * Extract a 32-bit word from a 64-bit value stored in an even/odd pair of registers (RV32) or + * a register (RV64) starting from a specified immediate LSB bit position. + * + * **RV32 Description**:\n + * This instruction extracts a 32-bit word from a 64-bit value of an even/odd pair of registers specified + * by Rs1(4,1) starting from a specified immediate LSB bit position, #LSBloc. The extracted word is + * written to Rd. + * Rs1(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register + * pair includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the 64-bit value and the even `2d` + * register of the pair contains the low 32-bit of the 64-bit value. + * + * **RV64 Description**:\n + * This instruction extracts a 32-bit word from a 64-bit value in Rs1 starting from a specified + * immediate LSB bit position, #LSBloc. The extracted word is sign-extended and written to lower 32- + * bit of Rd. + * + * **Operations**:\n + * ~~~ + * * RV32: + * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs2(4,1),1'b1); + * src[63:0] = Concat(R[Idx1], R[Idx0]); + * Rd = src[31+LSBloc:LSBloc]; + * * RV64: + * ExtractW = Rs1[31+LSBloc:LSBloc]; + * Rd = SE(ExtractW) + * ~~~ + * + * \param [in] a long long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +#define __RV_WEXTI(a, b) \ + ({ \ + register unsigned long result; \ + register long long __a = (long long)(a); \ + __ASM volatile("wexti %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); \ + result; \ + }) +/* ===== Inline Function End for 3.186. WEXTI ===== */ + +/* ===== Inline Function Start for 3.187. WEXT ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC + * \brief WEXT (Extract Word from 64-bit) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * WEXT Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Extract a 32-bit word from a 64-bit value stored in an even/odd pair of registers (RV32) or + * a register (RV64) starting from a specified LSB bit position in a register. + * + * **RV32 Description**:\n + * This instruction extracts a 32-bit word from a 64-bit value of an even/odd pair of registers specified + * by Rs1(4,1) starting from a specified LSB bit position, specified in Rs2[4:0]. The extracted word is + * written to Rd. + * Rs1(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register + * pair includes register 2d and 2d+1. + * The odd `2d+1` register of the pair contains the high 32-bit of the 64-bit value and the even `2d` + * register of the pair contains the low 32-bit of the 64-bit value. + * + * **Operations**:\n + * ~~~ + * * RV32: + * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1); + * src[63:0] = Concat(R[Idx1], R[Idx0]); + * LSBloc = Rs2[4:0]; + * Rd = src[31+LSBloc:LSBloc]; + * * RV64: + * LSBloc = Rs2[4:0]; + * ExtractW = Rs1[31+LSBloc:LSBloc]; + * Rd = SE(ExtractW) + * ~~~ + * + * \param [in] a long long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_WEXT(long long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("wext %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 3.187. WEXT ===== */ + +/* ===== Inline Function Start for 3.188.1. ZUNPKD810 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief ZUNPKD810 (Unsigned Unpacking Bytes 1 & 0) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * ZUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned + * halfwords of 32-bit chunks in a register. + * + * **Description**:\n + * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y]) + * // ZUNPKD810, x=1,y=0 + * // ZUNPKD820, x=2,y=0 + * // ZUNPKD830, x=3,y=0 + * // ZUNPKD831, x=3,y=1 + * // ZUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD810(unsigned long a) +{ + register unsigned long result; + __ASM volatile("zunpkd810 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.188.1. ZUNPKD810 ===== */ + +/* ===== Inline Function Start for 3.188.2. ZUNPKD820 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief ZUNPKD820 (Unsigned Unpacking Bytes 2 & 0) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * ZUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned + * halfwords of 32-bit chunks in a register. + * + * **Description**:\n + * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y]) + * // ZUNPKD810, x=1,y=0 + * // ZUNPKD820, x=2,y=0 + * // ZUNPKD830, x=3,y=0 + * // ZUNPKD831, x=3,y=1 + * // ZUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD820(unsigned long a) +{ + register unsigned long result; + __ASM volatile("zunpkd820 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.188.2. ZUNPKD820 ===== */ + +/* ===== Inline Function Start for 3.188.3. ZUNPKD830 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief ZUNPKD830 (Unsigned Unpacking Bytes 3 & 0) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * ZUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned + * halfwords of 32-bit chunks in a register. + * + * **Description**:\n + * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y]) + * // ZUNPKD810, x=1,y=0 + * // ZUNPKD820, x=2,y=0 + * // ZUNPKD830, x=3,y=0 + * // ZUNPKD831, x=3,y=1 + * // ZUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD830(unsigned long a) +{ + register unsigned long result; + __ASM volatile("zunpkd830 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.188.3. ZUNPKD830 ===== */ + +/* ===== Inline Function Start for 3.188.4. ZUNPKD831 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief ZUNPKD831 (Unsigned Unpacking Bytes 3 & 1) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * ZUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned + * halfwords of 32-bit chunks in a register. + * + * **Description**:\n + * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y]) + * // ZUNPKD810, x=1,y=0 + * // ZUNPKD820, x=2,y=0 + * // ZUNPKD830, x=3,y=0 + * // ZUNPKD831, x=3,y=1 + * // ZUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD831(unsigned long a) +{ + register unsigned long result; + __ASM volatile("zunpkd831 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.188.4. ZUNPKD831 ===== */ + +/* ===== Inline Function Start for 3.188.5. ZUNPKD832 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK + * \brief ZUNPKD832 (Unsigned Unpacking Bytes 3 & 2) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * ZUNPKD8xy Rd, Rs1 + * xy = {10, 20, 30, 31, 32} + * ~~~ + * + * **Purpose**:\n + * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned + * halfwords of 32-bit chunks in a register. + * + * **Description**:\n + * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into + * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit + * chunks in Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x]) + * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y]) + * // ZUNPKD810, x=1,y=0 + * // ZUNPKD820, x=2,y=0 + * // ZUNPKD830, x=3,y=0 + * // ZUNPKD831, x=3,y=1 + * // ZUNPKD832, x=3,y=2 + * for RV32: m=0, + * for RV64: m=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD832(unsigned long a) +{ + register unsigned long result; + __ASM volatile("zunpkd832 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 3.188.5. ZUNPKD832 ===== */ + +#if (__RISCV_XLEN == 64) || defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__) + +/* ===== Inline Function Start for 4.1. ADD32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief ADD32 (SIMD 32-bit Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * ADD32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit integer element additions simultaneously. + * + * **Description**:\n + * This instruction adds the 32-bit integer elements in Rs1 with the 32-bit integer + * elements in Rs2, and then writes the 32-bit element results to Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned addition. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = Rs1.W[x] + Rs2.W[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_ADD32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("add32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.1. ADD32 ===== */ + +/* ===== Inline Function Start for 4.2. CRAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief CRAS32 (SIMD 32-bit Cross Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * CRAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit + * chunk simultaneously. Operands are from crossed 32-bit elements. + * + * **Description**:\n + * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit + * integer element in [31:0] of Rs2, and writes the result to [63:32] of Rd; at the same time, it subtracts + * the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer element in [31:0] of Rs1, and + * writes the result to [31:0] of Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned operations. + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = Rs1.W[1] + Rs2.W[0]; + * Rd.W[0] = Rs1.W[0] - Rs2.W[1]; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CRAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("cras32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.2. CRAS32 ===== */ + +/* ===== Inline Function Start for 4.3. CRSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief CRSA32 (SIMD 32-bit Cross Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * CRSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit + * chunk simultaneously. Operands are from crossed 32-bit elements. + * *Description: * + * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element + * in [63:32] of Rs1, and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit integer + * element in [31:0] of Rs1 with the 32-bit integer element in [63:32] of Rs2, and writes the result to + * [31:0] of Rd + * + * **Note**:\n + * This instruction can be used for either signed or unsigned operations. + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = Rs1.W[1] - Rs2.W[0]; + * Rd.W[0] = Rs1.W[0] + Rs2.W[1]; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_CRSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("crsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.3. CRSA32 ===== */ + +/* ===== Inline Function Start for 4.4. KABS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC + * \brief KABS32 (Scalar 32-bit Absolute Value with Saturation) + * \details + * **Type**: DSP (RV64 Only) +24 20 +19 15 +14 12 +11 7 +KABS32 +10010 +Rs1 +000 +Rd +6 0 +GE80B +1111111 + * + * **Syntax**:\n + * ~~~ + * KABS32 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Get the absolute value of signed 32-bit integer elements in a general register. + * + * **Description**:\n + * This instruction calculates the absolute value of signed 32-bit integer elements stored + * in Rs1. The results are written to Rd. This instruction with the minimum negative integer input of + * 0x80000000 will produce a saturated output of maximum positive integer of 0x7fffffff and the OV + * flag will be set to 1. + * + * **Operations**:\n + * ~~~ + * if (Rs1.W[x] >= 0) { + * res[x] = Rs1.W[x]; + * } else { + * If (Rs1.W[x] == 0x80000000) { + * res[x] = 0x7fffffff; + * OV = 1; + * } else { + * res[x] = -Rs1.W[x]; + * } + * } + * Rd.W[x] = res[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KABS32(unsigned long a) +{ + register unsigned long result; + __ASM volatile("kabs32 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for 4.4. KABS32 ===== */ + +/* ===== Inline Function Start for 4.5. KADD32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief KADD32 (SIMD 32-bit Signed Saturating Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KADD32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element saturating additions simultaneously. + * + * **Description**:\n + * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed + * integer elements in Rs2. If any of the results are beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), + * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.W[x] + Rs2.W[x]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KADD32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kadd32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.5. KADD32 ===== */ + +/* ===== Inline Function Start for 4.6. KCRAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief KCRAS32 (SIMD 32-bit Signed Saturating Cross Addition & Subtraction) + * \details + * **Type**: SIM (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KCRAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element saturating addition and 32-bit signed integer element + * saturating subtraction in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. + * + * **Description**:\n + * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit + * integer element in [31:0] of Rs2; at the same time, it subtracts the 32-bit integer element in [63:32] of + * Rs2 from the 32-bit integer element in [31:0] of Rs1. If any of the results are beyond the Q31 number + * range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction. + * + * **Operations**:\n + * ~~~ + * res[1] = Rs1.W[1] + Rs2.W[0]; + * res[0] = Rs1.W[0] - Rs2.W[1]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[1] = res[1]; + * Rd.W[0] = res[0]; + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KCRAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kcras32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.6. KCRAS32 ===== */ + +/* ===== Inline Function Start for 4.7. KCRSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief KCRSA32 (SIMD 32-bit Signed Saturating Cross Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KCRSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element + * saturating addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. + * *Description: * + * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element + * in [63:32] of Rs1; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with the 32-bit + * integer element in [63:32] of Rs2. If any of the results are beyond the Q31 number range (-2^31 <= Q31 + * <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to + * [63:32] of Rd for subtraction and [31:0] of Rd for addition. + * + * **Operations**:\n + * ~~~ + * res[1] = Rs1.W[1] - Rs2.W[0]; + * res[0] = Rs1.W[0] + Rs2.W[1]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[1] = res[1]; + * Rd.W[0] = res[0]; + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KCRSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kcrsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.7. KCRSA32 ===== */ + +/* ===== Inline Function Start for 4.8.1. KDMBB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KDMBB16 (SIMD Signed Saturating Double Multiply B16 x B16) + * \details + * **Type**: SIMD (RV64 only) + * + * **Syntax**:\n + * ~~~ + * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks + * in the destination register. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and + * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both + * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF + * and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1) + * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1) + * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1) + * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y]; + * If (0x8000 != aop[z] | 0x8000 != bop[z]) { + * Mresult[z] = aop[z] * bop[z]; + * resQ31[z] = Mresult[z] << 1; + * } else { + * resQ31[z] = 0x7FFFFFFF; + * OV = 1; + * } + * Rd.W[z] = resQ31[z]; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KDMBB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kdmbb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.8.1. KDMBB16 ===== */ + +/* ===== Inline Function Start for 4.8.2. KDMBT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KDMBT16 (SIMD Signed Saturating Double Multiply B16 x T16) + * \details + * **Type**: SIMD (RV64 only) + * + * **Syntax**:\n + * ~~~ + * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks + * in the destination register. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and + * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both + * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF + * and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1) + * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1) + * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1) + * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y]; + * If (0x8000 != aop[z] | 0x8000 != bop[z]) { + * Mresult[z] = aop[z] * bop[z]; + * resQ31[z] = Mresult[z] << 1; + * } else { + * resQ31[z] = 0x7FFFFFFF; + * OV = 1; + * } + * Rd.W[z] = resQ31[z]; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KDMBT16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kdmbt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.8.2. KDMBT16 ===== */ + +/* ===== Inline Function Start for 4.8.3. KDMTT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KDMTT16 (SIMD Signed Saturating Double Multiply T16 x T16) + * \details + * **Type**: SIMD (RV64 only) + * + * **Syntax**:\n + * ~~~ + * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks + * in the destination register. If saturation happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and + * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both + * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF + * and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1) + * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1) + * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1) + * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y]; + * If (0x8000 != aop[z] | 0x8000 != bop[z]) { + * Mresult[z] = aop[z] * bop[z]; + * resQ31[z] = Mresult[z] << 1; + * } else { + * resQ31[z] = 0x7FFFFFFF; + * OV = 1; + * } + * Rd.W[z] = resQ31[z]; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KDMTT16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kdmtt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.8.3. KDMTT16 ===== */ + +/* ===== Inline Function Start for 4.9.1. KDMABB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KDMABB16 (SIMD Signed Saturating Double Multiply Addition B16 x B16) + * \details + * **Type**: SIMD (RV64 only) + * + * **Syntax**:\n + * ~~~ + * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with + * the values of the corresponding 32-bit chunks from the destination register and write the saturated + * addition results back into the corresponding 32-bit chunks of the destination register. If saturation + * happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then + * doubled and saturated into Q31 values. The Q31 values are then added with the content of the + * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <= + * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation + * are written back to Rd. + * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be + * set. + * + * **Operations**:\n + * ~~~ + * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1) + * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1) + * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1) + * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y]; + * If (0x8000 != aop[z] | 0x8000 != bop[z]) { + * Mresult[z] = aop[z] * bop[z]; + * resQ31[z] = Mresult[z] << 1; + * } else { + * resQ31[z] = 0x7FFFFFFF; + * OV = 1; + * } + * resadd[z] = Rd.W[z] + resQ31[z]; + * if (resadd[z] > (2^31)-1) { + * resadd[z] = (2^31)-1; + * OV = 1; + * } else if (resadd[z] < -2^31) { + * resadd[z] = -2^31; + * OV = 1; + * } + * Rd.W[z] = resadd[z]; + * ~~~ + * + * \param [in] t unsigned long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KDMABB16(unsigned long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kdmabb16 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.9.1. KDMABB16 ===== */ + +/* ===== Inline Function Start for 4.9.2. KDMABT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KDMABT16 (SIMD Signed Saturating Double Multiply Addition B16 x T16) + * \details + * **Type**: SIMD (RV64 only) + * + * **Syntax**:\n + * ~~~ + * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with + * the values of the corresponding 32-bit chunks from the destination register and write the saturated + * addition results back into the corresponding 32-bit chunks of the destination register. If saturation + * happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then + * doubled and saturated into Q31 values. The Q31 values are then added with the content of the + * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <= + * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation + * are written back to Rd. + * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be + * set. + * + * **Operations**:\n + * ~~~ + * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1) + * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1) + * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1) + * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y]; + * If (0x8000 != aop[z] | 0x8000 != bop[z]) { + * Mresult[z] = aop[z] * bop[z]; + * resQ31[z] = Mresult[z] << 1; + * } else { + * resQ31[z] = 0x7FFFFFFF; + * OV = 1; + * } + * resadd[z] = Rd.W[z] + resQ31[z]; + * if (resadd[z] > (2^31)-1) { + * resadd[z] = (2^31)-1; + * OV = 1; + * } else if (resadd[z] < -2^31) { + * resadd[z] = -2^31; + * OV = 1; + * } + * Rd.W[z] = resadd[z]; + * ~~~ + * + * \param [in] t unsigned long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KDMABT16(unsigned long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kdmabt16 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.9.2. KDMABT16 ===== */ + +/* ===== Inline Function Start for 4.9.3. KDMATT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KDMATT16 (SIMD Signed Saturating Double Multiply Addition T16 x T16) + * \details + * **Type**: SIMD (RV64 only) + * + * **Syntax**:\n + * ~~~ + * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with + * the values of the corresponding 32-bit chunks from the destination register and write the saturated + * addition results back into the corresponding 32-bit chunks of the destination register. If saturation + * happens, an overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then + * doubled and saturated into Q31 values. The Q31 values are then added with the content of the + * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <= + * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation + * are written back to Rd. + * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be + * set. + * + * **Operations**:\n + * ~~~ + * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1) + * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1) + * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1) + * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y]; + * If (0x8000 != aop[z] | 0x8000 != bop[z]) { + * Mresult[z] = aop[z] * bop[z]; + * resQ31[z] = Mresult[z] << 1; + * } else { + * resQ31[z] = 0x7FFFFFFF; + * OV = 1; + * } + * resadd[z] = Rd.W[z] + resQ31[z]; + * if (resadd[z] > (2^31)-1) { + * resadd[z] = (2^31)-1; + * OV = 1; + * } else if (resadd[z] < -2^31) { + * resadd[z] = -2^31; + * OV = 1; + * } + * Rd.W[z] = resadd[z]; + * ~~~ + * + * \param [in] t unsigned long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KDMATT16(unsigned long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kdmatt16 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.9.3. KDMATT16 ===== */ + +/* ===== Inline Function Start for 4.10.1. KHMBB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KHMBB16 (SIMD Signed Saturating Half Multiply B16 x B16) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15 + * numbers again and saturate the Q15 results into the destination register. If saturation happens, an + * overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted 15- + * bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in + * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated + * to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1) + * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1) + * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1) + * aop = Rs1.H[x]; bop = Rs2.H[y]; + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult[31:0] = aop * bop; + * res[15:0] = Mresult[30:15]; + * } else { + * res[15:0] = 0x7FFF; + * OV = 1; + * } + * Rd.W[z] = SE32(res[15:0]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KHMBB16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("khmbb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.10.1. KHMBB16 ===== */ + +/* ===== Inline Function Start for 4.10.2. KHMBT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KHMBT16 (SIMD Signed Saturating Half Multiply B16 x T16) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15 + * numbers again and saturate the Q15 results into the destination register. If saturation happens, an + * overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted 15- + * bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in + * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated + * to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1) + * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1) + * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1) + * aop = Rs1.H[x]; bop = Rs2.H[y]; + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult[31:0] = aop * bop; + * res[15:0] = Mresult[30:15]; + * } else { + * res[15:0] = 0x7FFF; + * OV = 1; + * } + * Rd.W[z] = SE32(res[15:0]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KHMBT16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("khmbt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.10.2. KHMBT16 ===== */ + +/* ===== Inline Function Start for 4.10.3. KHMTT16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT + * \brief KHMTT16 (SIMD Signed Saturating Half Multiply T16 x T16) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT) + * ~~~ + * + * **Purpose**:\n + * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion + * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15 + * numbers again and saturate the Q15 results into the destination register. If saturation happens, an + * overflow flag OV will be set. + * + * **Description**:\n + * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top + * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted 15- + * bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in + * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated + * to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1) + * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1) + * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1) + * aop = Rs1.H[x]; bop = Rs2.H[y]; + * If (0x8000 != aop | 0x8000 != bop) { + * Mresult[31:0] = aop * bop; + * res[15:0] = Mresult[30:15]; + * } else { + * res[15:0] = 0x7FFF; + * OV = 1; + * } + * Rd.W[z] = SE32(res[15:0]); + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KHMTT16(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("khmtt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.10.3. KHMTT16 ===== */ + +/* ===== Inline Function Start for 4.11.1. KMABB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD + * \brief KMABB32 (Saturating Signed Multiply Bottom Words & Add) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMABB32 Rd, Rs1, Rs2 + * KMABT32 Rd, Rs1, Rs2 + * KMATT32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit element in a register with the 32-bit element in another register + * and add the result to the content of 64-bit data in the third register. The addition result may be + * saturated and is written to the third register. + * * KMABB32: rd + bottom*bottom + * * KMABT32: rd + bottom*top + * * KMATT32: rd + top*top + * + * **Description**:\n + * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit + * element in Rs2. + * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit + * element in Rs2. + * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit + * element in Rs2. + * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond + * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The + * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32 + * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32 + * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * *Exceptions:* None + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMABB32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmabb32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.11.1. KMABB32 ===== */ + +/* ===== Inline Function Start for 4.11.2. KMABT32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD + * \brief KMABT32 (Saturating Signed Multiply Bottom & Top Words & Add) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMABB32 Rd, Rs1, Rs2 + * KMABT32 Rd, Rs1, Rs2 + * KMATT32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit element in a register with the 32-bit element in another register + * and add the result to the content of 64-bit data in the third register. The addition result may be + * saturated and is written to the third register. + * * KMABB32: rd + bottom*bottom + * * KMABT32: rd + bottom*top + * * KMATT32: rd + top*top + * + * **Description**:\n + * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit + * element in Rs2. + * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit + * element in Rs2. + * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit + * element in Rs2. + * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond + * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The + * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32 + * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32 + * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * *Exceptions:* None + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMABT32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmabt32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.11.2. KMABT32 ===== */ + +/* ===== Inline Function Start for 4.11.3. KMATT32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD + * \brief KMATT32 (Saturating Signed Multiply Top Words & Add) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMABB32 Rd, Rs1, Rs2 + * KMABT32 Rd, Rs1, Rs2 + * KMATT32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit element in a register with the 32-bit element in another register + * and add the result to the content of 64-bit data in the third register. The addition result may be + * saturated and is written to the third register. + * * KMABB32: rd + bottom*bottom + * * KMABT32: rd + bottom*top + * * KMATT32: rd + top*top + * + * **Description**:\n + * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit + * element in Rs2. + * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit + * element in Rs2. + * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit + * element in Rs2. + * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond + * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The + * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32 + * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32 + * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * *Exceptions:* None + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMATT32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmatt32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.11.3. KMATT32 ===== */ + +/* ===== Inline Function Start for 4.12.1. KMADA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMADA32 (Saturating Signed Multiply Two Words and Two Adds) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMADA32 Rd, Rs1, Rs2 + * KMAXDA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from 32-bit data in two registers; and then adds the + * two 64-bit results and 64-bit data in a third register together. The addition result may be saturated. + * * KMADA32: rd + top*top + bottom*bottom + * * KMAXDA32: rd + top*bottom + bottom*top + * + * **Description**:\n + * For the `KMADA32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32- + * bit element in Rs2 and then adds the result to the result of multiplying the top 32-bit element in Rs1 + * with the top 32-bit element in Rs2. It is actually an alias of the `KMAR64` instruction. + * For the `KMAXDA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit + * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1 + * with the top 32-bit element in Rs2. + * The result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63 + * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The 64-bit + * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * res = Rd + (Rs1.W[1] * Rs2.w[1]) + (Rs1.W[0] * Rs2.W[0]); // KMADA32 + * res = Rd + (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMAXDA32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMADA32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmada32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.12.1. KMADA32 ===== */ + +/* ===== Inline Function Start for 4.12.2. KMAXDA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMAXDA32 (Saturating Signed Crossed Multiply Two Words and Two Adds) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMADA32 Rd, Rs1, Rs2 + * KMAXDA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from 32-bit data in two registers; and then adds the + * two 64-bit results and 64-bit data in a third register together. The addition result may be saturated. + * * KMADA32: rd + top*top + bottom*bottom + * * KMAXDA32: rd + top*bottom + bottom*top + * + * **Description**:\n + * For the `KMADA32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32- + * bit element in Rs2 and then adds the result to the result of multiplying the top 32-bit element in Rs1 + * with the top 32-bit element in Rs2. It is actually an alias of the `KMAR64` instruction. + * For the `KMAXDA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit + * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1 + * with the top 32-bit element in Rs2. + * The result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63 + * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The 64-bit + * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * res = Rd + (Rs1.W[1] * Rs2.w[1]) + (Rs1.W[0] * Rs2.W[0]); // KMADA32 + * res = Rd + (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMAXDA32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMAXDA32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmaxda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.12.2. KMAXDA32 ===== */ + +/* ===== Inline Function Start for 4.13.1. KMDA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMDA32 (Signed Multiply Two Words and Add) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMDA32 Rd, Rs1, Rs2 + * KMXDA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then + * adds the two 64-bit results together. The addition result may be saturated. + * * KMDA32: top*top + bottom*bottom + * * KMXDA32: top*bottom + bottom*top + * + * **Description**:\n + * For the `KMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1 + * with the top 32-bit element of Rs2. + * For the `KMXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1 + * with the bottom 32-bit element of Rs2. + * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^63-1. + * The final result is written to Rd. The 32-bit contents are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * if ((Rs1 != 0x8000000080000000) or (Rs2 != 0x8000000080000000)) { + * Rd = (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMDA32 + * Rd = (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMXDA32 + * } else { + * Rd = 0x7fffffffffffffff; + * OV = 1; + * } + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMDA32(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("kmda32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.13.1. KMDA32 ===== */ + +/* ===== Inline Function Start for 4.13.2. KMXDA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMXDA32 (Signed Crossed Multiply Two Words and Add) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMDA32 Rd, Rs1, Rs2 + * KMXDA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then + * adds the two 64-bit results together. The addition result may be saturated. + * * KMDA32: top*top + bottom*bottom + * * KMXDA32: top*bottom + bottom*top + * + * **Description**:\n + * For the `KMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1 + * with the top 32-bit element of Rs2. + * For the `KMXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1 + * with the bottom 32-bit element of Rs2. + * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^63-1. + * The final result is written to Rd. The 32-bit contents are treated as signed integers. + * + * **Operations**:\n + * ~~~ + * if ((Rs1 != 0x8000000080000000) or (Rs2 != 0x8000000080000000)) { + * Rd = (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMDA32 + * Rd = (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMXDA32 + * } else { + * Rd = 0x7fffffffffffffff; + * OV = 1; + * } + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMXDA32(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("kmxda32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.13.2. KMXDA32 ===== */ + +/* ===== Inline Function Start for 4.14.1. KMADS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMADS32 (Saturating Signed Multiply Two Words & Subtract & Add) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMADS32 Rd, Rs1, Rs2 + * KMADRS32 Rd, Rs1, Rs2 + * KMAXDS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then + * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to + * 64-bit data in a third register. The addition result may be saturated. + * * KMADS32: rd + (top*top - bottom*bottom) + * * KMADRS32: rd + (bottom*bottom - top*top) + * * KMAXDS32: rd + (top*bottom - bottom*top) + * + * **Description**:\n + * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in + * Rs1 with the top 32-bit element in Rs2. + * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit + * element in Rs1 with the bottom 32-bit element in Rs2. + * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in + * Rs1 with the bottom 32-bit element in Rs2. + * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is + * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to + * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated + * as signed integers. + * + * **Operations**:\n + * ~~~ + * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32 + * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32 + * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMADS32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmads32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.14.1. KMADS32 ===== */ + +/* ===== Inline Function Start for 4.14.2. KMADRS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMADRS32 (Saturating Signed Multiply Two Words & Reverse Subtract & Add) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMADS32 Rd, Rs1, Rs2 + * KMADRS32 Rd, Rs1, Rs2 + * KMAXDS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then + * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to + * 64-bit data in a third register. The addition result may be saturated. + * * KMADS32: rd + (top*top - bottom*bottom) + * * KMADRS32: rd + (bottom*bottom - top*top) + * * KMAXDS32: rd + (top*bottom - bottom*top) + * + * **Description**:\n + * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in + * Rs1 with the top 32-bit element in Rs2. + * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit + * element in Rs1 with the bottom 32-bit element in Rs2. + * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in + * Rs1 with the bottom 32-bit element in Rs2. + * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is + * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to + * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated + * as signed integers. + * + * **Operations**:\n + * ~~~ + * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32 + * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32 + * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMADRS32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmadrs32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.14.2. KMADRS32 ===== */ + +/* ===== Inline Function Start for 4.14.3. KMAXDS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMAXDS32 (Saturating Signed Crossed Multiply Two Words & Subtract & Add) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMADS32 Rd, Rs1, Rs2 + * KMADRS32 Rd, Rs1, Rs2 + * KMAXDS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then + * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to + * 64-bit data in a third register. The addition result may be saturated. + * * KMADS32: rd + (top*top - bottom*bottom) + * * KMADRS32: rd + (bottom*bottom - top*top) + * * KMAXDS32: rd + (top*bottom - bottom*top) + * + * **Description**:\n + * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in + * Rs1 with the top 32-bit element in Rs2. + * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit + * element in Rs1 with the bottom 32-bit element in Rs2. + * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit + * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in + * Rs1 with the bottom 32-bit element in Rs2. + * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is + * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to + * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated + * as signed integers. + * + * **Operations**:\n + * ~~~ + * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32 + * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32 + * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMAXDS32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmaxds32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.14.3. KMAXDS32 ===== */ + +/* ===== Inline Function Start for 4.15.1. KMSDA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMSDA32 (Saturating Signed Multiply Two Words & Add & Subtract) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMSDA32 Rd, Rs1, Rs2 + * KMSXDA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then + * subtracts the two 64-bit results from a third register. The subtraction result may be saturated. + * * KMSDA: rd - top*top - bottom*bottom + * * KMSXDA: rd - top*bottom - bottom*top + * + * **Description**:\n + * For the `KMSDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2. + * For the `KMSXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2. + * The two 64-bit multiplication results are then subtracted from the content of Rd. If the subtraction + * result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit + * is set to 1. The result after saturation is written to Rd. The 32-bit contents are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * res = Rd - (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMSDA32 + * res = Rd - (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMSXDA32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMSDA32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmsda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.15.1. KMSDA32 ===== */ + +/* ===== Inline Function Start for 4.15.2. KMSXDA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief KMSXDA32 (Saturating Signed Crossed Multiply Two Words & Add & Subtract) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KMSDA32 Rd, Rs1, Rs2 + * KMSXDA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then + * subtracts the two 64-bit results from a third register. The subtraction result may be saturated. + * * KMSDA: rd - top*top - bottom*bottom + * * KMSXDA: rd - top*bottom - bottom*top + * + * **Description**:\n + * For the `KMSDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2. + * For the `KMSXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2. + * The two 64-bit multiplication results are then subtracted from the content of Rd. If the subtraction + * result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit + * is set to 1. The result after saturation is written to Rd. The 32-bit contents are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * res = Rd - (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMSDA32 + * res = Rd - (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMSXDA32 + * if (res > (2^63)-1) { + * res = (2^63)-1; + * OV = 1; + * } else if (res < -2^63) { + * res = -2^63; + * OV = 1; + * } + * Rd = res; + * ~~~ + * + * \param [in] t long type of value stored in t + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_KMSXDA32(long t, unsigned long a, unsigned long b) +{ + __ASM volatile("kmsxda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); + return t; +} +/* ===== Inline Function End for 4.15.2. KMSXDA32 ===== */ + +/* ===== Inline Function Start for 4.16. KSLL32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief KSLL32 (SIMD 32-bit Saturating Shift Left Logical) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KSLL32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements logical left shift operations with saturation simultaneously. The shift + * amount is a variable from a GPR. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled + * with zero and the shift amount is specified by the low-order 5-bits of the value in the Rs2 register. + * Any shifted value greater than 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is + * saturated to -2^31. And the saturated results are written to Rd. If any saturation is performed, set OV + * bit to 1. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[4:0]; + * if (sa != 0) { + * res[(31+sa):0] = Rs1.W[x] << sa; + * if (res > (2^31)-1) { + * res = 0x7fffffff; OV = 1; + * } else if (res < -2^31) { + * res = 0x80000000; OV = 1; + * } + * Rd.W[x] = res[31:0]; + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLL32(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("ksll32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.16. KSLL32 ===== */ + +/* ===== Inline Function Start for 4.17. KSLLI32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief KSLLI32 (SIMD 32-bit Saturating Shift Left Logical Immediate) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KSLLI32 Rd, Rs1, imm5u + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements logical left shift operations with saturation simultaneously. The shift + * amount is an immediate value. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled + * with zero and the shift amount is specified by the imm5u constant. Any shifted value greater than + * 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated to -2^31. And the saturated + * results are written to Rd. If any saturation is performed, set OV bit to 1. + * + * **Operations**:\n + * ~~~ + * sa = imm5u[4:0]; + * if (sa != 0) { + * res[(31+sa):0] = Rs1.W[x] << sa; + * if (res > (2^31)-1) { + * res = 0x7fffffff; OV = 1; + * } else if (res < -2^31) { + * res = 0x80000000; OV = 1; + * } + * Rd.W[x] = res[31:0]; + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLLI32(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("kslli32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.17. KSLLI32 ===== */ + +/* ===== Inline Function Start for 4.18.1. KSLRA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief KSLRA32 (SIMD 32-bit Shift Left Logical with Saturation or Shift Right Arithmetic) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KSLRA32 Rd, Rs1, Rs2 + * KSLRA32.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with + * Q31 saturation for the left shift. The `.u` form performs additional rounding up operations for the + * right shift. + * + * **Description**:\n + * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-25, 25-1]. A positive Rs2[5:0] means + * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[5:0]. However, the behavior of `Rs2[5:0]==-25 (0x20)` is defined to be + * equivalent to the behavior of `Rs2[5:0]==-(25-1) (0x21)`. + * The left-shifted results are saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. For the `.u` + * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit + * position for rounding effect. After the shift, saturation, or rounding, the final results are written to + * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect + * this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[5:0] < 0) { + * sa = -Rs2[5:0]; + * sa = (sa == 32)? 31 : sa; + * if (`.u` form) { + * res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * } else { + * Rd.W[x] = SE32(Rs1.W[x][31:sa]); + * } + * } else { + * sa = Rs2[4:0]; + * res[(31+sa):0] = Rs1.W[x] <<(logic) sa; + * if (res > (2^31)-1) { + * res[31:0] = 0x7fffffff; OV = 1; + * } else if (res < -2^31) { + * res[31:0] = 0x80000000; OV = 1; + * } + * Rd.W[x] = res[31:0]; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLRA32(unsigned long a, int b) +{ + register unsigned long result; + __ASM volatile("kslra32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.18.1. KSLRA32 ===== */ + +/* ===== Inline Function Start for 4.18.2. KSLRA32.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief KSLRA32.u (SIMD 32-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KSLRA32 Rd, Rs1, Rs2 + * KSLRA32.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with + * Q31 saturation for the left shift. The `.u` form performs additional rounding up operations for the + * right shift. + * + * **Description**:\n + * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-25, 25-1]. A positive Rs2[5:0] means + * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[5:0]. However, the behavior of `Rs2[5:0]==-25 (0x20)` is defined to be + * equivalent to the behavior of `Rs2[5:0]==-(25-1) (0x21)`. + * The left-shifted results are saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. For the `.u` + * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit + * position for rounding effect. After the shift, saturation, or rounding, the final results are written to + * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect + * this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[5:0] < 0) { + * sa = -Rs2[5:0]; + * sa = (sa == 32)? 31 : sa; + * if (`.u` form) { + * res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * } else { + * Rd.W[x] = SE32(Rs1.W[x][31:sa]); + * } + * } else { + * sa = Rs2[4:0]; + * res[(31+sa):0] = Rs1.W[x] <<(logic) sa; + * if (res > (2^31)-1) { + * res[31:0] = 0x7fffffff; OV = 1; + * } else if (res < -2^31) { + * res[31:0] = 0x80000000; OV = 1; + * } + * Rd.W[x] = res[31:0]; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSLRA32_U(unsigned long a, int b) +{ + register unsigned long result; + __ASM volatile("kslra32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.18.2. KSLRA32.u ===== */ + +/* ===== Inline Function Start for 4.19. KSTAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief KSTAS32 (SIMD 32-bit Signed Saturating Straight Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KSTAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element saturating addition and 32-bit signed integer element + * saturating subtraction in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit + * elements. + * + * **Description**:\n + * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit + * integer element in [63:32] of Rs2; at the same time, it subtracts the 32-bit integer element in [31:0] of + * Rs2 from the 32-bit integer element in [31:0] of Rs1. If any of the results are beyond the Q31 number + * range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction. + * + * **Operations**:\n + * ~~~ + * res[1] = Rs1.W[1] + Rs2.W[1]; + * res[0] = Rs1.W[0] - Rs2.W[0]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[1] = res[1]; + * Rd.W[0] = res[0]; + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSTAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kstas32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.19. KSTAS32 ===== */ + +/* ===== Inline Function Start for 4.20. KSTSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief KSTSA32 (SIMD 32-bit Signed Saturating Straight Subtraction & Addition) + * \details + * **Type**: SIM (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KSTSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element + * saturating addition in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit + * elements. + * *Description: * + * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer + * element in [63:32] of Rs1; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with + * the 32-bit integer element in [31:0] of Rs2. If any of the results are beyond the Q31 number range (- + * 231 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are + * written to [63:32] of Rd for subtraction and [31:0] of Rd for addition. + * + * **Operations**:\n + * ~~~ + * res[1] = Rs1.W[1] - Rs2.W[1]; + * res[0] = Rs1.W[0] + Rs2.W[0]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[1] = res[1]; + * Rd.W[0] = res[0]; + * for RV64, x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSTSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("kstsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.20. KSTSA32 ===== */ + +/* ===== Inline Function Start for 4.21. KSUB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief KSUB32 (SIMD 32-bit Signed Saturating Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * KSUB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer elements saturating subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit + * signed integer elements in Rs1. If any of the results are beyond the Q31 number range (-2^31 <= Q31 <= + * 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.W[x] - Rs2.W[x]; + * if (res[x] > (2^31)-1) { + * res[x] = (2^31)-1; + * OV = 1; + * } else if (res[x] < -2^31) { + * res[x] = -2^31; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_KSUB32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ksub32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.21. KSUB32 ===== */ + +/* ===== Inline Function Start for 4.22.1. PKBB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK + * \brief PKBB32 (Pack Two 32-bit Data from Both Bottom Half) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * PKBB32 Rd, Rs1, Rs2 + * PKBT32 Rd, Rs1, Rs2 + * PKTT32 Rd, Rs1, Rs2 + * PKTB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Pack 32-bit data from 64-bit chunks in two registers. + * * PKBB32: bottom.bottom + * * PKBT32: bottom.top + * * PKTT32: top.top + * * PKTB32: top.bottom + * + * **Description**:\n + * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0]. + * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0]. + * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0]. + * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0]. + * + * **Operations**:\n + * ~~~ + * Rd = CONCAT(Rs1.W[_*0*_], Rs2.W[_*0*_]); // PKBB32 + * Rd = CONCAT(Rs1.W[_*0*_], Rs2.W[_*1*_]); // PKBT32 + * Rd = CONCAT(Rs1.W[_*1*_], Rs2.W[_*1*_]); // PKTT32 + * Rd = CONCAT(Rs1.W[_*1*_], Rs2.W[_*0*_]); // PKTB32 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PKBB32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pkbb32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.22.1. PKBB32 ===== */ + +/* ===== Inline Function Start for 4.22.2. PKBT32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK + * \brief PKBT32 (Pack Two 32-bit Data from Bottom and Top Half) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * PKBB32 Rd, Rs1, Rs2 + * PKBT32 Rd, Rs1, Rs2 + * PKTT32 Rd, Rs1, Rs2 + * PKTB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Pack 32-bit data from 64-bit chunks in two registers. + * * PKBB32: bottom.bottom + * * PKBT32: bottom.top + * * PKTT32: top.top + * * PKTB32: top.bottom + * + * **Description**:\n + * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0]. + * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0]. + * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0]. + * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0]. + * + * **Operations**:\n + * ~~~ + * Rd = CONCAT(Rs1.W[_*0*_], Rs2.W[_*0*_]); // PKBB32 + * Rd = CONCAT(Rs1.W[_*0*_], Rs2.W[_*1*_]); // PKBT32 + * Rd = CONCAT(Rs1.W[_*1*_], Rs2.W[_*1*_]); // PKTT32 + * Rd = CONCAT(Rs1.W[_*1*_], Rs2.W[_*0*_]); // PKTB32 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PKBT32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pkbt32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.22.2. PKBT32 ===== */ + +/* ===== Inline Function Start for 4.22.3. PKTT32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK + * \brief PKTT32 (Pack Two 32-bit Data from Both Top Half) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * PKBB32 Rd, Rs1, Rs2 + * PKBT32 Rd, Rs1, Rs2 + * PKTT32 Rd, Rs1, Rs2 + * PKTB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Pack 32-bit data from 64-bit chunks in two registers. + * * PKBB32: bottom.bottom + * * PKBT32: bottom.top + * * PKTT32: top.top + * * PKTB32: top.bottom + * + * **Description**:\n + * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0]. + * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0]. + * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0]. + * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0]. + * + * **Operations**:\n + * ~~~ + * Rd = CONCAT(Rs1.W[_*0*_], Rs2.W[_*0*_]); // PKBB32 + * Rd = CONCAT(Rs1.W[_*0*_], Rs2.W[_*1*_]); // PKBT32 + * Rd = CONCAT(Rs1.W[_*1*_], Rs2.W[_*1*_]); // PKTT32 + * Rd = CONCAT(Rs1.W[_*1*_], Rs2.W[_*0*_]); // PKTB32 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PKTT32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pktt32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.22.3. PKTT32 ===== */ + +/* ===== Inline Function Start for 4.22.4. PKTB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK + * \brief PKTB32 (Pack Two 32-bit Data from Top and Bottom Half) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * PKBB32 Rd, Rs1, Rs2 + * PKBT32 Rd, Rs1, Rs2 + * PKTT32 Rd, Rs1, Rs2 + * PKTB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Pack 32-bit data from 64-bit chunks in two registers. + * * PKBB32: bottom.bottom + * * PKBT32: bottom.top + * * PKTT32: top.top + * * PKTB32: top.bottom + * + * **Description**:\n + * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0]. + * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0]. + * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0]. + * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0]. + * + * **Operations**:\n + * ~~~ + * Rd = CONCAT(Rs1.W[_*0*_], Rs2.W[_*0*_]); // PKBB32 + * Rd = CONCAT(Rs1.W[_*0*_], Rs2.W[_*1*_]); // PKBT32 + * Rd = CONCAT(Rs1.W[_*1*_], Rs2.W[_*1*_]); // PKTT32 + * Rd = CONCAT(Rs1.W[_*1*_], Rs2.W[_*0*_]); // PKTB32 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_PKTB32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("pktb32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.22.4. PKTB32 ===== */ + +/* ===== Inline Function Start for 4.23. RADD32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief RADD32 (SIMD 32-bit Signed Halving Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * RADD32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element additions simultaneously. The results are halved to avoid + * overflow or saturation. + * + * **Description**:\n + * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed + * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to + * Rd. + * + * **Examples**:\n + * ~~~ + * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF Rd = 0x7FFFFFFF + * * Rs1 = 0x80000000, Rs2 = 0x80000000 Rd = 0x80000000 + * * Rs1 = 0x40000000, Rs2 = 0x80000000 Rd = 0xE0000000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = (Rs1.W[x] + Rs2.W[x]) s>> 1; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RADD32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("radd32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.23. RADD32 ===== */ + +/* ===== Inline Function Start for 4.24. RCRAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief RCRAS32 (SIMD 32-bit Signed Halving Cross Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * RCRAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in + * a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit + * signed integer element in [31:0] of Rs2, and subtracts the 32-bit signed integer element in [63:32] of + * Rs2 from the 32-bit signed integer element in [31:0] of Rs1. The element results are first + * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd + * for subtraction. + * + * **Examples**:\n + * ~~~ + * Please see `RADD32` and `RSUB32` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) s>> 1; + * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) s>> 1; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RCRAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rcras32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.24. RCRAS32 ===== */ + +/* ===== Inline Function Start for 4.25. RCRSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief RCRSA32 (SIMD 32-bit Signed Halving Cross Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * RCRSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in + * a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 32-bit signed integer element in [31:0] of Rs2 from the + * 32-bit signed integer element in [63:32] of Rs1, and adds the 32-bit signed element integer in [31:0] + * of Rs1 with the 32-bit signed integer element in [63:32] of Rs2. The two results are first + * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of + * Rd for addition. + * + * **Examples**:\n + * ~~~ + * Please see `RADD32` and `RSUB32` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) s>> 1; + * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) s>> 1; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RCRSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rcrsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.25. RCRSA32 ===== */ + +/* ===== Inline Function Start for 4.26. RSTAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief RSTAS32 (SIMD 32-bit Signed Halving Straight Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * RSTAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in + * a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The results are + * halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit + * signed integer element in [63:32] of Rs2, and subtracts the 32-bit signed integer element in [31:0] of + * Rs2 from the 32-bit signed integer element in [31:0] of Rs1. The element results are first + * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd + * for subtraction. + * + * **Examples**:\n + * ~~~ + * Please see `RADD32` and `RSUB32` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = (Rs1.W[1] + Rs2.W[1]) s>> 1; + * Rd.W[0] = (Rs1.W[0] - Rs2.W[0]) s>> 1; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RSTAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rstas32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.26. RSTAS32 ===== */ + +/* ===== Inline Function Start for 4.27. RSTSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief RSTSA32 (SIMD 32-bit Signed Halving Straight Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * RSTSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in + * a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The results are + * halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 32-bit signed integer element in [63:32] of Rs2 from the + * 32-bit signed integer element in [63:32] of Rs1, and adds the 32-bit signed element integer in [31:0] + * of Rs1 with the 32-bit signed integer element in [31:0] of Rs2. The two results are first arithmetically + * right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for addition. + * + * **Examples**:\n + * ~~~ + * Please see `RADD32` and `RSUB32` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = (Rs1.W[1] - Rs2.W[1]) s>> 1; + * Rd.W[0] = (Rs1.W[0] + Rs2.W[0]) s>> 1; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RSTSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rstsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.27. RSTSA32 ===== */ + +/* ===== Inline Function Start for 4.28. RSUB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief RSUB32 (SIMD 32-bit Signed Halving Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * RSUB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer element subtractions simultaneously. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit + * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Ra = 0x7FFFFFFF, Rb = 0x80000000 Rt = 0x7FFFFFFF + * * Ra = 0x80000000, Rb = 0x7FFFFFFF Rt = 0x80000000 + * * Ra = 0x80000000, Rb = 0x40000000 Rt = 0xA0000000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) s>> 1; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_RSUB32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("rsub32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.28. RSUB32 ===== */ + +/* ===== Inline Function Start for 4.29. SLL32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SLL32 (SIMD 32-bit Shift Left Logical) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SLL32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements logical left shift operations simultaneously. The shift amount is a + * variable from a GPR. + * + * **Description**:\n + * The 32-bit elements in Rs1 are left-shifted logically. And the results are written to Rd. + * The shifted out bits are filled with zero and the shift amount is specified by the low-order 5-bits of + * the value in the Rs2 register. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[4:0]; + * Rd.W[x] = Rs1.W[x] << sa; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SLL32(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("sll32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.29. SLL32 ===== */ + +/* ===== Inline Function Start for 4.30. SLLI32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SLLI32 (SIMD 32-bit Shift Left Logical Immediate) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SLLI32 Rd, Rs1, imm5u[4:0] + * ~~~ + * + * **Purpose**:\n + * Do 32-bit element logical left shift operations simultaneously. The shift amount is an + * immediate value. + * + * **Description**:\n + * The 32-bit elements in Rs1 are left-shifted logically. The shifted out bits are filled with + * zero and the shift amount is specified by the imm5u[4:0] constant. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm5u[4:0]; + * Rd.W[x] = Rs1.W[x] << sa; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SLLI32(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("slli32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.30. SLLI32 ===== */ + +/* ===== Inline Function Start for 4.31. SMAX32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC + * \brief SMAX32 (SIMD 32-bit Signed Maximum) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SMAX32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer elements finding maximum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 32-bit signed integer elements in Rs1 with the 32-bit + * signed integer elements in Rs2 and selects the numbers that is greater than the other one. The + * selected results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = (Rs1.W[x] > Rs2.W[x])? Rs1.W[x] : Rs2.W[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SMAX32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("smax32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.31. SMAX32 ===== */ + +/* ===== Inline Function Start for 4.32.1. SMBB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT + * \brief SMBB32 (Signed Multiply Bottom Word & Bottom Word) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SMBB32 Rd, Rs1, Rs2 + * SMBT32 Rd, Rs1, Rs2 + * SMTT32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit element of a register with the signed 32-bit element of another + * register and write the 64-bit result to a third register. + * * SMBB32: bottom*bottom + * * SMBT32: bottom*top + * * SMTT32: top*top + * + * **Description**:\n + * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2. It is actually an alias of `MULSR64` instruction. + * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2. + * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element + * of Rs2. + * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as + * signed integers. + * + * **Operations**:\n + * ~~~ + * res = Rs1.W[0] * Rs2.W[0]; // SMBB32 res = Rs1.W[0] * Rs2.w[1]; // SMBT32 res = Rs1.W[1] * Rs2.W[1]; + * // SMTT32 Rd = res; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMBB32(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smbb32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.32.1. SMBB32 ===== */ + +/* ===== Inline Function Start for 4.32.2. SMBT32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT + * \brief SMBT32 (Signed Multiply Bottom Word & Top Word) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SMBB32 Rd, Rs1, Rs2 + * SMBT32 Rd, Rs1, Rs2 + * SMTT32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit element of a register with the signed 32-bit element of another + * register and write the 64-bit result to a third register. + * * SMBB32: bottom*bottom + * * SMBT32: bottom*top + * * SMTT32: top*top + * + * **Description**:\n + * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2. It is actually an alias of `MULSR64` instruction. + * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2. + * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element + * of Rs2. + * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as + * signed integers. + * + * **Operations**:\n + * ~~~ + * res = Rs1.W[0] * Rs2.W[0]; // SMBB32 res = Rs1.W[0] * Rs2.w[1]; // SMBT32 res = Rs1.W[1] * Rs2.W[1]; + * // SMTT32 Rd = res; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMBT32(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smbt32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.32.2. SMBT32 ===== */ + +/* ===== Inline Function Start for 4.32.3. SMTT32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT + * \brief SMTT32 (Signed Multiply Top Word & Top Word) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SMBB32 Rd, Rs1, Rs2 + * SMBT32 Rd, Rs1, Rs2 + * SMTT32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Multiply the signed 32-bit element of a register with the signed 32-bit element of another + * register and write the 64-bit result to a third register. + * * SMBB32: bottom*bottom + * * SMBT32: bottom*top + * * SMTT32: top*top + * + * **Description**:\n + * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2. It is actually an alias of `MULSR64` instruction. + * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2. + * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element + * of Rs2. + * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as + * signed integers. + * + * **Operations**:\n + * ~~~ + * res = Rs1.W[0] * Rs2.W[0]; // SMBB32 res = Rs1.W[0] * Rs2.w[1]; // SMBT32 res = Rs1.W[1] * Rs2.W[1]; + * // SMTT32 Rd = res; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMTT32(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smtt32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.32.3. SMTT32 ===== */ + +/* ===== Inline Function Start for 4.33.1. SMDS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief SMDS32 (Signed Multiply Two Words and Subtract) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SMDS32 Rd, Rs1, Rs2 + * SMDRS32 Rd, Rs1, Rs2 + * SMXDS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from the l 32-bit element of two registers; and then + * perform a subtraction operation between the two 64-bit results. + * * SMDS32: top*top - bottom*bottom + * * SMDRS32: bottom*bottom - top*top + * * SMXDS32: top*bottom - bottom*top + * + * **Description**:\n + * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of + * Rs1 with the top 32-bit element of Rs2. + * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit + * element of Rs1 with the bottom 32-bit element of Rs2. + * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of + * Rs1 with the bottom 32-bit element of Rs2. + * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * Rt = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32 + * Rt = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32 + * Rt = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMDS32(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smds32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.33.1. SMDS32 ===== */ + +/* ===== Inline Function Start for 4.33.2. SMDRS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief SMDRS32 (Signed Multiply Two Words and Reverse Subtract) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SMDS32 Rd, Rs1, Rs2 + * SMDRS32 Rd, Rs1, Rs2 + * SMXDS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from the l 32-bit element of two registers; and then + * perform a subtraction operation between the two 64-bit results. + * * SMDS32: top*top - bottom*bottom + * * SMDRS32: bottom*bottom - top*top + * * SMXDS32: top*bottom - bottom*top + * + * **Description**:\n + * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of + * Rs1 with the top 32-bit element of Rs2. + * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit + * element of Rs1 with the bottom 32-bit element of Rs2. + * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of + * Rs1 with the bottom 32-bit element of Rs2. + * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * Rt = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32 + * Rt = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32 + * Rt = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMDRS32(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smdrs32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.33.2. SMDRS32 ===== */ + +/* ===== Inline Function Start for 4.33.3. SMXDS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC + * \brief SMXDS32 (Signed Crossed Multiply Two Words and Subtract) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SMDS32 Rd, Rs1, Rs2 + * SMDRS32 Rd, Rs1, Rs2 + * SMXDS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do two signed 32-bit multiplications from the l 32-bit element of two registers; and then + * perform a subtraction operation between the two 64-bit results. + * * SMDS32: top*top - bottom*bottom + * * SMDRS32: bottom*bottom - top*top + * * SMXDS32: top*bottom - bottom*top + * + * **Description**:\n + * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of + * Rs1 with the top 32-bit element of Rs2. + * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit + * element of Rs1 with the bottom 32-bit element of Rs2. + * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit + * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of + * Rs1 with the bottom 32-bit element of Rs2. + * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed + * integers. + * + * **Operations**:\n + * ~~~ + * Rt = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32 + * Rt = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32 + * Rt = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SMXDS32(unsigned long a, unsigned long b) +{ + register long result; + __ASM volatile("smxds32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.33.3. SMXDS32 ===== */ + +/* ===== Inline Function Start for 4.34. SMIN32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC + * \brief SMIN32 (SIMD 32-bit Signed Minimum) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SMIN32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit signed integer elements finding minimum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 32-bit signed integer elements in Rs1 with the 32-bit + * signed integer elements in Rs2 and selects the numbers that is less than the other one. The selected + * results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = (Rs1.W[x] < Rs2.W[x])? Rs1.W[x] : Rs2.W[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SMIN32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("smin32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.34. SMIN32 ===== */ + +/* ===== Inline Function Start for 4.35.1. SRA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SRA32 (SIMD 32-bit Shift Right Arithmetic) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SRA32 Rd, Rs1, Rs2 + * SRA32.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit element arithmetic right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order + * 5-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is + * added to the most significant discarded bit of each 32-bit data element to calculate the final results. + * And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[4:0]; + * if (sa > 0) { + * if (`.u` form) { // SRA32.u + * res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * else { // SRA32 + * Rd.W[x] = SE32(Rs1.W[x][31:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRA32(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("sra32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.35.1. SRA32 ===== */ + +/* ===== Inline Function Start for 4.35.2. SRA32.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SRA32.u (SIMD 32-bit Rounding Shift Right Arithmetic) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SRA32 Rd, Rs1, Rs2 + * SRA32.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit element arithmetic right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order + * 5-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is + * added to the most significant discarded bit of each 32-bit data element to calculate the final results. + * And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[4:0]; + * if (sa > 0) { + * if (`.u` form) { // SRA32.u + * res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * else { // SRA32 + * Rd.W[x] = SE32(Rs1.W[x][31:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRA32_U(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("sra32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.35.2. SRA32.u ===== */ + +/* ===== Inline Function Start for 4.36.1. SRAI32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SRAI32 (SIMD 32-bit Shift Right Arithmetic Immediate) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SRAI32 Rd, Rs1, imm5u + * SRAI32.u Rd, Rs1, imm5u + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements arithmetic right shift operations simultaneously. The shift amount is + * an immediate value. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the 32-bit data elements. The shift amount is specified by the + * imm5u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most + * significant discarded bit of each 32-bit data to calculate the final results. And the results are written + * to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm5u[4:0]; + * if (sa > 0) { + * if (`.u` form) { // SRAI32.u + * res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * else { // SRAI32 + * Rd.W[x] = SE32(Rs1.W[x][31:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRAI32(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srai32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.36.1. SRAI32 ===== */ + +/* ===== Inline Function Start for 4.36.2. SRAI32.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SRAI32.u (SIMD 32-bit Rounding Shift Right Arithmetic Immediate) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SRAI32 Rd, Rs1, imm5u + * SRAI32.u Rd, Rs1, imm5u + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements arithmetic right shift operations simultaneously. The shift amount is + * an immediate value. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out + * bits are filled with the sign-bit of the 32-bit data elements. The shift amount is specified by the + * imm5u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most + * significant discarded bit of each 32-bit data to calculate the final results. And the results are written + * to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm5u[4:0]; + * if (sa > 0) { + * if (`.u` form) { // SRAI32.u + * res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * else { // SRAI32 + * Rd.W[x] = SE32(Rs1.W[x][31:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRAI32_U(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srai32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.36.2. SRAI32.u ===== */ + +/* ===== Inline Function Start for 4.37. SRAIW.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_NON_SIMD_32B_SHIFT + * \brief SRAIW.u (Rounding Shift Right Arithmetic Immediate Word) + * \details + * **Type**: DSP (RV64 only) + * + * **Syntax**:\n + * ~~~ + * SRAIW.u Rd, Rs1, imm5u + * ~~~ + * + * **Purpose**:\n + * Perform a 32-bit arithmetic right shift operation with rounding. The shift amount is an + * immediate value. + * + * **Description**:\n + * This instruction right-shifts the lower 32-bit content of Rs1 arithmetically. The shifted + * out bits are filled with the sign-bit Rs1(31) and the shift amount is specified by the imm5u constant. + * For the rounding operation, a value of 1 is added to the most significant discarded bit of the data to + * calculate the final result. And the result is sign-extended and written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm5u; + * if (sa != 0) { + * res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1; + * Rd = SE32(res[31:0]); + * } else { + * Rd = SE32(Rs1.W[0]); + * } + * ~~~ + * + * \param [in] a int type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in long type + */ +__STATIC_FORCEINLINE long __RV_SRAIW_U(int a, unsigned int b) +{ + register long result; + __ASM volatile("sraiw.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.37. SRAIW.u ===== */ + +/* ===== Inline Function Start for 4.38.1. SRL32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SRL32 (SIMD 32-bit Shift Right Logical) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SRL32 Rd, Rs1, Rs2 + * SRL32.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit element logical right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits + * are filled with zero. The shift amount is specified by the low-order 5-bits of the value in the Rs2 + * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant + * discarded bit of each 32-bit data element to calculate the final results. And the results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[4:0]; + * if (sa > 0) { + * if (`.u` form) { // SRA32.u + * res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * else { // SRA32 + * Rd.W[x] = ZE32(Rs1.W[x][31:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRL32(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srl32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.38.1. SRL32 ===== */ + +/* ===== Inline Function Start for 4.38.2. SRL32.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SRL32.u (SIMD 32-bit Rounding Shift Right Logical) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SRL32 Rd, Rs1, Rs2 + * SRL32.u Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit element logical right shift operations simultaneously. The shift amount is a + * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted + * results. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits + * are filled with zero. The shift amount is specified by the low-order 5-bits of the value in the Rs2 + * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant + * discarded bit of each 32-bit data element to calculate the final results. And the results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * sa = Rs2[4:0]; + * if (sa > 0) { + * if (`.u` form) { // SRA32.u + * res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * else { // SRA32 + * Rd.W[x] = ZE32(Rs1.W[x][31:sa]) + * } + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRL32_U(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srl32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.38.2. SRL32.u ===== */ + +/* ===== Inline Function Start for 4.39.1. SRLI32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SRLI32 (SIMD 32-bit Shift Right Logical Immediate) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SRLI32 Rd, Rs1, imm5u + * SRLI32.u Rd, Rs1, imm5u + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements logical right shift operations simultaneously. The shift amount is an + * immediate value. The `.u` form performs additional rounding up operations on the shifted results. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits + * are filled with zero. The shift amount is specified by the imm5u constant. For the rounding + * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 32-bit + * data to calculate the final results. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm5u[4:0]; + * if (sa > 0) { + * if (`.u` form) { // SRLI32.u + * res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * else { // SRLI32 + * Rd.W[x] = ZE32(Rs1.W[x][31:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRLI32(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srli32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.39.1. SRLI32 ===== */ + +/* ===== Inline Function Start for 4.39.2. SRLI32.u ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT + * \brief SRLI32.u (SIMD 32-bit Rounding Shift Right Logical Immediate) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SRLI32 Rd, Rs1, imm5u + * SRLI32.u Rd, Rs1, imm5u + * ~~~ + * + * **Purpose**:\n + * Do 32-bit elements logical right shift operations simultaneously. The shift amount is an + * immediate value. The `.u` form performs additional rounding up operations on the shifted results. + * + * **Description**:\n + * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits + * are filled with zero. The shift amount is specified by the imm5u constant. For the rounding + * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 32-bit + * data to calculate the final results. And the results are written to Rd. + * + * **Operations**:\n + * ~~~ + * sa = imm5u[4:0]; + * if (sa > 0) { + * if (`.u` form) { // SRLI32.u + * res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1; + * Rd.W[x] = res[31:0]; + * else { // SRLI32 + * Rd.W[x] = ZE32(Rs1.W[x][31:sa]); + * } + * } else { + * Rd = Rs1; + * } + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned int type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SRLI32_U(unsigned long a, unsigned int b) +{ + register unsigned long result; + __ASM volatile("srli32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.39.2. SRLI32.u ===== */ + +/* ===== Inline Function Start for 4.40. STAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief STAS32 (SIMD 32-bit Straight Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * STAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit + * chunk simultaneously. Operands are from corresponding 32-bit elements. + * + * **Description**:\n + * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit + * integer element in [63:32] of Rs2, and writes the result to [63:32] of Rd; at the same time, it subtracts + * the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element in [31:0] of Rs1, and + * writes the result to [31:0] of Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned operations. + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = Rs1.W[1] + Rs2.W[1]; + * Rd.W[0] = Rs1.W[0] - Rs2.W[0]; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_STAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("stas32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.40. STAS32 ===== */ + +/* ===== Inline Function Start for 4.41. STSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief STSA32 (SIMD 32-bit Straight Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * STSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit + * chunk simultaneously. Operands are from corresponding 32-bit elements. + * *Description: * + * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer + * element in [63:32] of Rs1, and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit + * integer element in [31:0] of Rs1 with the 32-bit integer element in [31:0] of Rs2, and writes the result + * to [31:0] of Rd + * + * **Note**:\n + * This instruction can be used for either signed or unsigned operations. + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = Rs1.W[1] - Rs2.W[1]; + * Rd.W[0] = Rs1.W[0] + Rs2.W[0]; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_STSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("stsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.41. STSA32 ===== */ + +/* ===== Inline Function Start for 4.42. SUB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief SUB32 (SIMD 32-bit Subtraction) + * \details + * **Type**: DSP (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * SUB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit integer element subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 32-bit integer elements in Rs2 from the 32-bit integer + * elements in Rs1, and then writes the results to Rd. + * + * **Note**:\n + * This instruction can be used for either signed or unsigned subtraction. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = Rs1.W[x] - Rs2.W[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_SUB32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("sub32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.42. SUB32 ===== */ + +/* ===== Inline Function Start for 4.43. UKADD32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief UKADD32 (SIMD 32-bit Unsigned Saturating Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * UKADD32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer element saturating additions simultaneously. + * + * **Description**:\n + * This instruction adds the 32-bit unsigned integer elements in Rs1 with the 32-bit + * unsigned integer elements in Rs2. If any of the results are beyond the 32-bit unsigned number + * range (0 <= RES <= 2^32-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.W[x] + Rs2.W[x]; + * if (res[x] > (2^32)-1) { + * res[x] = (2^32)-1; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKADD32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukadd32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.43. UKADD32 ===== */ + +/* ===== Inline Function Start for 4.44. UKCRAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief UKCRAS32 (SIMD 32-bit Unsigned Saturating Cross Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * UKCRAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do one 32-bit unsigned integer element saturating addition and one 32-bit unsigned + * integer element saturating subtraction in a 64-bit chunk simultaneously. Operands are from crossed + * 32-bit elements. + * + * **Description**:\n + * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32- + * bit unsigned integer element in [31:0] of Rs2; at the same time, it subtracts the 32-bit unsigned + * integer element in [63:32] of Rs2 from the 32-bit unsigned integer element in [31:0] Rs1. If any of the + * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the + * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for addition and + * [31:0] of Rd for subtraction. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[1] + Rs2.W[0]; + * res2 = Rs1.W[0] - Rs2.W[1]; + * if (res1 > (2^32)-1) { + * res1 = (2^32)-1; + * OV = 1; + * } + * if (res2 < 0) { + * res2 = 0; + * OV = 1; + * } + * Rd.W[1] = res1; + * Rd.W[0] = res2; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKCRAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukcras32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.44. UKCRAS32 ===== */ + +/* ===== Inline Function Start for 4.45. UKCRSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief UKCRSA32 (SIMD 32-bit Unsigned Saturating Cross Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * UKCRSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do one 32-bit unsigned integer element saturating subtraction and one 32-bit unsigned + * integer element saturating addition in a 64-bit chunk simultaneously. Operands are from crossed + * 32-bit elements. + * + * **Description**:\n + * This instruction subtracts the 32-bit unsigned integer element in [31:0] of Rs2 from the + * 32-bit unsigned integer element in [63:32] of Rs1; at the same time, it adds the 32-bit unsigned + * integer element in [63:32] of Rs2 with the 32-bit unsigned integer element in [31:0] Rs1. If any of the + * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the + * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for subtraction and + * [31:0] of Rd for addition. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[1] - Rs2.W[0]; + * res2 = Rs1.W[0] + Rs2.W[1]; + * if (res1 < 0) { + * res1 = 0; + * OV = 1; + * } else if (res2 > (2^32)-1) { + * res2 = (2^32)-1; + * OV = 1; + * } + * Rd.W[1] = res1; + * Rd.W[0] = res2; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKCRSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukcrsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.45. UKCRSA32 ===== */ + +/* ===== Inline Function Start for 4.46. UKSTAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief UKSTAS32 (SIMD 32-bit Unsigned Saturating Straight Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * UKSTAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do one 32-bit unsigned integer element saturating addition and one 32-bit unsigned + * integer element saturating subtraction in a 64-bit chunk simultaneously. Operands are from + * corresponding 32-bit elements. + * + * **Description**:\n + * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32- + * bit unsigned integer element in [63:32] of Rs2; at the same time, it subtracts the 32-bit unsigned + * integer element in [31:0] of Rs2 from the 32-bit unsigned integer element in [31:0] Rs1. If any of the + * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the + * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for addition and + * [31:0] of Rd for subtraction. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[1] + Rs2.W[1]; + * res2 = Rs1.W[0] - Rs2.W[0]; + * if (res1 > (2^32)-1) { + * res1 = (2^32)-1; + * OV = 1; + * } + * if (res2 < 0) { + * res2 = 0; + * OV = 1; + * } + * Rd.W[1] = res1; + * Rd.W[0] = res2; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSTAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukstas32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.46. UKSTAS32 ===== */ + +/* ===== Inline Function Start for 4.47. UKSTSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief UKSTSA32 (SIMD 32-bit Unsigned Saturating Straight Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * UKSTSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do one 32-bit unsigned integer element saturating subtraction and one 32-bit unsigned + * integer element saturating addition in a 64-bit chunk simultaneously. Operands are from + * corresponding 32-bit elements. + * + * **Description**:\n + * This instruction subtracts the 32-bit unsigned integer element in [63:32] of Rs2 from + * the 32-bit unsigned integer element in [63:32] of Rs1; at the same time, it adds the 32-bit unsigned + * integer element in [31:0] of Rs2 with the 32-bit unsigned integer element in [31:0] Rs1. If any of the + * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the + * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for subtraction and + * [31:0] of Rd for addition. + * + * **Operations**:\n + * ~~~ + * res1 = Rs1.W[1] - Rs2.W[1]; + * res2 = Rs1.W[0] + Rs2.W[0]; + * if (res1 < 0) { + * res1 = 0; + * OV = 1; + * } else if (res2 > (2^32)-1) { + * res2 = (2^32)-1; + * OV = 1; + * } + * Rd.W[1] = res1; + * Rd.W[0] = res2; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSTSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ukstsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.47. UKSTSA32 ===== */ + +/* ===== Inline Function Start for 4.48. UKSUB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief UKSUB32 (SIMD 32-bit Unsigned Saturating Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * UKSUB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer elements saturating subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 32-bit unsigned integer elements in Rs2 from the 32-bit + * unsigned integer elements in Rs1. If any of the results are beyond the 32-bit unsigned number + * range (0 <= RES <= 2^32-1), they are saturated to the range and the OV bit is set to 1. The saturated + * results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.W[x] - Rs2.W[x]; + * if (res[x] < 0) { + * res[x] = 0; + * OV = 1; + * } + * Rd.W[x] = res[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UKSUB32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("uksub32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.48. UKSUB32 ===== */ + +/* ===== Inline Function Start for 4.49. UMAX32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC + * \brief UMAX32 (SIMD 32-bit Unsigned Maximum) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * UMAX32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer elements finding maximum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 32-bit unsigned integer elements in Rs1 with the 32-bit + * unsigned integer elements in Rs2 and selects the numbers that is greater than the other one. The + * selected results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = (Rs1.W[x] u> Rs2.W[x])? Rs1.W[x] : Rs2.W[x]; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_UMAX32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("umax32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.49. UMAX32 ===== */ + +/* ===== Inline Function Start for 4.50. UMIN32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC + * \brief UMIN32 (SIMD 32-bit Unsigned Minimum) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * UMIN32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer elements finding minimum operations simultaneously. + * + * **Description**:\n + * This instruction compares the 32-bit unsigned integer elements in Rs1 with the 32-bit + * unsigned integer elements in Rs2 and selects the numbers that is less than the other one. The + * selected results are written to Rd. + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = (Rs1.W[x] > 1; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URADD32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("uradd32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.51. URADD32 ===== */ + +/* ===== Inline Function Start for 4.52. URCRAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief URCRAS32 (SIMD 32-bit Unsigned Halving Cross Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * URCRAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer element addition and 32-bit unsigned integer element + * subtraction in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The + * results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32- + * bit unsigned integer element in [31:0] of Rs2, and subtracts the 32-bit unsigned integer element in + * [63:32] of Rs2 from the 32-bit unsigned integer element in [31:0] of Rs1. The element results are first + * logically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd for + * subtraction. + * + * **Examples**:\n + * ~~~ + * Please see `URADD32` and `URSUB32` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) u>> 1; + * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) u>> 1; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URCRAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("urcras32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.52. URCRAS32 ===== */ + +/* ===== Inline Function Start for 4.53. URCRSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief URCRSA32 (SIMD 32-bit Unsigned Halving Cross Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * URCRSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer element subtraction and 32-bit unsigned integer element + * addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results + * are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 32-bit unsigned integer element in [31:0] of Rs2 from the + * 32-bit unsigned integer element in [63:32] of Rs1, and adds the 32-bit unsigned element integer in + * [31:0] of Rs1 with the 32-bit unsigned integer element in [63:32] of Rs2. The two results are first + * logically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for + * addition. + * + * **Examples**:\n + * ~~~ + * Please see `URADD32` and `URSUB32` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) u>> 1; + * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) u>> 1; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URCRSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("urcrsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.53. URCRSA32 ===== */ + +/* ===== Inline Function Start for 4.54. URSTAS32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief URSTAS32 (SIMD 32-bit Unsigned Halving Straight Addition & Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * URSTAS32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer element addition and 32-bit unsigned integer element + * subtraction in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. + * The results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32- + * bit unsigned integer element in [63:32] of Rs2, and subtracts the 32-bit unsigned integer element in + * [31:0] of Rs2 from the 32-bit unsigned integer element in [31:0] of Rs1. The element results are first + * logically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd for + * subtraction. + * + * **Examples**:\n + * ~~~ + * Please see `URADD32` and `URSUB32` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = (Rs1.W[1] + Rs2.W[1]) u>> 1; + * Rd.W[0] = (Rs1.W[0] - Rs2.W[0]) u>> 1; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URSTAS32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("urstas32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.54. URSTAS32 ===== */ + +/* ===== Inline Function Start for 4.55. URSTSA32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief URSTSA32 (SIMD 32-bit Unsigned Halving Straight Subtraction & Addition) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * URSTSA32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer element subtraction and 32-bit unsigned integer element + * addition in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The + * results are halved to avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 32-bit unsigned integer element in [63:32] of Rs2 from + * the 32-bit unsigned integer element in [63:32] of Rs1, and adds the 32-bit unsigned element integer + * in [31:0] of Rs1 with the 32-bit unsigned integer element in [31:0] of Rs2. The two results are first + * logically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for + * addition. + * + * **Examples**:\n + * ~~~ + * Please see `URADD32` and `URSUB32` instructions. + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[1] = (Rs1.W[1] - Rs2.W[1]) u>> 1; + * Rd.W[0] = (Rs1.W[0] + Rs2.W[0]) u>> 1; + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URSTSA32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("urstsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.55. URSTSA32 ===== */ + +/* ===== Inline Function Start for 4.56. URSUB32 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB + * \brief URSUB32 (SIMD 32-bit Unsigned Halving Subtraction) + * \details + * **Type**: SIMD (RV64 Only) + * + * **Syntax**:\n + * ~~~ + * URSUB32 Rd, Rs1, Rs2 + * ~~~ + * + * **Purpose**:\n + * Do 32-bit unsigned integer element subtractions simultaneously. The results are halved to + * avoid overflow or saturation. + * + * **Description**:\n + * This instruction subtracts the 32-bit unsigned integer elements in Rs2 from the 32-bit + * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then + * written to Rd. + * + * **Examples**:\n + * ~~~ + * * Ra = 0x7FFFFFFF, Rb = 0x80000000, Rt = 0xFFFFFFFF + * * Ra = 0x80000000, Rb = 0x7FFFFFFF, Rt = 0x00000000 + * * Ra = 0x80000000, Rb = 0x40000000, Rt = 0x20000000 + * ~~~ + * + * **Operations**:\n + * ~~~ + * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) u>> 1; + * for RV64: x=1...0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \param [in] b unsigned long type of value stored in b + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_URSUB32(unsigned long a, unsigned long b) +{ + register unsigned long result; + __ASM volatile("ursub32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for 4.56. URSUB32 ===== */ + +#endif /* __RISCV_XLEN == 64 */ + + +#if (__RISCV_XLEN == 32) || defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__) +/* XXXXX Nuclei Extended DSP Instructions for RV32 XXXXX */ +/** + * \defgroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM Nuclei Customized DSP Instructions + * \ingroup NMSIS_Core_DSP_Intrinsic + * \brief (RV32 only)Nuclei Customized DSP Instructions + * \details This is Nuclei customized DSP instructions only for RV32 + */ +/* ===== Inline Function Start for A.1. DKHM8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKHM8 (64-bit SIMD Signed Saturating Q7 Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKHM8 Rd, Rs1, Rs2 + * # Rd, Rs1, Rs2 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7 + * numbers again. + * + * **Description**:\n + * For the `DKHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 + * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7 + * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2. + * + * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then + * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen. + * The result will be saturated to 0x7F and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top + * op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * if (0x80 != aop | 0x80 != bop) { + * res = (aop s* bop) >> 7; + * } else { + * res= 0x7F; + * OV = 1; + * } + * } + * Rd.H[x/2] = concat(rest, resb); + * for RV32, x=0,2,4,6 + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKHM8(unsigned long long a, unsigned long long b) +{ + unsigned long long result; + __ASM volatile("dkhm8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for A.1. DKHM8 ===== */ + +/* ===== Inline Function Start for A.2. DKHM16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKHM16 (64-bit SIMD Signed Saturating Q15 Multiply) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKHM16 Rd, Rs1, Rs2 + * # Rd, Rs1, Rs2 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to + * Q15 numbers again. + * + * **Description**:\n + * For the `DKHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in + * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom + * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in + * Rs2. + * + * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are + * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will + * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set. + * + * **Operations**:\n + * ~~~ + * op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top + * op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom + * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) { + * if (0x8000 != aop | 0x8000 != bop) { + * res = (aop s* bop) >> 15; + * } else { + * res= 0x7FFF; + * OV = 1; + * } + * } + * Rd.W[x/2] = concat(rest, resb); + * for RV32: x=0, 2 + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKHM16(unsigned long long a, unsigned long long b) +{ + unsigned long long result; + __ASM volatile("dkhm16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for A.2. DKHM16 ===== */ + +/* ===== Inline Function Start for A.3. DKABS8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKABS8 (64-bit SIMD 8-bit Saturating Absolute) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKABS8 Rd, Rs1 + * # Rd, Rs1 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Get the absolute value of 8-bit signed integer elements simultaneously. + * + * **Description**:\n + * This instruction calculates the absolute value of 8-bit signed integer elements stored + * in Rs1 and writes the element results to Rd. If the input number is 0x80, this instruction generates + * 0x7f as the output and sets the OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.B[x]; + * if (src == 0x80) { + * src = 0x7f; + * OV = 1; + * } else if (src[7] == 1) + * src = -src; + * } + * Rd.B[x] = src; + * for RV32: x=7...0, + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKABS8(unsigned long long a) +{ + unsigned long long result; + __ASM volatile("dkabs8 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for A.3. DKABS8 ===== */ + +/* ===== Inline Function Start for A.4. DKABS16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKABS16 (64-bit SIMD 16-bit Saturating Absolute) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKABS16 Rd, Rs1 + * # Rd, Rs1 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Get the absolute value of 16-bit signed integer elements simultaneously. + * + * **Description**:\n + * This instruction calculates the absolute value of 16-bit signed integer elements stored + * in Rs1 and writes the element results to Rd. If the input number is 0x8000, this instruction + * generates 0x7fff as the output and sets the OV bit to 1. + * + * **Operations**:\n + * ~~~ + * src = Rs1.H[x]; + * if (src == 0x8000) { + * src = 0x7fff; + * OV = 1; + * } else if (src[15] == 1) + * src = -src; + * } + * Rd.H[x] = src; + * for RV32: x=3...0, + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKABS16(unsigned long long a) +{ + unsigned long long result; + __ASM volatile("dkabs16 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for A.4. DKABS16 ===== */ + +/* ===== Inline Function Start for A.5. DKSLRA8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKSLRA8 (64-bit SIMD 8-bit Shift Left Logical with Saturation or Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKSLRA8 Rd, Rs1, Rs2 + * # Rd, Rs1 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with + * Q7 saturation for the left shift. + * + * **Description**:\n + * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means + * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be + * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`. + * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. + * If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect + * this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[3:0] < 0) { + * sa = -Rs2[3:0]; + * sa = (sa == 8)? 7 : sa; + * Rd.B[x] = SE8(Rs1.B[x][7:sa]); + * } else { + * sa = Rs2[2:0]; + * res[(7+sa):0] = Rs1.B[x] <<(logic) sa; + * if (res > (2^7)-1) { + * res[7:0] = 0x7f; OV = 1; + * } else if (res < -2^7) { + * res[7:0] = 0x80; OV = 1; + * } + * Rd.B[x] = res[7:0]; + * } + * for RV32: x=7...0, + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA8(unsigned long long a, int b) +{ + unsigned long long result; + __ASM volatile("dkslra8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for A.5. DKSLRA8 ===== */ + +/* ===== Inline Function Start for A.6. DKSLRA16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKSLRA16 (64-bit SIMD 16-bit Shift Left Logical with Saturation or Shift Right Arithmetic) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKSLRA16 Rd, Rs1, Rs2 + * # Rd, Rs1 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with + * Q15 saturation for the left shift. + * + * **Description**:\n + * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically + * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means + * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the + * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be + * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`. + * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. + * After the shift, saturation, or rounding, the final results are written to + * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect + * this instruction. + * + * **Operations**:\n + * ~~~ + * if (Rs2[4:0] < 0) { + * sa = -Rs2[4:0]; + * sa = (sa == 16)? 15 : sa; + * Rd.H[x] = SE16(Rs1.H[x][15:sa]); + * } else { + * sa = Rs2[3:0]; + * res[(15+sa):0] = Rs1.H[x] <<(logic) sa; + * if (res > (2^15)-1) { + * res[15:0] = 0x7fff; OV = 1; + * } else if (res < -2^15) { + * res[15:0] = 0x8000; OV = 1; + * } + * d.H[x] = res[15:0]; + * } + * for RV32: x=3...0, + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b int type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA16(unsigned long long a, int b) +{ + unsigned long long result; + __ASM volatile("dkslra16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for A.6. DKSLRA16 ===== */ + +/* ===== Inline Function Start for A.7. DKADD8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKADD8 (64-bit SIMD 8-bit Signed Saturating Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKADD8 Rd, Rs1, Rs2 + * # Rd, Rs1, Rs2 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed integer element saturating additions simultaneously. + * + * **Description**:\n + * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed + * integer elements in Rs2. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), they + * are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.B[x] + Rs2.B[x]; + * if (res[x] > 127) { + * res[x] = 127; + * OV = 1; + * } else if (res[x] < -128) { + * res[x] = -128; + * OV = 1; + * } + * Rd.B[x] = res[x]; + * for RV32: x=7...0, + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKADD8(unsigned long long a, unsigned long long b) +{ + unsigned long long result; + __ASM volatile("dkadd8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for A.7. DKADD8 ===== */ + +/* ===== Inline Function Start for A.8. DKADD16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKADD16 (64-bit SIMD 16-bit Signed Saturating Addition) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKADD16 Rd, Rs1, Rs2 + * # Rd, Rs1, Rs2 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer element saturating additions simultaneously. + * + * **Description**:\n + * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed + * integer elements in Rs2. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1), + * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.H[x] + Rs2.H[x]; + * if (res[x] > 32767) { + * res[x] = 32767; + * OV = 1; + * } else if (res[x] < -32768) { + * res[x] = -32768; + * OV = 1; + * } + * Rd.H[x] = res[x]; + * for RV32: x=3...0, + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKADD16(unsigned long long a, unsigned long long b) +{ + unsigned long long result; + __ASM volatile("dkadd16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for A.8. DKADD16 ===== */ + +/* ===== Inline Function Start for A.10. DKSUB8 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKSUB8 (64-bit SIMD 8-bit Signed Saturating Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKSUB8 Rd, Rs1, Rs2 + * # Rd, Rs1, Rs2 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Do 8-bit signed elements saturating subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit + * signed integer elements in Rs1. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), + * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.B[x] - Rs2.B[x]; + * if (res[x] > (2^7)-1) { + * res[x] = (2^7)-1; + * OV = 1; + * } else if (res[x] < -2^7) { + * res[x] = -2^7; + * OV = 1; + * } + * Rd.B[x] = res[x]; + * for RV32: x=7...0, + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKSUB8(unsigned long long a, unsigned long long b) +{ + unsigned long long result; + __ASM volatile("dksub8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for A.9. DKSUB8 ===== */ + +/* ===== Inline Function Start for A.10. DKSUB16 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief DKSUB16 (64-bit SIMD 16-bit Signed Saturating Subtraction) + * \details + * **Type**: SIMD + * + * **Syntax**:\n + * ~~~ + * DKSUB16 Rd, Rs1, Rs2 + * # Rd, Rs1, Rs2 are all even/odd pair of registers + * ~~~ + * + * **Purpose**:\n + * Do 16-bit signed integer elements saturating subtractions simultaneously. + * + * **Description**:\n + * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit + * signed integer elements in Rs1. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= + * 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to + * Rd. + * + * **Operations**:\n + * ~~~ + * res[x] = Rs1.H[x] - Rs2.H[x]; + * if (res[x] > (2^15)-1) { + * res[x] = (2^15)-1; + * OV = 1; + * } else if (res[x] < -2^15) { + * res[x] = -2^15; + * OV = 1; + * } + * Rd.H[x] = res[x]; + * for RV32: x=3...0, + * ~~~ + * + * \param [in] a unsigned long long type of value stored in a + * \param [in] b unsigned long long type of value stored in b + * \return value stored in unsigned long long type + */ +__STATIC_FORCEINLINE unsigned long long __RV_DKSUB16(unsigned long long a, unsigned long long b) +{ + unsigned long long result; + __ASM volatile("dksub16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); + return result; +} +/* ===== Inline Function End for A.10. DKSUB16 ===== */ + +/* ===== Inline Function Start for A.11.1. EXPD80 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief EXPD80 (Expand and Copy Byte 0 to 32bit) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * EXPD80 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Copy 8-bit data from 32-bit chunks into 4 bytes in a register. + * + * **Description**:\n + * Moves Rs1.B[0][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0] + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:0] = CONCAT(Rs1.B[0][7:0], Rs1.B[0][7:0], Rs1.B[0][7:0], Rs1.B[0][7:0]); + * for RV32: x=0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_EXPD80(unsigned long a) +{ + unsigned long result; + __ASM volatile("expd80 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for A11.1. EXPD80 ===== */ + +/* ===== Inline Function Start for A.11.2. EXPD81 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief EXPD81 (Expand and Copy Byte 1 to 32bit) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * EXPD81 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Copy 8-bit data from 32-bit chunks into 4 bytes in a register. + * + * **Description**:\n + * Moves Rs1.B[1][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0] + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:0] = CONCAT(Rs1.B[1][7:0], Rs1.B[1][7:0], Rs1.B[1][7:0], Rs1.B[1][7:0]); + * for RV32: x=0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_EXPD81(unsigned long a) +{ + unsigned long result; + __ASM volatile("expd81 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for A11.2. EXPD81 ===== */ + +/* ===== Inline Function Start for A.11.3. EXPD82 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief EXPD82 (Expand and Copy Byte 2 to 32bit) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * EXPD82 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Copy 8-bit data from 32-bit chunks into 4 bytes in a register. + * + * **Description**:\n + * Moves Rs1.B[2][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0] + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:0] = CONCAT(Rs1.B[2][7:0], Rs1.B[2][7:0], Rs1.B[2][7:0], Rs1.B[2][7:0]); + * for RV32: x=0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_EXPD82(unsigned long a) +{ + unsigned long result; + __ASM volatile("expd82 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for A11.3. EXPD82 ===== */ + +/* ===== Inline Function Start for A.11.4. EXPD83 ===== */ +/** + * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM + * \brief EXPD83 (Expand and Copy Byte 3 to 32bit) + * \details + * **Type**: DSP + * + * **Syntax**:\n + * ~~~ + * EXPD83 Rd, Rs1 + * ~~~ + * + * **Purpose**:\n + * Copy 8-bit data from 32-bit chunks into 4 bytes in a register. + * + * **Description**:\n + * Moves Rs1.B[3][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0] + * + * **Operations**:\n + * ~~~ + * Rd.W[x][31:0] = CONCAT(Rs1.B[3][7:0], Rs1.B[3][7:0], Rs1.B[3][7:0], Rs1.B[3][7:0]); + * for RV32: x=0 + * ~~~ + * + * \param [in] a unsigned long type of value stored in a + * \return value stored in unsigned long type + */ +__STATIC_FORCEINLINE unsigned long __RV_EXPD83(unsigned long a) +{ + unsigned long result; + __ASM volatile("expd83 %0, %1" : "=r"(result) : "r"(a)); + return result; +} +/* ===== Inline Function End for A11.4. EXPD83 ===== */ +#endif /* __RISCV_XLEN == 32 */ + +#if defined(__RISCV_FEATURE_DSP) && (__RISCV_FEATURE_DSP == 1) +/* XXXXX ARM Compatiable SIMD API XXXXX */ +/** \brief Q setting quad 8-bit saturating addition. */ +#define __QADD8(x, y) __RV_KADD8(x, y) +/** \brief Q setting quad 8-bit saturating subtract. */ +#define __QSUB8(x, y) __RV_KSUB8((x), (y)) +/** \brief Q setting dual 16-bit saturating addition. */ +#define __QADD16(x, y) __RV_KADD16((x), (y)) +/** \brief Dual 16-bit signed addition with halved results. */ +#define __SHADD16(x, y) __RV_RADD16((x), (y)) +/** \brief Q setting dual 16-bit saturating subtract. */ +#define __QSUB16(x, y) __RV_KSUB16((x), (y)) +/** \brief Dual 16-bit signed subtraction with halved results. */ +#define __SHSUB16(x, y) __RV_RSUB16((x), (y)) +/** \brief Q setting dual 16-bit add and subtract with exchange. */ +#define __QASX(x, y) __RV_KCRAS16((x), (y)) +/** \brief Dual 16-bit signed addition and subtraction with halved results.*/ +#define __SHASX(x, y) __RV_RCRAS16((x), (y)) +/** \brief Q setting dual 16-bit subtract and add with exchange. */ +#define __QSAX(x, y) __RV_KCRSA16((x), (y)) +/** \brief Dual 16-bit signed subtraction and addition with halved results.*/ +#define __SHSAX(x, y) __RV_RCRSA16((x), (y)) +/** \brief Dual 16-bit signed multiply with exchange returning difference. */ +#define __SMUSDX(x, y) __RV_SMXDS((y), (x)) +/** \brief Q setting sum of dual 16-bit signed multiply with exchange. */ +__STATIC_FORCEINLINE int32_t __SMUADX (int32_t op1, int32_t op2) +{ + return (int32_t)__RV_KMXDA(op1, op2); +} +/** \brief Q setting saturating add. */ +#define __QADD(x, y) __RV_KADDW((x), (y)) +/** \brief Q setting saturating subtract. */ +#define __QSUB(x, y) __RV_KSUBW((x), (y)) +/** \brief Q setting dual 16-bit signed multiply with single 32-bit accumulator. */ +__STATIC_FORCEINLINE int32_t __SMLAD(int32_t op1, int32_t op2, int32_t op3) +{ + return (int32_t)__RV_KMADA(op3, op1, op2); +} +/** \brief Q setting pre-exchanged dual 16-bit signed multiply with single 32-bit accumulator. */ +__STATIC_FORCEINLINE int32_t __SMLADX(int32_t op1, int32_t op2, int32_t op3) +{ + return (int32_t)__RV_KMAXDA(op3, op1, op2); +} +/** \brief Q setting dual 16-bit signed multiply with exchange subtract with 32-bit accumulate. */ +__STATIC_FORCEINLINE int32_t __SMLSDX(int32_t op1, int32_t op2, int32_t op3) +{ + return (op3 - (int32_t)__RV_SMXDS(op1, op2)); +} +/** \brief Dual 16-bit signed multiply with single 64-bit accumulator. */ +__STATIC_FORCEINLINE int64_t __SMLALD(int32_t op1, int32_t op2, int64_t acc) +{ + return (int64_t)__RV_SMALDA(acc, op1, op2); +} +/** \brief Dual 16-bit signed multiply with exchange with single 64-bit accumulator. */ +__STATIC_FORCEINLINE int64_t __SMLALDX(int32_t op1, int32_t op2, int64_t acc) +{ + return (int64_t)__RV_SMALXDA(acc, op1, op2); +} +/** \brief Q setting sum of dual 16-bit signed multiply. */ +__STATIC_FORCEINLINE int32_t __SMUAD(int32_t op1, int32_t op2) +{ + return (int32_t)__RV_KMDA(op1, op2); +} +/** \brief Dual 16-bit signed multiply returning difference. */ +__STATIC_FORCEINLINE int32_t __SMUSD(int32_t op1, int32_t op2) +{ + return (int32_t)__RV_SMDRS(op1, op2); +} +/** \brief Dual extract 8-bits and sign extend each to 16-bits. */ +#define __SXTB16(x) __RV_SUNPKD820(x) +/** \brief Dual extracted 8-bit to 16-bit signed addition. TODO Need test */ +__STATIC_FORCEINLINE int32_t __SXTAB16(uint32_t op1, uint32_t op2) +{ + return __RV_ADD16(op1, __RV_SUNPKD830(op2)); +} +/** \brief 32-bit signed multiply with 32-bit truncated accumulator. */ +__STATIC_FORCEINLINE int32_t __SMMLA(int32_t op1, int32_t op2, int32_t op3) +{ + int32_t mul; + mul = (int32_t)__RV_SMMUL(op1, op2); + return (op3 + mul); +} +#define __DKHM8 __RV_DKHM8 +#define __DKHM16 __RV_DKHM16 +#define __DKSUB16 __RV_DKSUB16 +#define __SMAQA __RV_SMAQA +#define __MULSR64 __RV_MULSR64 +#define __DQADD8 __RV_DKADD8 +#define __DQSUB8 __RV_DKSUB8 +#define __DKADD16 __RV_DKADD16 +#define __PKBB16 __RV_PKBB16 +#define __DKSLRA16 __RV_DKSLRA16 +#define __DKSLRA8 __RV_DKSLRA8 +#define __KABSW __RV_KABSW +#define __DKABS8 __RV_DKABS8 +#define __DKABS16 __RV_DKABS16 +#define __SMALDA __RV_SMALDA +#define __SMSLDA __RV_SMSLDA +#define __SMALBB __RV_SMALBB +#define __SUB64 __RV_SUB64 +#define __ADD64 __RV_ADD64 +#define __SMBB16 __RV_SMBB16 +#define __SMBT16 __RV_SMBT16 +#define __SMTT16 __RV_SMTT16 +#define __EXPD80 __RV_EXPD80 +#define __SMAX8 __RV_SMAX8 +#define __SMAX16 __RV_SMAX16 +#define __PKTT16 __RV_PKTT16 +#define __KADD16 __RV_KADD16 +#define __SADD16 __RV_ADD16 + +#endif /* (__RISCV_FEATURE_DSP == 1) */ + +#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */ + +/** \brief Halfword packing instruction. Combines bits[15:0] of val1 with bits[31:16] of val2 levitated with the val3. */ +#define __PKHBT(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0x0000FFFFUL) | \ + ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) ) +/** \brief Halfword packing instruction. Combines bits[31:16] of val1 with bits[15:0] of val2 right-shifted with the val3. */ +#define __PKHTB(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0xFFFF0000UL) | \ + ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL) ) + +#ifdef __cplusplus +} +#endif + +#endif /* __CORE_FEATURE_DSP__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_eclic.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_eclic.h new file mode 100644 index 00000000..c579c17e --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_eclic.h @@ -0,0 +1,897 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CORE_FEATURE_ECLIC__ +#define __CORE_FEATURE_ECLIC__ +/*! + * @file core_feature_eclic.h + * @brief ECLIC feature API header file for Nuclei N/NX Core + */ +/* + * ECLIC Feature Configuration Macro: + * 1. __ECLIC_PRESENT: Define whether Enhanced Core Local Interrupt Controller (ECLIC) Unit is present or not + * * 0: Not present + * * 1: Present + * 2. __ECLIC_BASEADDR: Base address of the ECLIC unit. + * 3. ECLIC_GetInfoCtlbits(): Define the number of hardware bits are actually implemented in the clicintctl registers. + * Valid number is 1 - 8. + * 4. __ECLIC_INTNUM : Define the external interrupt number of ECLIC Unit + * + */ +#ifdef __cplusplus + extern "C" { +#endif + +#if defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1) +/** + * \defgroup NMSIS_Core_ECLIC_Registers Register Define and Type Definitions Of ECLIC + * \ingroup NMSIS_Core_Registers + * \brief Type definitions and defines for eclic registers. + * + * @{ + */ + +/** + * \brief Union type to access CLICFG configure register. + */ +typedef union +{ + struct { + uint8_t _reserved0:1; /*!< bit: 0 Overflow condition code flag */ + uint8_t nlbits:4; /*!< bit: 29 Carry condition code flag */ + uint8_t _reserved1:2; /*!< bit: 30 Zero condition code flag */ + uint8_t _reserved2:1; /*!< bit: 31 Negative condition code flag */ + } b; /*!< Structure used for bit access */ + uint8_t w; /*!< Type used for byte access */ +} CLICCFG_Type; + +/** + * \brief Union type to access CLICINFO information register. + */ +typedef union { + struct { + uint32_t numint:13; /*!< bit: 0..12 number of maximum interrupt inputs supported */ + uint32_t version:8; /*!< bit: 13..20 20:17 for architecture version,16:13 for implementation version */ + uint32_t intctlbits:4; /*!< bit: 21..24 specifies how many hardware bits are actually implemented in the clicintctl registers */ + uint32_t _reserved0:7; /*!< bit: 25..31 Reserved */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} CLICINFO_Type; + +/** + * \brief Access to the structure of a vector interrupt controller. + */ +typedef struct { + __IOM uint8_t INTIP; /*!< Offset: 0x000 (R/W) Interrupt set pending register */ + __IOM uint8_t INTIE; /*!< Offset: 0x001 (R/W) Interrupt set enable register */ + __IOM uint8_t INTATTR; /*!< Offset: 0x002 (R/W) Interrupt set attributes register */ + __IOM uint8_t INTCTRL; /*!< Offset: 0x003 (R/W) Interrupt configure register */ +} CLIC_CTRL_Type; + +typedef struct { + __IOM uint8_t CFG; /*!< Offset: 0x000 (R/W) CLIC configuration register */ + uint8_t RESERVED0[3]; + __IM uint32_t INFO; /*!< Offset: 0x004 (R/ ) CLIC information register */ + uint8_t RESERVED1[3]; + __IOM uint8_t MTH; /*!< Offset: 0x00B (R/W) CLIC machine mode threshold register */ + uint32_t RESERVED2[0x3FD]; + CLIC_CTRL_Type CTRL[4096]; /*!< Offset: 0x1000 (R/W) CLIC register structure for INTIP, INTIE, INTATTR, INTCTL */ +} CLIC_Type; + +#define CLIC_CLICCFG_NLBIT_Pos 1U /*!< CLIC CLICCFG: NLBIT Position */ +#define CLIC_CLICCFG_NLBIT_Msk (0xFUL << CLIC_CLICCFG_NLBIT_Pos) /*!< CLIC CLICCFG: NLBIT Mask */ + +#define CLIC_CLICINFO_CTLBIT_Pos 21U /*!< CLIC INTINFO: __ECLIC_GetInfoCtlbits() Position */ +#define CLIC_CLICINFO_CTLBIT_Msk (0xFUL << CLIC_CLICINFO_CTLBIT_Pos) /*!< CLIC INTINFO: __ECLIC_GetInfoCtlbits() Mask */ + +#define CLIC_CLICINFO_VER_Pos 13U /*!< CLIC CLICINFO: VERSION Position */ +#define CLIC_CLICINFO_VER_Msk (0xFFUL << CLIC_CLICCFG_NLBIT_Pos) /*!< CLIC CLICINFO: VERSION Mask */ + +#define CLIC_CLICINFO_NUM_Pos 0U /*!< CLIC CLICINFO: NUM Position */ +#define CLIC_CLICINFO_NUM_Msk (0xFFFUL << CLIC_CLICINFO_NUM_Pos) /*!< CLIC CLICINFO: NUM Mask */ + +#define CLIC_INTIP_IP_Pos 0U /*!< CLIC INTIP: IP Position */ +#define CLIC_INTIP_IP_Msk (0x1UL << CLIC_INTIP_IP_Pos) /*!< CLIC INTIP: IP Mask */ + +#define CLIC_INTIE_IE_Pos 0U /*!< CLIC INTIE: IE Position */ +#define CLIC_INTIE_IE_Msk (0x1UL << CLIC_INTIE_IE_Pos) /*!< CLIC INTIE: IE Mask */ + +#define CLIC_INTATTR_TRIG_Pos 1U /*!< CLIC INTATTR: TRIG Position */ +#define CLIC_INTATTR_TRIG_Msk (0x3UL << CLIC_INTATTR_TRIG_Pos) /*!< CLIC INTATTR: TRIG Mask */ + +#define CLIC_INTATTR_SHV_Pos 0U /*!< CLIC INTATTR: SHV Position */ +#define CLIC_INTATTR_SHV_Msk (0x1UL << CLIC_INTATTR_SHV_Pos) /*!< CLIC INTATTR: SHV Mask */ + +#define ECLIC_MAX_NLBITS 8U /*!< Max nlbit of the CLICINTCTLBITS */ +#define ECLIC_MODE_MTVEC_Msk 3U /*!< ECLIC Mode mask for MTVT CSR Register */ + +#define ECLIC_NON_VECTOR_INTERRUPT 0x0 /*!< Non-Vector Interrupt Mode of ECLIC */ +#define ECLIC_VECTOR_INTERRUPT 0x1 /*!< Vector Interrupt Mode of ECLIC */ + +/**\brief ECLIC Trigger Enum for different Trigger Type */ +typedef enum ECLIC_TRIGGER { + ECLIC_LEVEL_TRIGGER = 0x0, /*!< Level Triggerred, trig[0] = 0 */ + ECLIC_POSTIVE_EDGE_TRIGGER = 0x1, /*!< Postive/Rising Edge Triggered, trig[1] = 0, trig[0] = 1 */ + ECLIC_NEGTIVE_EDGE_TRIGGER = 0x3, /*!< Negtive/Falling Edge Triggered, trig[1] = 1, trig[0] = 0 */ + ECLIC_MAX_TRIGGER = 0x3 /*!< MAX Supported Trigger Mode */ +} ECLIC_TRIGGER_Type; + +#ifndef __ECLIC_BASEADDR +/* Base address of ECLIC(__ECLIC_BASEADDR) should be defined in */ +#error "__ECLIC_BASEADDR is not defined, please check!" +#endif + +#ifndef __ECLIC_INTCTLBITS +/* Define __ECLIC_INTCTLBITS to get via ECLIC->INFO if not defined */ +#define __ECLIC_INTCTLBITS (__ECLIC_GetInfoCtlbits()) +#endif + +/* ECLIC Memory mapping of Device */ +#define ECLIC_BASE __ECLIC_BASEADDR /*!< ECLIC Base Address */ +#define ECLIC ((CLIC_Type *) ECLIC_BASE) /*!< CLIC configuration struct */ + +/** @} */ /* end of group NMSIS_Core_ECLIC_Registers */ + +/* ########################## ECLIC functions #################################### */ +/** + * \defgroup NMSIS_Core_IntExc Interrupts and Exceptions + * \brief Functions that manage interrupts and exceptions via the ECLIC. + * + * @{ + */ + +/** + * \brief Definition of IRQn numbers + * \details + * The core interrupt enumeration names for IRQn values are defined in the file .h. + * - Interrupt ID(IRQn) from 0 to 18 are reserved for core internal interrupts. + * - Interrupt ID(IRQn) start from 19 represent device-specific external interrupts. + * - The first device-specific interrupt has the IRQn value 19. + * + * The table below describes the core interrupt names and their availability in various Nuclei Cores. + */ +/* The following enum IRQn definition in this file + * is only used for doxygen documentation generation, + * The .h is the real file to define it by vendor + */ +#if defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__) +typedef enum IRQn { + /* ========= Nuclei N/NX Core Specific Interrupt Numbers =========== */ + /* Core Internal Interrupt IRQn definitions */ + Reserved0_IRQn = 0, /*!< Internal reserved */ + Reserved1_IRQn = 1, /*!< Internal reserved */ + Reserved2_IRQn = 2, /*!< Internal reserved */ + SysTimerSW_IRQn = 3, /*!< System Timer SW interrupt */ + Reserved3_IRQn = 4, /*!< Internal reserved */ + Reserved4_IRQn = 5, /*!< Internal reserved */ + Reserved5_IRQn = 6, /*!< Internal reserved */ + SysTimer_IRQn = 7, /*!< System Timer Interrupt */ + Reserved6_IRQn = 8, /*!< Internal reserved */ + Reserved7_IRQn = 9, /*!< Internal reserved */ + Reserved8_IRQn = 10, /*!< Internal reserved */ + Reserved9_IRQn = 11, /*!< Internal reserved */ + Reserved10_IRQn = 12, /*!< Internal reserved */ + Reserved11_IRQn = 13, /*!< Internal reserved */ + Reserved12_IRQn = 14, /*!< Internal reserved */ + Reserved13_IRQn = 15, /*!< Internal reserved */ + Reserved14_IRQn = 16, /*!< Internal reserved */ + Reserved15_IRQn = 17, /*!< Internal reserved */ + Reserved16_IRQn = 18, /*!< Internal reserved */ + + /* ========= Device Specific Interrupt Numbers =================== */ + /* ToDo: add here your device specific external interrupt numbers. + * 19~max(NUM_INTERRUPT, 1023) is reserved number for user. + * Maxmum interrupt supported could get from clicinfo.NUM_INTERRUPT. + * According the interrupt handlers defined in startup_Device.S + * eg.: Interrupt for Timer#1 eclic_tim1_handler -> TIM1_IRQn */ + FirstDeviceSpecificInterrupt_IRQn = 19, /*!< First Device Specific Interrupt */ + SOC_INT_MAX, /*!< Number of total interrupts */ +} IRQn_Type; +#endif /* __ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__ */ + +#ifdef NMSIS_ECLIC_VIRTUAL + #ifndef NMSIS_ECLIC_VIRTUAL_HEADER_FILE + #define NMSIS_ECLIC_VIRTUAL_HEADER_FILE "nmsis_eclic_virtual.h" + #endif + #include NMSIS_ECLIC_VIRTUAL_HEADER_FILE +#else + #define ECLIC_SetCfgNlbits __ECLIC_SetCfgNlbits + #define ECLIC_GetCfgNlbits __ECLIC_GetCfgNlbits + #define ECLIC_GetInfoVer __ECLIC_GetInfoVer + #define ECLIC_GetInfoCtlbits __ECLIC_GetInfoCtlbits + #define ECLIC_GetInfoNum __ECLIC_GetInfoNum + #define ECLIC_SetMth __ECLIC_SetMth + #define ECLIC_GetMth __ECLIC_GetMth + #define ECLIC_EnableIRQ __ECLIC_EnableIRQ + #define ECLIC_GetEnableIRQ __ECLIC_GetEnableIRQ + #define ECLIC_DisableIRQ __ECLIC_DisableIRQ + #define ECLIC_SetPendingIRQ __ECLIC_SetPendingIRQ + #define ECLIC_GetPendingIRQ __ECLIC_GetPendingIRQ + #define ECLIC_ClearPendingIRQ __ECLIC_ClearPendingIRQ + #define ECLIC_SetTrigIRQ __ECLIC_SetTrigIRQ + #define ECLIC_GetTrigIRQ __ECLIC_GetTrigIRQ + #define ECLIC_SetShvIRQ __ECLIC_SetShvIRQ + #define ECLIC_GetShvIRQ __ECLIC_GetShvIRQ + #define ECLIC_SetCtrlIRQ __ECLIC_SetCtrlIRQ + #define ECLIC_GetCtrlIRQ __ECLIC_GetCtrlIRQ + #define ECLIC_SetLevelIRQ __ECLIC_SetLevelIRQ + #define ECLIC_GetLevelIRQ __ECLIC_GetLevelIRQ + #define ECLIC_SetPriorityIRQ __ECLIC_SetPriorityIRQ + #define ECLIC_GetPriorityIRQ __ECLIC_GetPriorityIRQ + +#endif /* NMSIS_ECLIC_VIRTUAL */ + +#ifdef NMSIS_VECTAB_VIRTUAL + #ifndef NMSIS_VECTAB_VIRTUAL_HEADER_FILE + #define NMSIS_VECTAB_VIRTUAL_HEADER_FILE "nmsis_vectab_virtual.h" + #endif + #include NMSIS_VECTAB_VIRTUAL_HEADER_FILE +#else + #define ECLIC_SetVector __ECLIC_SetVector + #define ECLIC_GetVector __ECLIC_GetVector +#endif /* (NMSIS_VECTAB_VIRTUAL) */ + +/** + * \brief Set nlbits value + * \details + * This function set the nlbits value of CLICCFG register. + * \param [in] nlbits nlbits value + * \remarks + * - nlbits is used to set the width of level in the CLICINTCTL[i]. + * \sa + * - \ref ECLIC_GetCfgNlbits + */ +__STATIC_FORCEINLINE void __ECLIC_SetCfgNlbits(uint32_t nlbits) +{ + ECLIC->CFG &= ~CLIC_CLICCFG_NLBIT_Msk; + ECLIC->CFG |= (uint8_t)((nlbits <CFG & CLIC_CLICCFG_NLBIT_Msk) >> CLIC_CLICCFG_NLBIT_Pos)); +} + +/** + * \brief Get the ECLIC version number + * \details + * This function gets the hardware version information from CLICINFO register. + * \return hardware version number in CLICINFO register. + * \remarks + * - This function gets harware version information from CLICINFO register. + * - Bit 20:17 for architecture version, bit 16:13 for implementation version. + * \sa + * - \ref ECLIC_GetInfoNum +*/ +__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoVer(void) +{ + return ((uint32_t)((ECLIC->INFO & CLIC_CLICINFO_VER_Msk) >> CLIC_CLICINFO_VER_Pos)); +} + +/** + * \brief Get CLICINTCTLBITS + * \details + * This function gets CLICINTCTLBITS from CLICINFO register. + * \return CLICINTCTLBITS from CLICINFO register. + * \remarks + * - In the CLICINTCTL[i] registers, with 2 <= CLICINTCTLBITS <= 8. + * - The implemented bits are kept left-justified in the most-significant bits of each 8-bit + * CLICINTCTL[I] register, with the lower unimplemented bits treated as hardwired to 1. + * \sa + * - \ref ECLIC_GetInfoNum + */ +__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoCtlbits(void) +{ + return ((uint32_t)((ECLIC->INFO & CLIC_CLICINFO_CTLBIT_Msk) >> CLIC_CLICINFO_CTLBIT_Pos)); +} + +/** + * \brief Get number of maximum interrupt inputs supported + * \details + * This function gets number of maximum interrupt inputs supported from CLICINFO register. + * \return number of maximum interrupt inputs supported from CLICINFO register. + * \remarks + * - This function gets number of maximum interrupt inputs supported from CLICINFO register. + * - The num_interrupt field specifies the actual number of maximum interrupt inputs supported in this implementation. + * \sa + * - \ref ECLIC_GetInfoCtlbits + */ +__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoNum(void) +{ + return ((uint32_t)((ECLIC->INFO & CLIC_CLICINFO_NUM_Msk) >> CLIC_CLICINFO_NUM_Pos)); +} + +/** + * \brief Set Machine Mode Interrupt Level Threshold + * \details + * This function sets machine mode interrupt level threshold. + * \param [in] mth Interrupt Level Threshold. + * \sa + * - \ref ECLIC_GetMth + */ +__STATIC_FORCEINLINE void __ECLIC_SetMth(uint8_t mth) +{ + ECLIC->MTH = mth; +} + +/** + * \brief Get Machine Mode Interrupt Level Threshold + * \details + * This function gets machine mode interrupt level threshold. + * \return Interrupt Level Threshold. + * \sa + * - \ref ECLIC_SetMth + */ +__STATIC_FORCEINLINE uint8_t __ECLIC_GetMth(void) +{ + return (ECLIC->MTH); +} + + +/** + * \brief Enable a specific interrupt + * \details + * This function enables the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_DisableIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_EnableIRQ(IRQn_Type IRQn) +{ + ECLIC->CTRL[IRQn].INTIE |= CLIC_INTIE_IE_Msk; +} + +/** + * \brief Get a specific interrupt enable status + * \details + * This function returns the interrupt enable status for the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \returns + * - 0 Interrupt is not enabled + * - 1 Interrupt is pending + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_EnableIRQ + * - \ref ECLIC_DisableIRQ + */ +__STATIC_FORCEINLINE uint32_t __ECLIC_GetEnableIRQ(IRQn_Type IRQn) +{ + return((uint32_t) (ECLIC->CTRL[IRQn].INTIE) & CLIC_INTIE_IE_Msk); +} + +/** + * \brief Disable a specific interrupt + * \details + * This function disables the specific interrupt \em IRQn. + * \param [in] IRQn Number of the external interrupt to disable + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_EnableIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_DisableIRQ(IRQn_Type IRQn) +{ + ECLIC->CTRL[IRQn].INTIE &= ~CLIC_INTIE_IE_Msk; +} + +/** + * \brief Get the pending specific interrupt + * \details + * This function returns the pending status of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \returns + * - 0 Interrupt is not pending + * - 1 Interrupt is pending + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_SetPendingIRQ + * - \ref ECLIC_ClearPendingIRQ + */ +__STATIC_FORCEINLINE int32_t __ECLIC_GetPendingIRQ(IRQn_Type IRQn) +{ + return((uint32_t)(ECLIC->CTRL[IRQn].INTIP) & CLIC_INTIP_IP_Msk); +} + +/** + * \brief Set a specific interrupt to pending + * \details + * This function sets the pending bit for the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_GetPendingIRQ + * - \ref ECLIC_ClearPendingIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_SetPendingIRQ(IRQn_Type IRQn) +{ + ECLIC->CTRL[IRQn].INTIP |= CLIC_INTIP_IP_Msk; +} + +/** + * \brief Clear a specific interrupt from pending + * \details + * This function removes the pending state of the specific interrupt \em IRQn. + * \em IRQn cannot be a negative number. + * \param [in] IRQn Interrupt number + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_SetPendingIRQ + * - \ref ECLIC_GetPendingIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_ClearPendingIRQ(IRQn_Type IRQn) +{ + ECLIC->CTRL[IRQn].INTIP &= ~ CLIC_INTIP_IP_Msk; +} + +/** + * \brief Set trigger mode and polarity for a specific interrupt + * \details + * This function set trigger mode and polarity of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \param [in] trig + * - 00 level trigger, \ref ECLIC_LEVEL_TRIGGER + * - 01 positive edge trigger, \ref ECLIC_POSTIVE_EDGE_TRIGGER + * - 02 level trigger, \ref ECLIC_LEVEL_TRIGGER + * - 03 negative edge trigger, \ref ECLIC_NEGTIVE_EDGE_TRIGGER + * \remarks + * - IRQn must not be negative. + * + * \sa + * - \ref ECLIC_GetTrigIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_SetTrigIRQ(IRQn_Type IRQn, uint32_t trig) +{ + ECLIC->CTRL[IRQn].INTATTR &= ~CLIC_INTATTR_TRIG_Msk; + ECLIC->CTRL[IRQn].INTATTR |= (uint8_t)(trig<CTRL[IRQn].INTATTR) & CLIC_INTATTR_TRIG_Msk)>>CLIC_INTATTR_TRIG_Pos)); +} + +/** + * \brief Set interrupt working mode for a specific interrupt + * \details + * This function set selective hardware vector or non-vector working mode of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \param [in] shv + * - 0 non-vector mode, \ref ECLIC_NON_VECTOR_INTERRUPT + * - 1 vector mode, \ref ECLIC_VECTOR_INTERRUPT + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_GetShvIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_SetShvIRQ(IRQn_Type IRQn, uint32_t shv) +{ + ECLIC->CTRL[IRQn].INTATTR &= ~CLIC_INTATTR_SHV_Msk; + ECLIC->CTRL[IRQn].INTATTR |= (uint8_t)(shv<CTRL[IRQn].INTATTR) & CLIC_INTATTR_SHV_Msk)>>CLIC_INTATTR_SHV_Pos)); +} + +/** + * \brief Modify ECLIC Interrupt Input Control Register for a specific interrupt + * \details + * This function modify ECLIC Interrupt Input Control(CLICINTCTL[i]) register of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \param [in] intctrl Set value for CLICINTCTL[i] register + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_GetCtrlIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_SetCtrlIRQ(IRQn_Type IRQn, uint8_t intctrl) +{ + ECLIC->CTRL[IRQn].INTCTRL = intctrl; +} + +/** + * \brief Get ECLIC Interrupt Input Control Register value for a specific interrupt + * \details + * This function modify ECLIC Interrupt Input Control register of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \return value of ECLIC Interrupt Input Control register + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_SetCtrlIRQ + */ +__STATIC_FORCEINLINE uint8_t __ECLIC_GetCtrlIRQ(IRQn_Type IRQn) +{ + return (ECLIC->CTRL[IRQn].INTCTRL); +} + +/** + * \brief Set ECLIC Interrupt level of a specific interrupt + * \details + * This function set interrupt level of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \param [in] lvl_abs Interrupt level + * \remarks + * - IRQn must not be negative. + * - If lvl_abs to be set is larger than the max level allowed, it will be force to be max level. + * - When you set level value you need use clciinfo.nlbits to get the width of level. + * Then we could know the maximum of level. CLICINTCTLBITS is how many total bits are + * present in the CLICINTCTL register. + * \sa + * - \ref ECLIC_GetLevelIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_SetLevelIRQ(IRQn_Type IRQn, uint8_t lvl_abs) +{ + uint8_t nlbits = __ECLIC_GetCfgNlbits(); + uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS; + + if (nlbits == 0) { + return; + } + + if (nlbits > intctlbits) { + nlbits = intctlbits; + } + uint8_t maxlvl = ((1 << nlbits) - 1); + if (lvl_abs > maxlvl) { + lvl_abs = maxlvl; + } + uint8_t lvl = lvl_abs << (ECLIC_MAX_NLBITS - nlbits); + uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn); + cur_ctrl = cur_ctrl << nlbits; + cur_ctrl = cur_ctrl >> nlbits; + __ECLIC_SetCtrlIRQ(IRQn, (cur_ctrl | lvl)); +} + +/** + * \brief Get ECLIC Interrupt level of a specific interrupt + * \details + * This function get interrupt level of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \return Interrupt level + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_SetLevelIRQ + */ +__STATIC_FORCEINLINE uint8_t __ECLIC_GetLevelIRQ(IRQn_Type IRQn) +{ + uint8_t nlbits = __ECLIC_GetCfgNlbits(); + uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS; + + if (nlbits == 0) { + return 0; + } + + if (nlbits > intctlbits) { + nlbits = intctlbits; + } + uint8_t intctrl = __ECLIC_GetCtrlIRQ(IRQn); + uint8_t lvl_abs = intctrl >> (ECLIC_MAX_NLBITS - nlbits); + return lvl_abs; +} + +/** + * \brief Get ECLIC Interrupt priority of a specific interrupt + * \details + * This function get interrupt priority of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \param [in] pri Interrupt priority + * \remarks + * - IRQn must not be negative. + * - If pri to be set is larger than the max priority allowed, it will be force to be max priority. + * - Priority width is CLICINTCTLBITS minus clciinfo.nlbits if clciinfo.nlbits + * is less than CLICINTCTLBITS. Otherwise priority width is 0. + * \sa + * - \ref ECLIC_GetPriorityIRQ + */ +__STATIC_FORCEINLINE void __ECLIC_SetPriorityIRQ(IRQn_Type IRQn, uint8_t pri) +{ + uint8_t nlbits = __ECLIC_GetCfgNlbits(); + uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS; + if (nlbits < intctlbits) { + uint8_t maxpri = ((1 << (intctlbits - nlbits)) - 1); + if (pri > maxpri) { + pri = maxpri; + } + pri = pri << (ECLIC_MAX_NLBITS - intctlbits); + uint8_t mask = ((uint8_t)(-1)) >> intctlbits; + pri = pri | mask; + uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn); + cur_ctrl = cur_ctrl >> (ECLIC_MAX_NLBITS - nlbits); + cur_ctrl = cur_ctrl << (ECLIC_MAX_NLBITS - nlbits); + __ECLIC_SetCtrlIRQ(IRQn, (cur_ctrl | pri)); + } +} + +/** + * \brief Get ECLIC Interrupt priority of a specific interrupt + * \details + * This function get interrupt priority of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \return Interrupt priority + * \remarks + * - IRQn must not be negative. + * \sa + * - \ref ECLIC_SetPriorityIRQ + */ +__STATIC_FORCEINLINE uint8_t __ECLIC_GetPriorityIRQ(IRQn_Type IRQn) +{ + uint8_t nlbits = __ECLIC_GetCfgNlbits(); + uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS; + if (nlbits < intctlbits) { + uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn); + uint8_t pri = cur_ctrl << nlbits; + pri = pri >> nlbits; + pri = pri >> (ECLIC_MAX_NLBITS - intctlbits); + return pri; + } else { + return 0; + } +} + +/** + * \brief Set Interrupt Vector of a specific interrupt + * \details + * This function set interrupt handler address of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \param [in] vector Interrupt handler address + * \remarks + * - IRQn must not be negative. + * - You can set the \ref CSR_CSR_MTVT to set interrupt vector table entry address. + * - If your vector table is placed in readonly section, the vector for IRQn will not be modified. + * For this case, you need to use the correct irq handler name defined in your vector table as + * your irq handler function name. + * - This function will only work correctly when the vector table is placed in an read-write enabled section. + * \sa + * - \ref ECLIC_GetVector + */ +__STATIC_FORCEINLINE void __ECLIC_SetVector(IRQn_Type IRQn, rv_csr_t vector) +{ +#if __RISCV_XLEN == 32 + volatile uint32_t vec_base; + vec_base = ((uint32_t)__RV_CSR_READ(CSR_MTVT)); + (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 4)) = vector; +#elif __RISCV_XLEN == 64 + volatile uint64_t vec_base; + vec_base = ((uint64_t)__RV_CSR_READ(CSR_MTVT)); + (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 8)) = vector; +#else // TODO Need cover for XLEN=128 case in future + volatile uint64_t vec_base; + vec_base = ((uint64_t)__RV_CSR_READ(CSR_MTVT)); + (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 8)) = vector; +#endif +} + +/** + * \brief Get Interrupt Vector of a specific interrupt + * \details + * This function get interrupt handler address of the specific interrupt \em IRQn. + * \param [in] IRQn Interrupt number + * \return Interrupt handler address + * \remarks + * - IRQn must not be negative. + * - You can read \ref CSR_CSR_MTVT to get interrupt vector table entry address. + * \sa + * - \ref ECLIC_SetVector + */ +__STATIC_FORCEINLINE rv_csr_t __ECLIC_GetVector(IRQn_Type IRQn) +{ +#if __RISCV_XLEN == 32 + return (*(uint32_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*4)); +#elif __RISCV_XLEN == 64 + return (*(uint64_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*8)); +#else // TODO Need cover for XLEN=128 case in future + return (*(uint64_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*8)); +#endif +} + +/** + * \brief Set Exception entry address + * \details + * This function set exception handler address to 'CSR_MTVEC'. + * \param [in] addr Exception handler address + * \remarks + * - This function use to set exception handler address to 'CSR_MTVEC'. Address is 4 bytes align. + * \sa + * - \ref __get_exc_entry + */ +__STATIC_FORCEINLINE void __set_exc_entry(rv_csr_t addr) +{ + addr &= (rv_csr_t)(~0x3F); + addr |= ECLIC_MODE_MTVEC_Msk; + __RV_CSR_WRITE(CSR_MTVEC, addr); +} + +/** + * \brief Get Exception entry address + * \details + * This function get exception handler address from 'CSR_MTVEC'. + * \return Exception handler address + * \remarks + * - This function use to get exception handler address from 'CSR_MTVEC'. Address is 4 bytes align + * \sa + * - \ref __set_exc_entry + */ +__STATIC_FORCEINLINE rv_csr_t __get_exc_entry(void) +{ + unsigned long addr = __RV_CSR_READ(CSR_MTVEC); + return (addr & ~ECLIC_MODE_MTVEC_Msk); +} + +/** + * \brief Set Non-vector interrupt entry address + * \details + * This function set Non-vector interrupt address. + * \param [in] addr Non-vector interrupt entry address + * \remarks + * - This function use to set non-vector interrupt entry address to 'CSR_MTVT2' if + * - CSR_MTVT2 bit0 is 1. If 'CSR_MTVT2' bit0 is 0 then set address to 'CSR_MTVEC' + * \sa + * - \ref __get_nonvec_entry + */ +__STATIC_FORCEINLINE void __set_nonvec_entry(rv_csr_t addr) +{ + if (__RV_CSR_READ(CSR_MTVT2) & 0x1){ + __RV_CSR_WRITE(CSR_MTVT2, addr | 0x01); + } else { + addr &= (rv_csr_t)(~0x3F); + addr |= ECLIC_MODE_MTVEC_Msk; + __RV_CSR_WRITE(CSR_MTVEC, addr); + } +} + +/** + * \brief Get Non-vector interrupt entry address + * \details + * This function get Non-vector interrupt address. + * \return Non-vector interrupt handler address + * \remarks + * - This function use to get non-vector interrupt entry address from 'CSR_MTVT2' if + * - CSR_MTVT2 bit0 is 1. If 'CSR_MTVT2' bit0 is 0 then get address from 'CSR_MTVEC'. + * \sa + * - \ref __set_nonvec_entry + */ +__STATIC_FORCEINLINE rv_csr_t __get_nonvec_entry(void) +{ + if (__RV_CSR_READ(CSR_MTVT2) & 0x1) { + return __RV_CSR_READ(CSR_MTVT2) & (~(rv_csr_t)(0x1)); + } else { + rv_csr_t addr = __RV_CSR_READ(CSR_MTVEC); + return (addr & ~ECLIC_MODE_MTVEC_Msk); + } +} + +/** + * \brief Get NMI interrupt entry from 'CSR_MNVEC' + * \details + * This function get NMI interrupt address from 'CSR_MNVEC'. + * \return NMI interrupt handler address + * \remarks + * - This function use to get NMI interrupt handler address from 'CSR_MNVEC'. If CSR_MMISC_CTL[9] = 1 'CSR_MNVEC' + * - will be equal as mtvec. If CSR_MMISC_CTL[9] = 0 'CSR_MNVEC' will be equal as reset vector. + * - NMI entry is defined via \ref CSR_MMISC_CTL, writing to \ref CSR_MNVEC will be ignored. + */ +__STATIC_FORCEINLINE rv_csr_t __get_nmi_entry(void) +{ + return __RV_CSR_READ(CSR_MNVEC); +} + +/** + * \brief Save necessary CSRs into variables for vector interrupt nesting + * \details + * This macro is used to declare variables which are used for saving + * CSRs(MCAUSE, MEPC, MSUB), and it will read these CSR content into + * these variables, it need to be used in a vector-interrupt if nesting + * is required. + * \remarks + * - Interrupt will be enabled after this macro is called + * - It need to be used together with \ref RESTORE_IRQ_CSR_CONTEXT + * - Don't use variable names __mcause, __mpec, __msubm in your ISR code + * - If you want to enable interrupt nesting feature for vector interrupt, + * you can do it like this: + * \code + * // __INTERRUPT attribute will generates function entry and exit sequences suitable + * // for use in an interrupt handler when this attribute is present + * __INTERRUPT void eclic_mtip_handler(void) + * { + * // Must call this to save CSRs + * SAVE_IRQ_CSR_CONTEXT(); + * // !!!Interrupt is enabled here!!! + * // !!!Higher priority interrupt could nest it!!! + * + * // put you own interrupt handling code here + * + * // Must call this to restore CSRs + * RESTORE_IRQ_CSR_CONTEXT(); + * } + * \endcode + */ +#define SAVE_IRQ_CSR_CONTEXT() \ + rv_csr_t __mcause = __RV_CSR_READ(CSR_MCAUSE); \ + rv_csr_t __mepc = __RV_CSR_READ(CSR_MEPC); \ + rv_csr_t __msubm = __RV_CSR_READ(CSR_MSUBM); \ + __enable_irq(); + +/** + * \brief Restore necessary CSRs from variables for vector interrupt nesting + * \details + * This macro is used restore CSRs(MCAUSE, MEPC, MSUB) from pre-defined variables + * in \ref SAVE_IRQ_CSR_CONTEXT macro. + * \remarks + * - Interrupt will be disabled after this macro is called + * - It need to be used together with \ref SAVE_IRQ_CSR_CONTEXT + */ +#define RESTORE_IRQ_CSR_CONTEXT() \ + __disable_irq(); \ + __RV_CSR_WRITE(CSR_MSUBM, __msubm); \ + __RV_CSR_WRITE(CSR_MEPC, __mepc); \ + __RV_CSR_WRITE(CSR_MCAUSE, __mcause); + +/** @} */ /* End of Doxygen Group NMSIS_Core_IntExc */ + +#endif /* defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1) */ + +#ifdef __cplusplus +} +#endif +#endif /** __CORE_FEATURE_ECLIC__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_fpu.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_fpu.h new file mode 100644 index 00000000..c9e13b79 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_fpu.h @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CORE_FEATURE_FPU_H__ +#define __CORE_FEATURE_FPU_H__ +/*! + * @file core_feature_fpu.h + * @brief FPU feature API header file for Nuclei N/NX Core + */ +/* + * FPU Feature Configuration Macro: + * 1. __FPU_PRESENT: Define whether Floating Point Unit(FPU) is present or not + * * 0: Not present + * * 1: Single precision FPU present, __RISCV_FLEN == 32 + * * 2: Double precision FPU present, __RISCV_FLEN == 64 + */ +#ifdef __cplusplus + extern "C" { +#endif + +/* ===== FPU Operations ===== */ +/** + * \defgroup NMSIS_Core_FPU_Functions FPU Functions + * \ingroup NMSIS_Core + * \brief Functions that related to the RISC-V FPU (F and D extension). + * \details + * + * Nuclei provided floating point unit by RISC-V F and D extension. + * * `F extension` adds single-precision floating-point computational + * instructions compliant with the IEEE 754-2008 arithmetic standard, __RISCV_FLEN = 32. + * The F extension adds 32 floating-point registers, f0-f31, each 32 bits wide, + * and a floating-point control and status register fcsr, which contains the + * operating mode and exception status of the floating-point unit. + * * `D extension` adds double-precision floating-point computational instructions + * compliant with the IEEE 754-2008 arithmetic standard. + * The D extension widens the 32 floating-point registers, f0-f31, to 64 bits, __RISCV_FLEN = 64 + * @{ + */ +#if defined(__FPU_PRESENT) && (__FPU_PRESENT > 0) + +#if __FPU_PRESENT == 1 + /** \brief Refer to the width of the floating point register in bits(either 32 or 64) */ + #define __RISCV_FLEN 32 +#elif __FPU_PRESENT == 2 + #define __RISCV_FLEN 64 +#else + #define __RISCV_FLEN __riscv_flen +#endif /* __FPU_PRESENT == 1 */ + +/** \brief Get FCSR CSR Register */ +#define __get_FCSR() __RV_CSR_READ(CSR_FCSR) +/** \brief Set FCSR CSR Register with val */ +#define __set_FCSR(val) __RV_CSR_WRITE(CSR_FCSR, (val)) +/** \brief Get FRM CSR Register */ +#define __get_FRM() __RV_CSR_READ(CSR_FRM) +/** \brief Set FRM CSR Register with val */ +#define __set_FRM(val) __RV_CSR_WRITE(CSR_FRM, (val)) +/** \brief Get FFLAGS CSR Register */ +#define __get_FFLAGS() __RV_CSR_READ(CSR_FFLAGS) +/** \brief Set FFLAGS CSR Register with val */ +#define __set_FFLAGS(val) __RV_CSR_WRITE(CSR_FFLAGS, (val)) + +/** \brief Enable FPU Unit */ +#define __enable_FPU() __RV_CSR_SET(CSR_MSTATUS, MSTATUS_FS) +/** + * \brief Disable FPU Unit + * \details + * * We can save power by disable FPU Unit. + * * When FPU Unit is disabled, any access to FPU related CSR registers + * and FPU instructions will cause illegal Instuction Exception. + * */ +#define __disable_FPU() __RV_CSR_CLEAR(CSR_MSTATUS, MSTATUS_FS) + + +/** + * \brief Load a single-precision value from memory into float point register freg using flw instruction + * \details The FLW instruction loads a single-precision floating point value from memory + * address (addr + ofs) into floating point register freg(f0-f31) + * \param [in] freg The floating point register, eg. FREG(0), f0 + * \param [in] addr The memory base address, 4 byte aligned required + * \param [in] ofs a 12-bit immediate signed byte offset value, should be an const value + * \remarks + * * FLW and FSW operations need to make sure the address is 4 bytes aligned, + * otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned) + * * FLW and FSW do not modify the bits being transferred; in particular, the payloads of non-canonical + * NaNs are preserved + * + */ +#define __RV_FLW(freg, addr, ofs) \ + ({ \ + register rv_csr_t __addr = (rv_csr_t)(addr); \ + __ASM volatile("flw " STRINGIFY(freg) ", %0(%1) " \ + : : "I"(ofs), "r"(__addr) \ + : "memory"); \ + }) + +/** + * \brief Store a single-precision value from float point freg into memory using fsw instruction + * \details The FSW instruction stores a single-precision value from floating point register to memory + * \param [in] freg The floating point register(f0-f31), eg. FREG(0), f0 + * \param [in] addr The memory base address, 4 byte aligned required + * \param [in] ofs a 12-bit immediate signed byte offset value, should be an const value + * \remarks + * * FLW and FSW operations need to make sure the address is 4 bytes aligned, + * otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned) + * * FLW and FSW do not modify the bits being transferred; in particular, the payloads of non-canonical + * NaNs are preserved + * + */ +#define __RV_FSW(freg, addr, ofs) \ + ({ \ + register rv_csr_t __addr = (rv_csr_t)(addr); \ + __ASM volatile("fsw " STRINGIFY(freg) ", %0(%1) " \ + : : "I"(ofs), "r"(__addr) \ + : "memory"); \ + }) + +/** + * \brief Load a double-precision value from memory into float point register freg using fld instruction + * \details The FLD instruction loads a double-precision floating point value from memory + * address (addr + ofs) into floating point register freg(f0-f31) + * \param [in] freg The floating point register, eg. FREG(0), f0 + * \param [in] addr The memory base address, 8 byte aligned required + * \param [in] ofs a 12-bit immediate signed byte offset value, should be an const value + * \attention + * * Function only available for double precision floating point unit, FLEN = 64 + * \remarks + * * FLD and FSD operations need to make sure the address is 8 bytes aligned, + * otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned) + * * FLD and FSD do not modify the bits being transferred; in particular, the payloads of non-canonical + * NaNs are preserved. + */ +#define __RV_FLD(freg, addr, ofs) \ + ({ \ + register rv_csr_t __addr = (rv_csr_t)(addr); \ + __ASM volatile("fld " STRINGIFY(freg) ", %0(%1) " \ + : : "I"(ofs), "r"(__addr) \ + : "memory"); \ + }) + +/** + * \brief Store a double-precision value from float point freg into memory using fsd instruction + * \details The FSD instruction stores double-precision value from floating point register to memory + * \param [in] freg The floating point register(f0-f31), eg. FREG(0), f0 + * \param [in] addr The memory base address, 8 byte aligned required + * \param [in] ofs a 12-bit immediate signed byte offset value, should be an const value + * \attention + * * Function only available for double precision floating point unit, FLEN = 64 + * \remarks + * * FLD and FSD operations need to make sure the address is 8 bytes aligned, + * otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned) + * * FLD and FSD do not modify the bits being transferred; in particular, the payloads of non-canonical + * NaNs are preserved. + * + */ +#define __RV_FSD(freg, addr, ofs) \ + ({ \ + register rv_csr_t __addr = (rv_csr_t)(addr); \ + __ASM volatile("fsd " STRINGIFY(freg) ", %0(%1) " \ + : : "I"(ofs), "r"(__addr) \ + : "memory"); \ + }) + +/** + * \def __RV_FLOAD + * \brief Load a float point value from memory into float point register freg using flw/fld instruction + * \details + * * For Single-Precison Floating-Point Mode(__FPU_PRESENT == 1, __RISCV_FLEN == 32): + * It will call \ref __RV_FLW to load a single-precision floating point value from memory to floating point register + * * For Double-Precison Floating-Point Mode(__FPU_PRESENT == 2, __RISCV_FLEN == 64): + * It will call \ref __RV_FLD to load a double-precision floating point value from memory to floating point register + * + * \attention + * Function behaviour is different for __FPU_PRESENT = 1 or 2, please see the real function this macro represent + */ +/** + * \def __RV_FSTORE + * \brief Store a float value from float point freg into memory using fsw/fsd instruction + * \details + * * For Single-Precison Floating-Point Mode(__FPU_PRESENT == 1, __RISCV_FLEN == 32): + * It will call \ref __RV_FSW to store floating point register into memory + * * For Double-Precison Floating-Point Mode(__FPU_PRESENT == 2, __RISCV_FLEN == 64): + * It will call \ref __RV_FSD to store floating point register into memory + * + * \attention + * Function behaviour is different for __FPU_PRESENT = 1 or 2, please see the real function this macro represent + */ +#if __FPU_PRESENT == 1 +#define __RV_FLOAD __RV_FLW +#define __RV_FSTORE __RV_FSW +/** \brief Type of FPU register, depends on the FLEN defined in RISC-V */ +typedef uint32_t rv_fpu_t; +#elif __FPU_PRESENT == 2 +#define __RV_FLOAD __RV_FLD +#define __RV_FSTORE __RV_FSD +/** \brief Type of FPU register, depends on the FLEN defined in RISC-V */ +typedef uint64_t rv_fpu_t; +#endif /* __FPU_PRESENT == 2 */ + +/** + * \brief Save FPU context into variables for interrupt nesting + * \details + * This macro is used to declare variables which are used for saving + * FPU context, and it will store the nessary fpu registers into + * these variables, it need to be used in a interrupt when in this + * interrupt fpu registers are used. + * \remarks + * - It need to be used together with \ref RESTORE_FPU_CONTEXT + * - Don't use variable names __fpu_context in your ISR code + * - If you isr code will use fpu registers, and this interrupt is nested. + * Then you can do it like this: + * \code + * void eclic_mtip_handler(void) + * { + * // !!!Interrupt is enabled here!!! + * // !!!Higher priority interrupt could nest it!!! + * + * // Necessary only when you need to use fpu registers + * // in this isr handler functions + * SAVE_FPU_CONTEXT(); + * + * // put you own interrupt handling code here + * + * // pair of SAVE_FPU_CONTEXT() + * RESTORE_FPU_CONTEXT(); + * } + * \endcode + */ +#define SAVE_FPU_CONTEXT() \ + rv_fpu_t __fpu_context[20]; \ + __RV_FSTORE(FREG(0), __fpu_context, 0 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(1), __fpu_context, 1 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(2), __fpu_context, 2 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(3), __fpu_context, 3 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(4), __fpu_context, 4 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(5), __fpu_context, 5 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(6), __fpu_context, 6 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(7), __fpu_context, 7 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(10), __fpu_context, 8 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(11), __fpu_context, 9 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(12), __fpu_context, 10 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(13), __fpu_context, 11 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(14), __fpu_context, 12 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(15), __fpu_context, 13 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(16), __fpu_context, 14 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(17), __fpu_context, 15 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(28), __fpu_context, 16 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(29), __fpu_context, 17 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(30), __fpu_context, 18 << LOG_FPREGBYTES); \ + __RV_FSTORE(FREG(31), __fpu_context, 19 << LOG_FPREGBYTES); + +/** + * \brief Restore necessary fpu registers from variables for interrupt nesting + * \details + * This macro is used restore necessary fpu registers from pre-defined variables + * in \ref SAVE_FPU_CONTEXT macro. + * \remarks + * - It need to be used together with \ref SAVE_FPU_CONTEXT + */ +#define RESTORE_FPU_CONTEXT() \ + __RV_FLOAD(FREG(0), __fpu_context, 0 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(1), __fpu_context, 1 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(2), __fpu_context, 2 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(3), __fpu_context, 3 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(4), __fpu_context, 4 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(5), __fpu_context, 5 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(6), __fpu_context, 6 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(7), __fpu_context, 7 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(10), __fpu_context, 8 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(11), __fpu_context, 9 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(12), __fpu_context, 10 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(13), __fpu_context, 11 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(14), __fpu_context, 12 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(15), __fpu_context, 13 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(16), __fpu_context, 14 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(17), __fpu_context, 15 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(28), __fpu_context, 16 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(29), __fpu_context, 17 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(30), __fpu_context, 18 << LOG_FPREGBYTES); \ + __RV_FLOAD(FREG(31), __fpu_context, 19 << LOG_FPREGBYTES); +#else +#define SAVE_FPU_CONTEXT() +#define RESTORE_FPU_CONTEXT() +#endif /* __FPU_PRESENT > 0 */ +/** @} */ /* End of Doxygen Group NMSIS_Core_FPU_Functions */ + +#ifdef __cplusplus +} +#endif +#endif /** __RISCV_EXT_FPU_H__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_pmp.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_pmp.h new file mode 100644 index 00000000..997dfaee --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_pmp.h @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CORE_FEATURE_PMP_H__ +#define __CORE_FEATURE_PMP_H__ +/*! + * @file core_feature_pmp.h + * @brief PMP feature API header file for Nuclei N/NX Core + */ +/* + * PMP Feature Configuration Macro: + * 1. __PMP_PRESENT: Define whether Physical Memory Protection(PMP) is present or not + * * 0: Not present + * * 1: Present + * 2. __PMP_ENTRY_NUM: Define the number of PMP entries, only 8 or 16 is configurable. + */ +#ifdef __cplusplus + extern "C" { +#endif + +#if defined(__PMP_PRESENT) && (__PMP_PRESENT == 1) +/* ===== PMP Operations ===== */ +/** + * \defgroup NMSIS_Core_PMP_Functions PMP Functions + * \ingroup NMSIS_Core + * \brief Functions that related to the RISCV Phyiscal Memory Protection. + * \details + * Optional physical memory protection (PMP) unit provides per-hart machine-mode + * control registers to allow physical memory access privileges (read, write, execute) + * to be specified for each physical memory region. + * + * The PMP can supports region access control settings as small as four bytes. + * + * @{ + */ +#ifndef __PMP_ENTRY_NUM +/* numbers of PMP entries(__PMP_ENTRY_NUM) should be defined in */ +#error "__PMP_ENTRY_NUM is not defined, please check!" +#endif + +/** + * \brief Get 8bit PMPxCFG Register by PMP entry index + * \details Return the content of the PMPxCFG Register. + * \param [in] idx PMP region index(0-15) + * \return PMPxCFG Register value + */ +__STATIC_INLINE uint8_t __get_PMPxCFG(uint32_t idx) +{ + rv_csr_t pmpcfg = 0; + + if (idx >= __PMP_ENTRY_NUM) return 0; +#if __RISCV_XLEN == 32 + if (idx < 4) { + pmpcfg = __RV_CSR_READ(CSR_PMPCFG0); + } else if ((idx >=4) && (idx < 8)) { + idx -= 4; + pmpcfg = __RV_CSR_READ(CSR_PMPCFG1); + } else if ((idx >=8) && (idx < 12)) { + idx -= 8; + pmpcfg = __RV_CSR_READ(CSR_PMPCFG2); + } else { + idx -= 12; + pmpcfg = __RV_CSR_READ(CSR_PMPCFG3); + } + + idx = idx << 3; + return (uint8_t)((pmpcfg>>idx) & 0xFF); +#elif __RISCV_XLEN == 64 + if (idx < 8) { + pmpcfg = __RV_CSR_READ(CSR_PMPCFG0); + } else { + idx -= 8; + pmpcfg = __RV_CSR_READ(CSR_PMPCFG2); + } + idx = idx << 3; + return (uint8_t)((pmpcfg>>idx) & 0xFF); +#else + // TODO Add RV128 Handling + return 0; +#endif +} + +/** + * \brief Set 8bit PMPxCFG by pmp entry index + * \details Set the given pmpxcfg value to the PMPxCFG Register. + * \param [in] idx PMPx region index(0-15) + * \param [in] pmpxcfg PMPxCFG register value to set + */ +__STATIC_INLINE void __set_PMPxCFG(uint32_t idx, uint8_t pmpxcfg) +{ + rv_csr_t pmpcfgx = 0; + if (idx >= __PMP_ENTRY_NUM) return; + +#if __RISCV_XLEN == 32 + if (idx < 4) { + pmpcfgx = __RV_CSR_READ(CSR_PMPCFG0); + idx = idx << 3; + pmpcfgx = (pmpcfgx & ~(0xFFUL << idx)) | ((rv_csr_t)pmpxcfg << idx); + __RV_CSR_WRITE(CSR_PMPCFG0, pmpcfgx); + } else if ((idx >=4) && (idx < 8)) { + idx -= 4; + pmpcfgx = __RV_CSR_READ(CSR_PMPCFG1); + idx = idx << 3; + pmpcfgx = (pmpcfgx & ~(0xFFUL << idx)) | ((rv_csr_t)pmpxcfg << idx); + __RV_CSR_WRITE(CSR_PMPCFG1, pmpcfgx); + } else if ((idx >=8) && (idx < 12)) { + idx -= 8; + pmpcfgx = __RV_CSR_READ(CSR_PMPCFG2); + idx = idx << 3; + pmpcfgx = (pmpcfgx & ~(0xFFUL << idx)) | ((rv_csr_t)pmpxcfg << idx); + __RV_CSR_WRITE(CSR_PMPCFG2, pmpcfgx); + } else { + idx -= 12; + pmpcfgx = __RV_CSR_READ(CSR_PMPCFG3); + idx = idx << 3; + pmpcfgx = (pmpcfgx & ~(0xFFUL << idx)) | ((rv_csr_t)pmpxcfg << idx); + __RV_CSR_WRITE(CSR_PMPCFG3, pmpcfgx); + } +#elif __RISCV_XLEN == 64 + if (idx < 8) { + pmpcfgx = __RV_CSR_READ(CSR_PMPCFG0); + idx = idx << 3; + pmpcfgx = (pmpcfgx & ~(0xFFULL << idx)) | ((rv_csr_t)pmpxcfg << idx); + __RV_CSR_WRITE(CSR_PMPCFG0, pmpcfgx); + } else { + idx -= 8; + pmpcfgx = __RV_CSR_READ(CSR_PMPCFG2); + idx = idx << 3; + pmpcfgx = (pmpcfgx & ~(0xFFULL << idx)) | ((rv_csr_t)pmpxcfg << idx); + __RV_CSR_WRITE(CSR_PMPCFG2, pmpcfgx); + } +#else + // TODO Add RV128 Handling +#endif +} + +/** + * \brief Get PMPCFGx Register by index + * \details Return the content of the PMPCFGx Register. + * \param [in] idx PMPCFG CSR index(0-3) + * \return PMPCFGx Register value + * \remark + * - For RV64, only idx = 0 and idx = 2 is allowed. + * pmpcfg0 and pmpcfg2 hold the configurations + * for the 16 PMP entries, pmpcfg1 and pmpcfg3 are illegal + * - For RV32, pmpcfg0–pmpcfg3, hold the configurations + * pmp0cfg–pmp15cfg for the 16 PMP entries + */ +__STATIC_INLINE rv_csr_t __get_PMPCFGx(uint32_t idx) +{ + switch (idx) { + case 0: return __RV_CSR_READ(CSR_PMPCFG0); + case 1: return __RV_CSR_READ(CSR_PMPCFG1); + case 2: return __RV_CSR_READ(CSR_PMPCFG2); + case 3: return __RV_CSR_READ(CSR_PMPCFG3); + default: return 0; + } +} + +/** + * \brief Set PMPCFGx by index + * \details Write the given value to the PMPCFGx Register. + * \param [in] idx PMPCFG CSR index(0-3) + * \param [in] pmpcfg PMPCFGx Register value to set + * \remark + * - For RV64, only idx = 0 and idx = 2 is allowed. + * pmpcfg0 and pmpcfg2 hold the configurations + * for the 16 PMP entries, pmpcfg1 and pmpcfg3 are illegal + * - For RV32, pmpcfg0–pmpcfg3, hold the configurations + * pmp0cfg–pmp15cfg for the 16 PMP entries + */ +__STATIC_INLINE void __set_PMPCFGx(uint32_t idx, rv_csr_t pmpcfg) +{ + switch (idx) { + case 0: __RV_CSR_WRITE(CSR_PMPCFG0, pmpcfg); break; + case 1: __RV_CSR_WRITE(CSR_PMPCFG1, pmpcfg); break; + case 2: __RV_CSR_WRITE(CSR_PMPCFG2, pmpcfg); break; + case 3: __RV_CSR_WRITE(CSR_PMPCFG3, pmpcfg); break; + default: return; + } +} + +/** + * \brief Get PMPADDRx Register by index + * \details Return the content of the PMPADDRx Register. + * \param [in] idx PMP region index(0-15) + * \return PMPADDRx Register value + */ +__STATIC_INLINE rv_csr_t __get_PMPADDRx(uint32_t idx) +{ + switch (idx) { + case 0: return __RV_CSR_READ(CSR_PMPADDR0); + case 1: return __RV_CSR_READ(CSR_PMPADDR1); + case 2: return __RV_CSR_READ(CSR_PMPADDR2); + case 3: return __RV_CSR_READ(CSR_PMPADDR3); + case 4: return __RV_CSR_READ(CSR_PMPADDR4); + case 5: return __RV_CSR_READ(CSR_PMPADDR5); + case 6: return __RV_CSR_READ(CSR_PMPADDR6); + case 7: return __RV_CSR_READ(CSR_PMPADDR7); + case 8: return __RV_CSR_READ(CSR_PMPADDR8); + case 9: return __RV_CSR_READ(CSR_PMPADDR9); + case 10: return __RV_CSR_READ(CSR_PMPADDR10); + case 11: return __RV_CSR_READ(CSR_PMPADDR11); + case 12: return __RV_CSR_READ(CSR_PMPADDR12); + case 13: return __RV_CSR_READ(CSR_PMPADDR13); + case 14: return __RV_CSR_READ(CSR_PMPADDR14); + case 15: return __RV_CSR_READ(CSR_PMPADDR15); + default: return 0; + } +} + +/** + * \brief Set PMPADDRx by index + * \details Write the given value to the PMPADDRx Register. + * \param [in] idx PMP region index(0-15) + * \param [in] pmpaddr PMPADDRx Register value to set + */ +__STATIC_INLINE void __set_PMPADDRx(uint32_t idx, rv_csr_t pmpaddr) +{ + switch (idx) { + case 0: __RV_CSR_WRITE(CSR_PMPADDR0, pmpaddr); break; + case 1: __RV_CSR_WRITE(CSR_PMPADDR1, pmpaddr); break; + case 2: __RV_CSR_WRITE(CSR_PMPADDR2, pmpaddr); break; + case 3: __RV_CSR_WRITE(CSR_PMPADDR3, pmpaddr); break; + case 4: __RV_CSR_WRITE(CSR_PMPADDR4, pmpaddr); break; + case 5: __RV_CSR_WRITE(CSR_PMPADDR5, pmpaddr); break; + case 6: __RV_CSR_WRITE(CSR_PMPADDR6, pmpaddr); break; + case 7: __RV_CSR_WRITE(CSR_PMPADDR7, pmpaddr); break; + case 8: __RV_CSR_WRITE(CSR_PMPADDR8, pmpaddr); break; + case 9: __RV_CSR_WRITE(CSR_PMPADDR9, pmpaddr); break; + case 10: __RV_CSR_WRITE(CSR_PMPADDR10, pmpaddr); break; + case 11: __RV_CSR_WRITE(CSR_PMPADDR11, pmpaddr); break; + case 12: __RV_CSR_WRITE(CSR_PMPADDR12, pmpaddr); break; + case 13: __RV_CSR_WRITE(CSR_PMPADDR13, pmpaddr); break; + case 14: __RV_CSR_WRITE(CSR_PMPADDR14, pmpaddr); break; + case 15: __RV_CSR_WRITE(CSR_PMPADDR15, pmpaddr); break; + default: return; + } +} +/** @} */ /* End of Doxygen Group NMSIS_Core_PMP_Functions */ +#endif /* defined(__PMP_PRESENT) && (__PMP_PRESENT == 1) */ + +#ifdef __cplusplus +} +#endif +#endif /** __CORE_FEATURE_PMP_H__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_timer.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_timer.h new file mode 100644 index 00000000..6e9b7af3 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_timer.h @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CORE_FEATURE_TIMER_H__ +#define __CORE_FEATURE_TIMER_H__ +/*! + * @file core_feature_timer.h + * @brief System Timer feature API header file for Nuclei N/NX Core + */ +/* + * System Timer Feature Configuration Macro: + * 1. __SYSTIMER_PRESENT: Define whether Private System Timer is present or not. + * * 0: Not present + * * 1: Present + * 2. __SYSTIMER_BASEADDR: Define the base address of the System Timer. + */ +#ifdef __cplusplus + extern "C" { +#endif + +#if defined(__SYSTIMER_PRESENT) && (__SYSTIMER_PRESENT == 1) +/** + * \defgroup NMSIS_Core_SysTimer_Registers Register Define and Type Definitions Of System Timer + * \ingroup NMSIS_Core_Registers + * \brief Type definitions and defines for system timer registers. + * + * @{ + */ +/** + * \brief Structure type to access the System Timer (SysTimer). + * \details + * Structure definition to access the system timer(SysTimer). + * \remarks + * - MSFTRST register is introduced in Nuclei N Core version 1.3(\ref __NUCLEI_N_REV >= 0x0103) + * - MSTOP register is renamed to MTIMECTL register in Nuclei N Core version 1.4(\ref __NUCLEI_N_REV >= 0x0104) + * - CMPCLREN and CLKSRC bit in MTIMECTL register is introduced in Nuclei N Core version 1.4(\ref __NUCLEI_N_REV >= 0x0104) + */ +typedef struct { + __IOM uint64_t MTIMER; /*!< Offset: 0x000 (R/W) System Timer current value 64bits Register */ + __IOM uint64_t MTIMERCMP; /*!< Offset: 0x008 (R/W) System Timer compare Value 64bits Register */ + __IOM uint32_t RESERVED0[0x3F8]; /*!< Offset: 0x010 - 0xFEC Reserved */ + __IOM uint32_t MSFTRST; /*!< Offset: 0xFF0 (R/W) System Timer Software Core Reset Register */ + __IOM uint32_t RESERVED1; /*!< Offset: 0xFF4 Reserved */ + __IOM uint32_t MTIMECTL; /*!< Offset: 0xFF8 (R/W) System Timer Control Register, previously MSTOP register */ + __IOM uint32_t MSIP; /*!< Offset: 0xFFC (R/W) System Timer SW interrupt Register */ +} SysTimer_Type; + +/* Timer Control / Status Register Definitions */ +#define SysTimer_MTIMECTL_TIMESTOP_Pos 0U /*!< SysTick Timer MTIMECTL: TIMESTOP bit Position */ +#define SysTimer_MTIMECTL_TIMESTOP_Msk (1UL << SysTimer_MTIMECTL_TIMESTOP_Pos) /*!< SysTick Timer MTIMECTL: TIMESTOP Mask */ +#define SysTimer_MTIMECTL_CMPCLREN_Pos 1U /*!< SysTick Timer MTIMECTL: CMPCLREN bit Position */ +#define SysTimer_MTIMECTL_CMPCLREN_Msk (1UL << SysTimer_MTIMECTL_CMPCLREN_Pos) /*!< SysTick Timer MTIMECTL: CMPCLREN Mask */ +#define SysTimer_MTIMECTL_CLKSRC_Pos 2U /*!< SysTick Timer MTIMECTL: CLKSRC bit Position */ +#define SysTimer_MTIMECTL_CLKSRC_Msk (1UL << SysTimer_MTIMECTL_CLKSRC_Pos) /*!< SysTick Timer MTIMECTL: CLKSRC Mask */ + +#define SysTimer_MSIP_MSIP_Pos 0U /*!< SysTick Timer MSIP: MSIP bit Position */ +#define SysTimer_MSIP_MSIP_Msk (1UL << SysTimer_MSIP_MSIP_Pos) /*!< SysTick Timer MSIP: MSIP Mask */ + +#define SysTimer_MTIMER_Msk (0xFFFFFFFFFFFFFFFFULL) /*!< SysTick Timer MTIMER value Mask */ +#define SysTimer_MTIMERCMP_Msk (0xFFFFFFFFFFFFFFFFULL) /*!< SysTick Timer MTIMERCMP value Mask */ +#define SysTimer_MTIMECTL_Msk (0xFFFFFFFFUL) /*!< SysTick Timer MTIMECTL/MSTOP value Mask */ +#define SysTimer_MSIP_Msk (0xFFFFFFFFUL) /*!< SysTick Timer MSIP value Mask */ +#define SysTimer_MSFTRST_Msk (0xFFFFFFFFUL) /*!< SysTick Timer MSFTRST value Mask */ + +#define SysTimer_MSFRST_KEY (0x80000A5FUL) /*!< SysTick Timer Software Reset Request Key */ + +#ifndef __SYSTIMER_BASEADDR +/* Base address of SYSTIMER(__SYSTIMER_BASEADDR) should be defined in */ +#error "__SYSTIMER_BASEADDR is not defined, please check!" +#endif +/* System Timer Memory mapping of Device */ +#define SysTimer_BASE __SYSTIMER_BASEADDR /*!< SysTick Base Address */ +#define SysTimer ((SysTimer_Type *) SysTimer_BASE) /*!< SysTick configuration struct */ +/** @} */ /* end of group NMSIS_Core_SysTimer_Registers */ + +/* ################################## SysTimer function ############################################ */ +/** + * \defgroup NMSIS_Core_SysTimer SysTimer Functions + * \brief Functions that configure the Core System Timer. + * @{ + */ +/** + * \brief Set system timer load value + * \details + * This function set the system timer load value in MTIMER register. + * \param [in] value value to set system timer MTIMER register. + * \remarks + * - Load value is 64bits wide. + * - \ref SysTimer_GetLoadValue + */ +__STATIC_FORCEINLINE void SysTimer_SetLoadValue(uint64_t value) +{ + SysTimer->MTIMER = value; +} + +/** + * \brief Get system timer load value + * \details + * This function get the system timer current value in MTIMER register. + * \return current value(64bit) of system timer MTIMER register. + * \remarks + * - Load value is 64bits wide. + * - \ref SysTimer_SetLoadValue + */ +__STATIC_FORCEINLINE uint64_t SysTimer_GetLoadValue(void) +{ + return SysTimer->MTIMER; +} + +/** + * \brief Set system timer compare value + * \details + * This function set the system Timer compare value in MTIMERCMP register. + * \param [in] value compare value to set system timer MTIMERCMP register. + * \remarks + * - Compare value is 64bits wide. + * - If compare value is larger than current value timer interrupt generate. + * - Modify the load value or compare value less to clear the interrupt. + * - \ref SysTimer_GetCompareValue + */ +__STATIC_FORCEINLINE void SysTimer_SetCompareValue(uint64_t value) +{ + SysTimer->MTIMERCMP = value; +} + +/** + * \brief Get system timer compare value + * \details + * This function get the system timer compare value in MTIMERCMP register. + * \return compare value of system timer MTIMERCMP register. + * \remarks + * - Compare value is 64bits wide. + * - \ref SysTimer_SetCompareValue + */ +__STATIC_FORCEINLINE uint64_t SysTimer_GetCompareValue(void) +{ + return SysTimer->MTIMERCMP; +} + +/** + * \brief Enable system timer counter running + * \details + * Enable system timer counter running by clear + * TIMESTOP bit in MTIMECTL register. + */ +__STATIC_FORCEINLINE void SysTimer_Start(void) +{ + SysTimer->MTIMECTL &= ~(SysTimer_MTIMECTL_TIMESTOP_Msk); +} + +/** + * \brief Stop system timer counter running + * \details + * Stop system timer counter running by set + * TIMESTOP bit in MTIMECTL register. + */ +__STATIC_FORCEINLINE void SysTimer_Stop(void) +{ + SysTimer->MTIMECTL |= SysTimer_MTIMECTL_TIMESTOP_Msk; +} + +/** + * \brief Set system timer control value + * \details + * This function set the system timer MTIMECTL register value. + * \param [in] mctl value to set MTIMECTL register + * \remarks + * - Bit TIMESTOP is used to start and stop timer. + * Clear TIMESTOP bit to 0 to start timer, otherwise to stop timer. + * - Bit CMPCLREN is used to enable auto MTIMER clear to zero when MTIMER >= MTIMERCMP. + * Clear CMPCLREN bit to 0 to stop auto clear MTIMER feature, otherwise to enable it. + * - Bit CLKSRC is used to select timer clock source. + * Clear CLKSRC bit to 0 to use *mtime_toggle_a*, otherwise use *core_clk_aon* + * - \ref SysTimer_GetControlValue + */ +__STATIC_FORCEINLINE void SysTimer_SetControlValue(uint32_t mctl) +{ + SysTimer->MTIMECTL = (mctl & SysTimer_MTIMECTL_Msk); +} + +/** + * \brief Get system timer control value + * \details + * This function get the system timer MTIMECTL register value. + * \return MTIMECTL register value + * \remarks + * - \ref SysTimer_SetControlValue + */ +__STATIC_FORCEINLINE uint32_t SysTimer_GetControlValue(void) +{ + return (SysTimer->MTIMECTL & SysTimer_MTIMECTL_Msk); +} + +/** + * \brief Trigger or set software interrupt via system timer + * \details + * This function set the system timer MSIP bit in MSIP register. + * \remarks + * - Set system timer MSIP bit and generate a SW interrupt. + * - \ref SysTimer_ClearSWIRQ + * - \ref SysTimer_GetMsipValue + */ +__STATIC_FORCEINLINE void SysTimer_SetSWIRQ(void) +{ + SysTimer->MSIP |= SysTimer_MSIP_MSIP_Msk; +} + +/** + * \brief Clear system timer software interrupt pending request + * \details + * This function clear the system timer MSIP bit in MSIP register. + * \remarks + * - Clear system timer MSIP bit in MSIP register to clear the software interrupt pending. + * - \ref SysTimer_SetSWIRQ + * - \ref SysTimer_GetMsipValue + */ +__STATIC_FORCEINLINE void SysTimer_ClearSWIRQ(void) +{ + SysTimer->MSIP &= ~SysTimer_MSIP_MSIP_Msk; +} + +/** + * \brief Get system timer MSIP register value + * \details + * This function get the system timer MSIP register value. + * \return Value of Timer MSIP register. + * \remarks + * - Bit0 is SW interrupt flag. + * Bit0 is 1 then SW interrupt set. Bit0 is 0 then SW interrupt clear. + * - \ref SysTimer_SetSWIRQ + * - \ref SysTimer_ClearSWIRQ + */ +__STATIC_FORCEINLINE uint32_t SysTimer_GetMsipValue(void) +{ + return (uint32_t)(SysTimer->MSIP & SysTimer_MSIP_Msk); +} + +/** + * \brief Set system timer MSIP register value + * \details + * This function set the system timer MSIP register value. + * \param [in] msip value to set MSIP register + */ +__STATIC_FORCEINLINE void SysTimer_SetMsipValue(uint32_t msip) +{ + SysTimer->MSIP = (msip & SysTimer_MSIP_Msk); +} + +/** + * \brief Do software reset request + * \details + * This function will do software reset request through MTIMER + * - Software need to write \ref SysTimer_MSFRST_KEY to generate software reset request + * - The software request flag can be cleared by reset operation to clear + * \remarks + * - The software reset is sent to SoC, SoC need to generate reset signal and send back to Core + * - This function will not return, it will do while(1) to wait the Core reset happened + */ +__STATIC_FORCEINLINE void SysTimer_SoftwareReset(void) +{ + SysTimer->MSFTRST = SysTimer_MSFRST_KEY; + while(1); +} + +#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U) && defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1) +/** + * \brief System Tick Configuration + * \details Initializes the System Timer and its non-vector interrupt, and starts the System Tick Timer. + * + * In our default implementation, the timer counter will be set to zero, and it will start a timer compare non-vector interrupt + * when it matchs the ticks user set, during the timer interrupt user should reload the system tick using \ref SysTick_Reload function + * or similar function written by user, so it can produce period timer interrupt. + * \param [in] ticks Number of ticks between two interrupts. + * \return 0 Function succeeded. + * \return 1 Function failed. + * \remarks + * - For \ref __NUCLEI_N_REV >= 0x0104, the CMPCLREN bit in MTIMECTL is introduced, + * but we assume that the CMPCLREN bit is set to 0, so MTIMER register will not be + * auto cleared to 0 when MTIMER >= MTIMERCMP. + * - When the variable \ref __Vendor_SysTickConfig is set to 1, then the + * function \ref SysTick_Config is not included. + * - In this case, the file .h must contain a vendor-specific implementation + * of this function. + * - If user need this function to start a period timer interrupt, then in timer interrupt handler + * routine code, user should call \ref SysTick_Reload with ticks to reload the timer. + * - This function only available when __SYSTIMER_PRESENT == 1 and __ECLIC_PRESENT == 1 and __Vendor_SysTickConfig == 0 + * \sa + * - \ref SysTimer_SetCompareValue; SysTimer_SetLoadValue + */ +__STATIC_INLINE uint32_t SysTick_Config(uint64_t ticks) +{ + SysTimer_SetLoadValue(0); + SysTimer_SetCompareValue(ticks); + ECLIC_SetShvIRQ(SysTimer_IRQn, ECLIC_NON_VECTOR_INTERRUPT); + ECLIC_SetLevelIRQ(SysTimer_IRQn, 0); + ECLIC_EnableIRQ(SysTimer_IRQn); + return (0UL); +} + +/** + * \brief System Tick Reload + * \details Reload the System Timer Tick when the MTIMECMP reached TIME value + * + * \param [in] ticks Number of ticks between two interrupts. + * \return 0 Function succeeded. + * \return 1 Function failed. + * \remarks + * - For \ref __NUCLEI_N_REV >= 0x0104, the CMPCLREN bit in MTIMECTL is introduced, + * but for this \ref SysTick_Config function, we assume this CMPCLREN bit is set to 0, + * so in interrupt handler function, user still need to set the MTIMERCMP or MTIMER to reload + * the system tick, if vendor want to use this timer's auto clear feature, they can define + * \ref __Vendor_SysTickConfig to 1, and implement \ref SysTick_Config and \ref SysTick_Reload functions. + * - When the variable \ref __Vendor_SysTickConfig is set to 1, then the + * function \ref SysTick_Reload is not included. + * - In this case, the file .h must contain a vendor-specific implementation + * of this function. + * - This function only available when __SYSTIMER_PRESENT == 1 and __ECLIC_PRESENT == 1 and __Vendor_SysTickConfig == 0 + * - Since the MTIMERCMP value might overflow, if overflowed, MTIMER will be set to 0, and MTIMERCMP set to ticks + * \sa + * - \ref SysTimer_SetCompareValue + * - \ref SysTimer_SetLoadValue + */ +__STATIC_FORCEINLINE uint32_t SysTick_Reload(uint64_t ticks) +{ + uint64_t cur_ticks = SysTimer->MTIMER; + uint64_t reload_ticks = ticks + cur_ticks; + + if (__USUALLY(reload_ticks > cur_ticks)) { + SysTimer->MTIMERCMP = reload_ticks; + } else { + /* When added the ticks value, then the MTIMERCMP < TIMER, + * which means the MTIMERCMP is overflowed, + * so we need to reset the counter to zero */ + SysTimer->MTIMER = 0; + SysTimer->MTIMERCMP = ticks; + } + + return (0UL); +} + +#endif /* defined(__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U) */ +/** @} */ /* End of Doxygen Group NMSIS_Core_SysTimer */ + +#endif /* defined(__SYSTIMER_PRESENT) && (__SYSTIMER_PRESENT == 1) */ + +#ifdef __cplusplus +} +#endif +#endif /** __CORE_FEATURE_TIMER_H__ */ + diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_compiler.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_compiler.h new file mode 100644 index 00000000..c5278db1 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_compiler.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NMSIS_COMPILER_H +#define __NMSIS_COMPILER_H + +#include + +/*! + * @file nmsis_compiler.h + * @brief NMSIS compiler generic header file + */ +#if defined ( __GNUC__ ) + /** GNU GCC Compiler */ + #include "nmsis_gcc.h" +#else + #error Unknown compiler. +#endif + + +#endif /* __NMSIS_COMPILER_H */ + diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_core.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_core.h new file mode 100644 index 00000000..fa7821da --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_core.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2009-2019 Arm Limited. All rights reserved. + * -- Adaptable modifications made for Nuclei Processors. -- + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __NMSIS_CORE_H__ +#define __NMSIS_CORE_H__ + +#include + +#ifdef __cplusplus + extern "C" { +#endif + +#include "nmsis_version.h" + +/** + * \ingroup NMSIS_Core_VersionControl + * @{ + */ +/* The following enum __NUCLEI_N_REV/__NUCLEI_NX_REV definition in this file + * is only used for doxygen documentation generation, + * The .h is the real file to define it by vendor + */ +#if defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__) +/** + * \brief Nuclei N class core revision number + * \details + * Reversion number format: [15:8] revision number, [7:0] patch number + * \attention + * This define is exclusive with \ref __NUCLEI_NX_REV + */ +#define __NUCLEI_N_REV (0x0104) +/** + * \brief Nuclei NX class core revision number + * \details + * Reversion number format: [15:8] revision number, [7:0] patch number + * \attention + * This define is exclusive with \ref __NUCLEI_N_REV + */ +#define __NUCLEI_NX_REV (0x0100) +#endif /* __ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__ */ +/** @} */ /* End of Group NMSIS_Core_VersionControl */ + +#include "nmsis_compiler.h" /* NMSIS compiler specific defines */ + +/* === Include Nuclei Core Related Headers === */ +/* Include core base feature header file */ +#include "core_feature_base.h" + +#ifndef __NMSIS_GENERIC +/* Include core eclic feature header file */ +#include "core_feature_eclic.h" +/* Include core systimer feature header file */ +#include "core_feature_timer.h" +#endif + +/* Include core fpu feature header file */ +#include "core_feature_fpu.h" +/* Include core dsp feature header file */ +#include "core_feature_dsp.h" +/* Include core pmp feature header file */ +#include "core_feature_pmp.h" +/* Include core cache feature header file */ +#include "core_feature_cache.h" + +/* Include compatiable functions header file */ +#include "core_compatiable.h" + +#ifdef __cplusplus +} +#endif +#endif /* __NMSIS_CORE_H__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_gcc.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_gcc.h new file mode 100644 index 00000000..9f7eb9d2 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_gcc.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NMSIS_GCC_H__ +#define __NMSIS_GCC_H__ +/*! + * @file nmsis_gcc.h + * @brief NMSIS compiler GCC header file + */ +#include +#include "riscv_encoding.h" + +#ifdef __cplusplus + extern "C" { +#endif + +/* ######################### Startup and Lowlevel Init ######################## */ +/** + * \defgroup NMSIS_Core_CompilerControl Compiler Control + * \ingroup NMSIS_Core + * \brief Compiler agnostic \#define symbols for generic c/c++ source code + * \details + * + * The NMSIS-Core provides the header file nmsis_compiler.h with consistent \#define symbols for generate C or C++ source files that should be compiler agnostic. + * Each NMSIS compliant compiler should support the functionality described in this section. + * + * The header file nmsis_compiler.h is also included by each Device Header File so that these definitions are available. + * @{ + */ +/* ignore some GCC warnings */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wunused-parameter" + +/* Fallback for __has_builtin */ +#ifndef __has_builtin + #define __has_builtin(x) (0) +#endif + +/* NMSIS compiler specific defines */ +/** \brief Pass information from the compiler to the assembler. */ +#ifndef __ASM + #define __ASM __asm +#endif + +/** \brief Recommend that function should be inlined by the compiler. */ +#ifndef __INLINE + #define __INLINE inline +#endif + +/** \brief Define a static function that may be inlined by the compiler. */ +#ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline +#endif + +/** \brief Define a static function that should be always inlined by the compiler. */ +#ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline +#endif + +/** \brief Inform the compiler that a function does not return. */ +#ifndef __NO_RETURN + #define __NO_RETURN __attribute__((__noreturn__)) +#endif + +/** \brief Inform that a variable shall be retained in executable image. */ +#ifndef __USED + #define __USED __attribute__((used)) +#endif + +/** \brief restrict pointer qualifier to enable additional optimizations. */ +#ifndef __WEAK + #define __WEAK __attribute__((weak)) +#endif + +/** \brief specified the vector size of the variable, measured in bytes */ +#ifndef __VECTOR_SIZE + #define __VECTOR_SIZE(x) __attribute__((vector_size(x))) +#endif + +/** \brief Request smallest possible alignment. */ +#ifndef __PACKED + #define __PACKED __attribute__((packed, aligned(1))) +#endif + +/** \brief Request smallest possible alignment for a structure. */ +#ifndef __PACKED_STRUCT + #define __PACKED_STRUCT struct __attribute__((packed, aligned(1))) +#endif + +/** \brief Request smallest possible alignment for a union. */ +#ifndef __PACKED_UNION + #define __PACKED_UNION union __attribute__((packed, aligned(1))) +#endif + +#ifndef __UNALIGNED_UINT16_WRITE + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + /** \brief Packed struct for unaligned uint16_t write access */ + __PACKED_STRUCT T_UINT16_WRITE { + uint16_t v; + }; + #pragma GCC diagnostic pop + /** \brief Pointer for unaligned write of a uint16_t variable. */ + #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val)) +#endif + +#ifndef __UNALIGNED_UINT16_READ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + /** \brief Packed struct for unaligned uint16_t read access */ + __PACKED_STRUCT T_UINT16_READ { + uint16_t v; + }; + #pragma GCC diagnostic pop + /** \brief Pointer for unaligned read of a uint16_t variable. */ + #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v) +#endif + +#ifndef __UNALIGNED_UINT32_WRITE + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + /** \brief Packed struct for unaligned uint32_t write access */ + __PACKED_STRUCT T_UINT32_WRITE { + uint32_t v; + }; + #pragma GCC diagnostic pop + /** \brief Pointer for unaligned write of a uint32_t variable. */ + #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val)) +#endif + +#ifndef __UNALIGNED_UINT32_READ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + /** \brief Packed struct for unaligned uint32_t read access */ + __PACKED_STRUCT T_UINT32_READ { + uint32_t v; + }; + #pragma GCC diagnostic pop + /** \brief Pointer for unaligned read of a uint32_t variable. */ + #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v) +#endif + +/** \brief Minimum `x` bytes alignment for a variable. */ +#ifndef __ALIGNED + #define __ALIGNED(x) __attribute__((aligned(x))) +#endif + +/** \brief restrict pointer qualifier to enable additional optimizations. */ +#ifndef __RESTRICT + #define __RESTRICT __restrict +#endif + +/** \brief Barrier to prevent compiler from reordering instructions. */ +#ifndef __COMPILER_BARRIER + #define __COMPILER_BARRIER() __ASM volatile("":::"memory") +#endif + +/** \brief provide the compiler with branch prediction information, the branch is usually true */ +#ifndef __USUALLY + #define __USUALLY(exp) __builtin_expect((exp), 1) +#endif + +/** \brief provide the compiler with branch prediction information, the branch is rarely true */ +#ifndef __RARELY + #define __RARELY(exp) __builtin_expect((exp), 0) +#endif + +/** \brief Use this attribute to indicate that the specified function is an interrupt handler. */ +#ifndef __INTERRUPT + #define __INTERRUPT __attribute__((interrupt)) +#endif + +/** @} */ /* End of Doxygen Group NMSIS_Core_CompilerControl */ + +/* IO definitions (access restrictions to peripheral registers) */ +/** + * \defgroup NMSIS_Core_PeriphAccess Peripheral Access + * \brief Naming conventions and optional features for accessing peripherals. + * + * The section below describes the naming conventions, requirements, and optional features + * for accessing device specific peripherals. + * Most of the rules also apply to the core peripherals. + * + * The **Device Header File ** contains typically these definition + * and also includes the core specific header files. + * + * @{ + */ +/** \brief Defines 'read only' permissions */ +#ifdef __cplusplus + #define __I volatile +#else + #define __I volatile const +#endif +/** \brief Defines 'write only' permissions */ +#define __O volatile +/** \brief Defines 'read / write' permissions */ +#define __IO volatile + +/* following defines should be used for structure members */ +/** \brief Defines 'read only' structure member permissions */ +#define __IM volatile const +/** \brief Defines 'write only' structure member permissions */ +#define __OM volatile +/** \brief Defines 'read/write' structure member permissions */ +#define __IOM volatile + +/** + * \brief Mask and shift a bit field value for use in a register bit range. + * \details The macro \ref _VAL2FLD uses the #define's _Pos and _Msk of the related bit + * field to shift bit-field values for assigning to a register. + * + * **Example**: + * \code + * ECLIC->CFG = _VAL2FLD(CLIC_CLICCFG_NLBIT, 3); + * \endcode + * \param[in] field Name of the register bit field. + * \param[in] value Value of the bit field. This parameter is interpreted as an uint32_t type. + * \return Masked and shifted value. + */ +#define _VAL2FLD(field, value) (((uint32_t)(value) << field ## _Pos) & field ## _Msk) + +/** + * \brief Mask and shift a register value to extract a bit filed value. + * \details The macro \ref _FLD2VAL uses the #define's _Pos and _Msk of the related bit + * field to extract the value of a bit field from a register. + * + * **Example**: + * \code + * nlbits = _FLD2VAL(CLIC_CLICCFG_NLBIT, ECLIC->CFG); + * \endcode + * \param[in] field Name of the register bit field. + * \param[in] value Value of register. This parameter is interpreted as an uint32_t type. + * \return Masked and shifted bit field value. + */ +#define _FLD2VAL(field, value) (((uint32_t)(value) & field ## _Msk) >> field ## _Pos) + +/** @} */ /* end of group NMSIS_Core_PeriphAccess */ + + +#ifdef __cplusplus +} +#endif +#endif /* __NMSIS_GCC_H__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_version.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_version.h new file mode 100644 index 00000000..16507998 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_version.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __NMSIS_VERSION_H +#define __NMSIS_VERSION_H + +/** + * \defgroup NMSIS_Core_VersionControl Version Control + * \ingroup NMSIS_Core + * \brief Version \#define symbols for NMSIS release specific C/C++ source code + * \details + * + * We followed the [semantic versioning 2.0.0](https://semver.org/) to control NMSIS version. + * The version format is **MAJOR.MINOR.PATCH**, increment the: + * 1. MAJOR version when you make incompatible API changes, + * 2. MINOR version when you add functionality in a backwards compatible manner, and + * 3. PATCH version when you make backwards compatible bug fixes. + * + * The header file `nmsis_version.h` is included by each core header so that these definitions are available. + * + * **Example Usage for NMSIS Version Check**: + * \code + * #if defined(__NMSIS_VERSION) && (__NMSIS_VERSION >= 0x00010105) + * #warning "Yes, we have NMSIS 1.1.5 or later" + * #else + * #error "We need NMSIS 1.1.5 or later!" + * #endif + * \endcode + * + * @{ + */ + +/*! + * \file nmsis_version.h + * \brief NMSIS Version definitions + **/ + +/** + * \brief Represent the NMSIS major version + * \details + * The NMSIS major version can be used to + * differentiate between NMSIS major releases. + * */ +#define __NMSIS_VERSION_MAJOR (1U) + +/** + * \brief Represent the NMSIS minor version + * \details + * The NMSIS minor version can be used to + * query a NMSIS release update including new features. + * + **/ +#define __NMSIS_VERSION_MINOR (0U) + +/** + * \brief Represent the NMSIS patch version + * \details + * The NMSIS patch version can be used to + * show bug fixes in this package. + **/ +#define __NMSIS_VERSION_PATCH (1U) +/** + * \brief Represent the NMSIS Version + * \details + * NMSIS Version format: **MAJOR.MINOR.PATCH** + * * MAJOR: \ref __NMSIS_VERSION_MAJOR, stored in `bits [31:16]` of \ref __NMSIS_VERSION + * * MINOR: \ref __NMSIS_VERSION_MINOR, stored in `bits [15:8]` of \ref __NMSIS_VERSION + * * PATCH: \ref __NMSIS_VERSION_PATCH, stored in `bits [7:0]` of \ref __NMSIS_VERSION + **/ +#define __NMSIS_VERSION ((__NMSIS_VERSION_MAJOR << 16U) | (__NMSIS_VERSION_MINOR << 8) | __NMSIS_VERSION_PATCH) + +/** @} */ /* End of Doxygen Group NMSIS_Core_VersionControl */ +#endif diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_bits.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_bits.h new file mode 100644 index 00000000..a18c1686 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_bits.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __RISCV_BITS_H__ +#define __RISCV_BITS_H__ + +#ifdef __cplusplus + extern "C" { +#endif + +#if __riscv_xlen == 64 +# define SLL32 sllw +# define STORE sd +# define LOAD ld +# define LWU lwu +# define LOG_REGBYTES 3 +#else +# define SLL32 sll +# define STORE sw +# define LOAD lw +# define LWU lw +# define LOG_REGBYTES 2 +#endif /* __riscv_xlen */ + +#define REGBYTES (1 << LOG_REGBYTES) + +#if __riscv_flen == 64 +# define FPSTORE fsd +# define FPLOAD fld +# define LOG_FPREGBYTES 3 +#else +# define FPSTORE fsw +# define FPLOAD flw +# define LOG_FPREGBYTES 2 +#endif /* __riscv_flen */ +#define FPREGBYTES (1 << LOG_FPREGBYTES) + +#define __rv_likely(x) __builtin_expect((x), 1) +#define __rv_unlikely(x) __builtin_expect((x), 0) + +#define __RV_ROUNDUP(a, b) ((((a)-1)/(b)+1)*(b)) +#define __RV_ROUNDDOWN(a, b) ((a)/(b)*(b)) + +#define __RV_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define __RV_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define __RV_CLAMP(a, lo, hi) MIN(MAX(a, lo), hi) + +#define __RV_EXTRACT_FIELD(val, which) (((val) & (which)) / ((which) & ~((which)-1))) +#define __RV_INSERT_FIELD(val, which, fieldval) (((val) & ~(which)) | ((fieldval) * ((which) & ~((which)-1)))) + +#ifdef __ASSEMBLY__ +#define _AC(X,Y) X +#define _AT(T,X) X +#else +#define __AC(X,Y) (X##Y) +#define _AC(X,Y) __AC(X,Y) +#define _AT(T,X) ((T)(X)) +#endif /* __ASSEMBLY__ */ + +#define _UL(x) (_AC(x, UL)) +#define _ULL(x) (_AC(x, ULL)) + +#define _BITUL(x) (_UL(1) << (x)) +#define _BITULL(x) (_ULL(1) << (x)) + +#define UL(x) (_UL(x)) +#define ULL(x) (_ULL(x)) + +#define STR(x) XSTR(x) +#define XSTR(x) #x +#define __STR(s) #s +#define STRINGIFY(s) __STR(s) + +#ifdef __cplusplus +} +#endif + +#endif /** __RISCV_BITS_H__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_encoding.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_encoding.h new file mode 100644 index 00000000..8b218b0b --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_encoding.h @@ -0,0 +1,617 @@ +/* + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __RISCV_ENCODING_H__ +#define __RISCV_ENCODING_H__ + +#include "riscv_bits.h" +#ifdef __cplusplus + extern "C" { +#endif +/** + * \defgroup NMSIS_Core_CSR_Encoding Core CSR Encodings + * \ingroup NMSIS_Core + * \brief NMSIS Core CSR Encodings + * \details + * + * The following macros are used for CSR encodings + * @{ + */ +#define MSTATUS_UIE 0x00000001 +#define MSTATUS_SIE 0x00000002 +#define MSTATUS_HIE 0x00000004 +#define MSTATUS_MIE 0x00000008 +#define MSTATUS_UPIE 0x00000010 +#define MSTATUS_SPIE 0x00000020 +#define MSTATUS_HPIE 0x00000040 +#define MSTATUS_MPIE 0x00000080 +#define MSTATUS_SPP 0x00000100 +#define MSTATUS_MPP 0x00001800 +#define MSTATUS_FS 0x00006000 +#define MSTATUS_XS 0x00018000 +#define MSTATUS_MPRV 0x00020000 +#define MSTATUS_PUM 0x00040000 +#define MSTATUS_MXR 0x00080000 +#define MSTATUS_VM 0x1F000000 +#define MSTATUS32_SD 0x80000000 +#define MSTATUS64_SD 0x8000000000000000 + +#define MSTATUS_FS_INITIAL 0x00002000 +#define MSTATUS_FS_CLEAN 0x00004000 +#define MSTATUS_FS_DIRTY 0x00006000 + +#define SSTATUS_UIE 0x00000001 +#define SSTATUS_SIE 0x00000002 +#define SSTATUS_UPIE 0x00000010 +#define SSTATUS_SPIE 0x00000020 +#define SSTATUS_SPP 0x00000100 +#define SSTATUS_FS 0x00006000 +#define SSTATUS_XS 0x00018000 +#define SSTATUS_PUM 0x00040000 +#define SSTATUS32_SD 0x80000000 +#define SSTATUS64_SD 0x8000000000000000 + +#define CSR_MCACHE_CTL_IE 0x00000001 +#define CSR_MCACHE_CTL_DE 0x00010000 + +#define DCSR_XDEBUGVER (3U<<30) +#define DCSR_NDRESET (1<<29) +#define DCSR_FULLRESET (1<<28) +#define DCSR_EBREAKM (1<<15) +#define DCSR_EBREAKH (1<<14) +#define DCSR_EBREAKS (1<<13) +#define DCSR_EBREAKU (1<<12) +#define DCSR_STOPCYCLE (1<<10) +#define DCSR_STOPTIME (1<<9) +#define DCSR_CAUSE (7<<6) +#define DCSR_DEBUGINT (1<<5) +#define DCSR_HALT (1<<3) +#define DCSR_STEP (1<<2) +#define DCSR_PRV (3<<0) + +#define DCSR_CAUSE_NONE 0 +#define DCSR_CAUSE_SWBP 1 +#define DCSR_CAUSE_HWBP 2 +#define DCSR_CAUSE_DEBUGINT 3 +#define DCSR_CAUSE_STEP 4 +#define DCSR_CAUSE_HALT 5 + +#define MCONTROL_TYPE(xlen) (0xfULL<<((xlen)-4)) +#define MCONTROL_DMODE(xlen) (1ULL<<((xlen)-5)) +#define MCONTROL_MASKMAX(xlen) (0x3fULL<<((xlen)-11)) + +#define MCONTROL_SELECT (1<<19) +#define MCONTROL_TIMING (1<<18) +#define MCONTROL_ACTION (0x3f<<12) +#define MCONTROL_CHAIN (1<<11) +#define MCONTROL_MATCH (0xf<<7) +#define MCONTROL_M (1<<6) +#define MCONTROL_H (1<<5) +#define MCONTROL_S (1<<4) +#define MCONTROL_U (1<<3) +#define MCONTROL_EXECUTE (1<<2) +#define MCONTROL_STORE (1<<1) +#define MCONTROL_LOAD (1<<0) + +#define MCONTROL_TYPE_NONE 0 +#define MCONTROL_TYPE_MATCH 2 + +#define MCONTROL_ACTION_DEBUG_EXCEPTION 0 +#define MCONTROL_ACTION_DEBUG_MODE 1 +#define MCONTROL_ACTION_TRACE_START 2 +#define MCONTROL_ACTION_TRACE_STOP 3 +#define MCONTROL_ACTION_TRACE_EMIT 4 + +#define MCONTROL_MATCH_EQUAL 0 +#define MCONTROL_MATCH_NAPOT 1 +#define MCONTROL_MATCH_GE 2 +#define MCONTROL_MATCH_LT 3 +#define MCONTROL_MATCH_MASK_LOW 4 +#define MCONTROL_MATCH_MASK_HIGH 5 + +#define MIP_SSIP (1 << IRQ_S_SOFT) +#define MIP_HSIP (1 << IRQ_H_SOFT) +#define MIP_MSIP (1 << IRQ_M_SOFT) +#define MIP_STIP (1 << IRQ_S_TIMER) +#define MIP_HTIP (1 << IRQ_H_TIMER) +#define MIP_MTIP (1 << IRQ_M_TIMER) +#define MIP_SEIP (1 << IRQ_S_EXT) +#define MIP_HEIP (1 << IRQ_H_EXT) +#define MIP_MEIP (1 << IRQ_M_EXT) + +#define MIE_SSIE MIP_SSIP +#define MIE_HSIE MIP_HSIP +#define MIE_MSIE MIP_MSIP +#define MIE_STIE MIP_STIP +#define MIE_HTIE MIP_HTIP +#define MIE_MTIE MIP_MTIP +#define MIE_SEIE MIP_SEIP +#define MIE_HEIE MIP_HEIP +#define MIE_MEIE MIP_MEIP + +/* === Nuclei custom CSR bit mask === */ + +#define WFE_WFE (0x1) +#define TXEVT_TXEVT (0x1) +#define SLEEPVALUE_SLEEPVALUE (0x1) + +#define MCOUNTINHIBIT_IR (1<<2) +#define MCOUNTINHIBIT_CY (1<<0) + +#define MILM_CTL_ILM_BPA (((1ULL<<((__riscv_xlen)-10))-1)<<10) +#define MILM_CTL_ILM_EN (1<<0) + +#define MDLM_CTL_DLM_BPA (((1ULL<<((__riscv_xlen)-10))-1)<<10) +#define MDLM_CTL_DLM_EN (1<<0) + +#define MSUBM_PTYP (0x3<<8) +#define MSUBM_TYP (0x3<<6) + +#define MDCAUSE_MDCAUSE (0x3) + +#define MMISC_CTL_NMI_CAUSE_FFF (1<<9) +#define MMISC_CTL_MISALIGN (1<<6) +#define MMISC_CTL_BPU (1<<3) + +#define MCACHE_CTL_IC_EN (1<<0) +#define MCACHE_CTL_IC_SCPD_MOD (1<<1) +#define MCACHE_CTL_DC_EN (1<<16) + +#define MTVT2_MTVT2EN (1<<0) +#define MTVT2_COMMON_CODE_ENTRY (((1ULL<<((__riscv_xlen)-2))-1)<<2) + +#define MCFG_INFO_TEE (1<<0) +#define MCFG_INFO_ECC (1<<1) +#define MCFG_INFO_CLIC (1<<2) +#define MCFG_INFO_PLIC (1<<3) +#define MCFG_INFO_FIO (1<<4) +#define MCFG_INFO_PPI (1<<5) +#define MCFG_INFO_NICE (1<<6) +#define MCFG_INFO_ILM (1<<7) +#define MCFG_INFO_DLM (1<<8) +#define MCFG_INFO_ICACHE (1<<9) +#define MCFG_INFO_DCACHE (1<<10) + +#define MICFG_IC_SET (0xF<<0) +#define MICFG_IC_WAY (0x7<<4) +#define MICFG_IC_LSIZE (0x7<<7) +#define MICFG_ILM_SIZE (0x1F<<16) +#define MICFG_ILM_XONLY (1<<21) + +#define MDCFG_DC_SET (0xF<<0) +#define MDCFG_DC_WAY (0x7<<4) +#define MDCFG_DC_LSIZE (0x7<<7) +#define MDCFG_DLM_SIZE (0x1F<<16) + +#define MPPICFG_INFO_PPI_SIZE (0x1F<<1) +#define MPPICFG_INFO_PPI_BPA (((1ULL<<((__riscv_xlen)-10))-1)<<10) + +#define MFIOCFG_INFO_FIO_SIZE (0x1F<<1) +#define MFIOCFG_INFO_FIO_BPA (((1ULL<<((__riscv_xlen)-10))-1)<<10) + +#define SIP_SSIP MIP_SSIP +#define SIP_STIP MIP_STIP + +#define PRV_U 0 +#define PRV_S 1 +#define PRV_H 2 +#define PRV_M 3 + +#define VM_MBARE 0 +#define VM_MBB 1 +#define VM_MBBID 2 +#define VM_SV32 8 +#define VM_SV39 9 +#define VM_SV48 10 + +#define IRQ_S_SOFT 1 +#define IRQ_H_SOFT 2 +#define IRQ_M_SOFT 3 +#define IRQ_S_TIMER 5 +#define IRQ_H_TIMER 6 +#define IRQ_M_TIMER 7 +#define IRQ_S_EXT 9 +#define IRQ_H_EXT 10 +#define IRQ_M_EXT 11 +#define IRQ_COP 12 +#define IRQ_HOST 13 + +#define DEFAULT_RSTVEC 0x00001000 +#define DEFAULT_NMIVEC 0x00001004 +#define DEFAULT_MTVEC 0x00001010 +#define CONFIG_STRING_ADDR 0x0000100C +#define EXT_IO_BASE 0x40000000 +#define DRAM_BASE 0x80000000 + +/* === FPU FRM Rounding Mode === */ +/** FPU Round to Nearest, ties to Even*/ +#define FRM_RNDMODE_RNE 0x0 +/** FPU Round Towards Zero */ +#define FRM_RNDMODE_RTZ 0x1 +/** FPU Round Down (towards -inf) */ +#define FRM_RNDMODE_RDN 0x2 +/** FPU Round Up (towards +inf) */ +#define FRM_RNDMODE_RUP 0x3 +/** FPU Round to nearest, ties to Max Magnitude */ +#define FRM_RNDMODE_RMM 0x4 +/** + * In instruction's rm, selects dynamic rounding mode. + * In Rounding Mode register, Invalid */ +#define FRM_RNDMODE_DYN 0x7 + +/* === FPU FFLAGS Accrued Exceptions === */ +/** FPU Inexact */ +#define FFLAGS_AE_NX (1<<0) +/** FPU Underflow */ +#define FFLAGS_AE_UF (1<<1) +/** FPU Overflow */ +#define FFLAGS_AE_OF (1<<2) +/** FPU Divide by Zero */ +#define FFLAGS_AE_DZ (1<<3) +/** FPU Invalid Operation */ +#define FFLAGS_AE_NV (1<<4) + +/** Floating Point Register f0-f31, eg. f0 -> FREG(0) */ +#define FREG(idx) f##idx + + +/* === PMP CFG Bits === */ +#define PMP_R 0x01 +#define PMP_W 0x02 +#define PMP_X 0x04 +#define PMP_A 0x18 +#define PMP_A_TOR 0x08 +#define PMP_A_NA4 0x10 +#define PMP_A_NAPOT 0x18 +#define PMP_L 0x80 + +#define PMP_SHIFT 2 +#define PMP_COUNT 16 + +// page table entry (PTE) fields +#define PTE_V 0x001 // Valid +#define PTE_R 0x002 // Read +#define PTE_W 0x004 // Write +#define PTE_X 0x008 // Execute +#define PTE_U 0x010 // User +#define PTE_G 0x020 // Global +#define PTE_A 0x040 // Accessed +#define PTE_D 0x080 // Dirty +#define PTE_SOFT 0x300 // Reserved for Software + +#define PTE_PPN_SHIFT 10 + +#define PTE_TABLE(PTE) (((PTE) & (PTE_V | PTE_R | PTE_W | PTE_X)) == PTE_V) + +#ifdef __riscv + +#ifdef __riscv64 +# define MSTATUS_SD MSTATUS64_SD +# define SSTATUS_SD SSTATUS64_SD +# define RISCV_PGLEVEL_BITS 9 +#else +# define MSTATUS_SD MSTATUS32_SD +# define SSTATUS_SD SSTATUS32_SD +# define RISCV_PGLEVEL_BITS 10 +#endif /* __riscv64 */ + +#define RISCV_PGSHIFT 12 +#define RISCV_PGSIZE (1 << RISCV_PGSHIFT) + +#endif /* __riscv */ + +#define DOWNLOAD_MODE_FLASHXIP 0 +#define DOWNLOAD_MODE_FLASH 1 +#define DOWNLOAD_MODE_ILM 2 +#define DOWNLOAD_MODE_DDR 3 + +/** + * \defgroup NMSIS_Core_CSR_Registers Core CSR Registers + * \ingroup NMSIS_Core + * \brief NMSIS Core CSR Register Definitions + * \details + * + * The following macros are used for CSR Register Defintions. + * @{ + */ +/* === Standard RISC-V CSR Registers === */ +#define CSR_USTATUS 0x0 +#define CSR_FFLAGS 0x1 +#define CSR_FRM 0x2 +#define CSR_FCSR 0x3 +#define CSR_CYCLE 0xc00 +#define CSR_TIME 0xc01 +#define CSR_INSTRET 0xc02 +#define CSR_HPMCOUNTER3 0xc03 +#define CSR_HPMCOUNTER4 0xc04 +#define CSR_HPMCOUNTER5 0xc05 +#define CSR_HPMCOUNTER6 0xc06 +#define CSR_HPMCOUNTER7 0xc07 +#define CSR_HPMCOUNTER8 0xc08 +#define CSR_HPMCOUNTER9 0xc09 +#define CSR_HPMCOUNTER10 0xc0a +#define CSR_HPMCOUNTER11 0xc0b +#define CSR_HPMCOUNTER12 0xc0c +#define CSR_HPMCOUNTER13 0xc0d +#define CSR_HPMCOUNTER14 0xc0e +#define CSR_HPMCOUNTER15 0xc0f +#define CSR_HPMCOUNTER16 0xc10 +#define CSR_HPMCOUNTER17 0xc11 +#define CSR_HPMCOUNTER18 0xc12 +#define CSR_HPMCOUNTER19 0xc13 +#define CSR_HPMCOUNTER20 0xc14 +#define CSR_HPMCOUNTER21 0xc15 +#define CSR_HPMCOUNTER22 0xc16 +#define CSR_HPMCOUNTER23 0xc17 +#define CSR_HPMCOUNTER24 0xc18 +#define CSR_HPMCOUNTER25 0xc19 +#define CSR_HPMCOUNTER26 0xc1a +#define CSR_HPMCOUNTER27 0xc1b +#define CSR_HPMCOUNTER28 0xc1c +#define CSR_HPMCOUNTER29 0xc1d +#define CSR_HPMCOUNTER30 0xc1e +#define CSR_HPMCOUNTER31 0xc1f +#define CSR_SSTATUS 0x100 +#define CSR_SIE 0x104 +#define CSR_STVEC 0x105 +#define CSR_SSCRATCH 0x140 +#define CSR_SEPC 0x141 +#define CSR_SCAUSE 0x142 +#define CSR_SBADADDR 0x143 +#define CSR_SIP 0x144 +#define CSR_SPTBR 0x180 +#define CSR_MSTATUS 0x300 +#define CSR_MISA 0x301 +#define CSR_MEDELEG 0x302 +#define CSR_MIDELEG 0x303 +#define CSR_MIE 0x304 +#define CSR_MTVEC 0x305 +#define CSR_MCOUNTEREN 0x306 +#define CSR_MSCRATCH 0x340 +#define CSR_MEPC 0x341 +#define CSR_MCAUSE 0x342 +#define CSR_MBADADDR 0x343 +#define CSR_MTVAL 0x343 +#define CSR_MIP 0x344 +#define CSR_PMPCFG0 0x3a0 +#define CSR_PMPCFG1 0x3a1 +#define CSR_PMPCFG2 0x3a2 +#define CSR_PMPCFG3 0x3a3 +#define CSR_PMPADDR0 0x3b0 +#define CSR_PMPADDR1 0x3b1 +#define CSR_PMPADDR2 0x3b2 +#define CSR_PMPADDR3 0x3b3 +#define CSR_PMPADDR4 0x3b4 +#define CSR_PMPADDR5 0x3b5 +#define CSR_PMPADDR6 0x3b6 +#define CSR_PMPADDR7 0x3b7 +#define CSR_PMPADDR8 0x3b8 +#define CSR_PMPADDR9 0x3b9 +#define CSR_PMPADDR10 0x3ba +#define CSR_PMPADDR11 0x3bb +#define CSR_PMPADDR12 0x3bc +#define CSR_PMPADDR13 0x3bd +#define CSR_PMPADDR14 0x3be +#define CSR_PMPADDR15 0x3bf +#define CSR_TSELECT 0x7a0 +#define CSR_TDATA1 0x7a1 +#define CSR_TDATA2 0x7a2 +#define CSR_TDATA3 0x7a3 +#define CSR_DCSR 0x7b0 +#define CSR_DPC 0x7b1 +#define CSR_DSCRATCH 0x7b2 +#define CSR_MCYCLE 0xb00 +#define CSR_MINSTRET 0xb02 +#define CSR_MHPMCOUNTER3 0xb03 +#define CSR_MHPMCOUNTER4 0xb04 +#define CSR_MHPMCOUNTER5 0xb05 +#define CSR_MHPMCOUNTER6 0xb06 +#define CSR_MHPMCOUNTER7 0xb07 +#define CSR_MHPMCOUNTER8 0xb08 +#define CSR_MHPMCOUNTER9 0xb09 +#define CSR_MHPMCOUNTER10 0xb0a +#define CSR_MHPMCOUNTER11 0xb0b +#define CSR_MHPMCOUNTER12 0xb0c +#define CSR_MHPMCOUNTER13 0xb0d +#define CSR_MHPMCOUNTER14 0xb0e +#define CSR_MHPMCOUNTER15 0xb0f +#define CSR_MHPMCOUNTER16 0xb10 +#define CSR_MHPMCOUNTER17 0xb11 +#define CSR_MHPMCOUNTER18 0xb12 +#define CSR_MHPMCOUNTER19 0xb13 +#define CSR_MHPMCOUNTER20 0xb14 +#define CSR_MHPMCOUNTER21 0xb15 +#define CSR_MHPMCOUNTER22 0xb16 +#define CSR_MHPMCOUNTER23 0xb17 +#define CSR_MHPMCOUNTER24 0xb18 +#define CSR_MHPMCOUNTER25 0xb19 +#define CSR_MHPMCOUNTER26 0xb1a +#define CSR_MHPMCOUNTER27 0xb1b +#define CSR_MHPMCOUNTER28 0xb1c +#define CSR_MHPMCOUNTER29 0xb1d +#define CSR_MHPMCOUNTER30 0xb1e +#define CSR_MHPMCOUNTER31 0xb1f +#define CSR_MUCOUNTEREN 0x320 +#define CSR_MSCOUNTEREN 0x321 +#define CSR_MHPMEVENT3 0x323 +#define CSR_MHPMEVENT4 0x324 +#define CSR_MHPMEVENT5 0x325 +#define CSR_MHPMEVENT6 0x326 +#define CSR_MHPMEVENT7 0x327 +#define CSR_MHPMEVENT8 0x328 +#define CSR_MHPMEVENT9 0x329 +#define CSR_MHPMEVENT10 0x32a +#define CSR_MHPMEVENT11 0x32b +#define CSR_MHPMEVENT12 0x32c +#define CSR_MHPMEVENT13 0x32d +#define CSR_MHPMEVENT14 0x32e +#define CSR_MHPMEVENT15 0x32f +#define CSR_MHPMEVENT16 0x330 +#define CSR_MHPMEVENT17 0x331 +#define CSR_MHPMEVENT18 0x332 +#define CSR_MHPMEVENT19 0x333 +#define CSR_MHPMEVENT20 0x334 +#define CSR_MHPMEVENT21 0x335 +#define CSR_MHPMEVENT22 0x336 +#define CSR_MHPMEVENT23 0x337 +#define CSR_MHPMEVENT24 0x338 +#define CSR_MHPMEVENT25 0x339 +#define CSR_MHPMEVENT26 0x33a +#define CSR_MHPMEVENT27 0x33b +#define CSR_MHPMEVENT28 0x33c +#define CSR_MHPMEVENT29 0x33d +#define CSR_MHPMEVENT30 0x33e +#define CSR_MHPMEVENT31 0x33f +#define CSR_MVENDORID 0xf11 +#define CSR_MARCHID 0xf12 +#define CSR_MIMPID 0xf13 +#define CSR_MHARTID 0xf14 +#define CSR_CYCLEH 0xc80 +#define CSR_TIMEH 0xc81 +#define CSR_INSTRETH 0xc82 +#define CSR_HPMCOUNTER3H 0xc83 +#define CSR_HPMCOUNTER4H 0xc84 +#define CSR_HPMCOUNTER5H 0xc85 +#define CSR_HPMCOUNTER6H 0xc86 +#define CSR_HPMCOUNTER7H 0xc87 +#define CSR_HPMCOUNTER8H 0xc88 +#define CSR_HPMCOUNTER9H 0xc89 +#define CSR_HPMCOUNTER10H 0xc8a +#define CSR_HPMCOUNTER11H 0xc8b +#define CSR_HPMCOUNTER12H 0xc8c +#define CSR_HPMCOUNTER13H 0xc8d +#define CSR_HPMCOUNTER14H 0xc8e +#define CSR_HPMCOUNTER15H 0xc8f +#define CSR_HPMCOUNTER16H 0xc90 +#define CSR_HPMCOUNTER17H 0xc91 +#define CSR_HPMCOUNTER18H 0xc92 +#define CSR_HPMCOUNTER19H 0xc93 +#define CSR_HPMCOUNTER20H 0xc94 +#define CSR_HPMCOUNTER21H 0xc95 +#define CSR_HPMCOUNTER22H 0xc96 +#define CSR_HPMCOUNTER23H 0xc97 +#define CSR_HPMCOUNTER24H 0xc98 +#define CSR_HPMCOUNTER25H 0xc99 +#define CSR_HPMCOUNTER26H 0xc9a +#define CSR_HPMCOUNTER27H 0xc9b +#define CSR_HPMCOUNTER28H 0xc9c +#define CSR_HPMCOUNTER29H 0xc9d +#define CSR_HPMCOUNTER30H 0xc9e +#define CSR_HPMCOUNTER31H 0xc9f +#define CSR_MCYCLEH 0xb80 +#define CSR_MINSTRETH 0xb82 +#define CSR_MHPMCOUNTER3H 0xb83 +#define CSR_MHPMCOUNTER4H 0xb84 +#define CSR_MHPMCOUNTER5H 0xb85 +#define CSR_MHPMCOUNTER6H 0xb86 +#define CSR_MHPMCOUNTER7H 0xb87 +#define CSR_MHPMCOUNTER8H 0xb88 +#define CSR_MHPMCOUNTER9H 0xb89 +#define CSR_MHPMCOUNTER10H 0xb8a +#define CSR_MHPMCOUNTER11H 0xb8b +#define CSR_MHPMCOUNTER12H 0xb8c +#define CSR_MHPMCOUNTER13H 0xb8d +#define CSR_MHPMCOUNTER14H 0xb8e +#define CSR_MHPMCOUNTER15H 0xb8f +#define CSR_MHPMCOUNTER16H 0xb90 +#define CSR_MHPMCOUNTER17H 0xb91 +#define CSR_MHPMCOUNTER18H 0xb92 +#define CSR_MHPMCOUNTER19H 0xb93 +#define CSR_MHPMCOUNTER20H 0xb94 +#define CSR_MHPMCOUNTER21H 0xb95 +#define CSR_MHPMCOUNTER22H 0xb96 +#define CSR_MHPMCOUNTER23H 0xb97 +#define CSR_MHPMCOUNTER24H 0xb98 +#define CSR_MHPMCOUNTER25H 0xb99 +#define CSR_MHPMCOUNTER26H 0xb9a +#define CSR_MHPMCOUNTER27H 0xb9b +#define CSR_MHPMCOUNTER28H 0xb9c +#define CSR_MHPMCOUNTER29H 0xb9d +#define CSR_MHPMCOUNTER30H 0xb9e +#define CSR_MHPMCOUNTER31H 0xb9f + +/* === CLIC CSR Registers === */ +#define CSR_MTVT 0x307 +#define CSR_MNXTI 0x345 +#define CSR_MINTSTATUS 0x346 +#define CSR_MSCRATCHCSW 0x348 +#define CSR_MSCRATCHCSWL 0x349 +#define CSR_MCLICBASE 0x350 + +/* === Nuclei custom CSR Registers === */ +#define CSR_MCOUNTINHIBIT 0x320 +#define CSR_MILM_CTL 0x7C0 +#define CSR_MDLM_CTL 0x7C1 +#define CSR_MNVEC 0x7C3 +#define CSR_MSUBM 0x7C4 +#define CSR_MDCAUSE 0x7C9 +#define CSR_MCACHE_CTL 0x7CA +#define CSR_MMISC_CTL 0x7D0 +#define CSR_MSAVESTATUS 0x7D6 +#define CSR_MSAVEEPC1 0x7D7 +#define CSR_MSAVECAUSE1 0x7D8 +#define CSR_MSAVEEPC2 0x7D9 +#define CSR_MSAVECAUSE2 0x7DA +#define CSR_MSAVEDCAUSE1 0x7DB +#define CSR_MSAVEDCAUSE2 0x7DC +#define CSR_PUSHMSUBM 0x7EB +#define CSR_MTVT2 0x7EC +#define CSR_JALMNXTI 0x7ED +#define CSR_PUSHMCAUSE 0x7EE +#define CSR_PUSHMEPC 0x7EF +#define CSR_MPPICFG_INFO 0x7F0 +#define CSR_MFIOCFG_INFO 0x7F1 +#define CSR_SLEEPVALUE 0x811 +#define CSR_TXEVT 0x812 +#define CSR_WFE 0x810 +#define CSR_MICFG_INFO 0xFC0 +#define CSR_MDCFG_INFO 0xFC1 +#define CSR_MCFG_INFO 0xFC2 + +/** @} */ /** End of Doxygen Group NMSIS_Core_CSR_Registers **/ + +/* Exception Code in MCAUSE CSR */ +#define CAUSE_MISALIGNED_FETCH 0x0 +#define CAUSE_FAULT_FETCH 0x1 +#define CAUSE_ILLEGAL_INSTRUCTION 0x2 +#define CAUSE_BREAKPOINT 0x3 +#define CAUSE_MISALIGNED_LOAD 0x4 +#define CAUSE_FAULT_LOAD 0x5 +#define CAUSE_MISALIGNED_STORE 0x6 +#define CAUSE_FAULT_STORE 0x7 +#define CAUSE_USER_ECALL 0x8 +#define CAUSE_SUPERVISOR_ECALL 0x9 +#define CAUSE_HYPERVISOR_ECALL 0xa +#define CAUSE_MACHINE_ECALL 0xb + +/* Exception Subcode in MDCAUSE CSR */ +#define DCAUSE_FAULT_FETCH_PMP 0x1 +#define DCAUSE_FAULT_FETCH_INST 0x2 + +#define DCAUSE_FAULT_LOAD_PMP 0x1 +#define DCAUSE_FAULT_LOAD_INST 0x2 +#define DCAUSE_FAULT_LOAD_NICE 0x3 + +#define DCAUSE_FAULT_STORE_PMP 0x1 +#define DCAUSE_FAULT_STORE_INST 0x2 + +/** @} */ /** End of Doxygen Group NMSIS_Core_CSR_Encoding **/ + +#ifdef __cplusplus +} +#endif +#endif /* __RISCV_ENCODING_H__ */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_common_tables.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_common_tables.h new file mode 100644 index 00000000..c7bfb466 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_common_tables.h @@ -0,0 +1,379 @@ +/* ---------------------------------------------------------------------- + * Project: NMSIS DSP Library + * Title: riscv_common_tables.h + * Description: Extern declaration for common tables + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: RISC-V Cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _RISCV_COMMON_TABLES_H +#define _RISCV_COMMON_TABLES_H + +#include "riscv_math.h" + +#if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_FFT_ALLOW_TABLES) + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREV_1024) + extern const uint16_t riscvBitRevTable[1024]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_16) + extern const float32_t twiddleCoef_16[32]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_32) + extern const float32_t twiddleCoef_32[64]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_64) + extern const float32_t twiddleCoef_64[128]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_128) + extern const float32_t twiddleCoef_128[256]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_256) + extern const float32_t twiddleCoef_256[512]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_512) + extern const float32_t twiddleCoef_512[1024]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_1024) + extern const float32_t twiddleCoef_1024[2048]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_2048) + extern const float32_t twiddleCoef_2048[4096]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_4096) + extern const float32_t twiddleCoef_4096[8192]; + #define twiddleCoef twiddleCoef_4096 + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_16) + extern const q31_t twiddleCoef_16_q31[24]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_32) + extern const q31_t twiddleCoef_32_q31[48]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_64) + extern const q31_t twiddleCoef_64_q31[96]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_128) + extern const q31_t twiddleCoef_128_q31[192]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_256) + extern const q31_t twiddleCoef_256_q31[384]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_512) + extern const q31_t twiddleCoef_512_q31[768]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_1024) + extern const q31_t twiddleCoef_1024_q31[1536]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_2048) + extern const q31_t twiddleCoef_2048_q31[3072]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_4096) + extern const q31_t twiddleCoef_4096_q31[6144]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_16) + extern const q15_t twiddleCoef_16_q15[24]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_32) + extern const q15_t twiddleCoef_32_q15[48]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_64) + extern const q15_t twiddleCoef_64_q15[96]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_128) + extern const q15_t twiddleCoef_128_q15[192]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_256) + extern const q15_t twiddleCoef_256_q15[384]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_512) + extern const q15_t twiddleCoef_512_q15[768]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_1024) + extern const q15_t twiddleCoef_1024_q15[1536]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_2048) + extern const q15_t twiddleCoef_2048_q15[3072]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_4096) + extern const q15_t twiddleCoef_4096_q15[6144]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_32) + extern const float32_t twiddleCoef_rfft_32[32]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_64) + extern const float32_t twiddleCoef_rfft_64[64]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_128) + extern const float32_t twiddleCoef_rfft_128[128]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_256) + extern const float32_t twiddleCoef_rfft_256[256]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_512) + extern const float32_t twiddleCoef_rfft_512[512]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_1024) + extern const float32_t twiddleCoef_rfft_1024[1024]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_2048) + extern const float32_t twiddleCoef_rfft_2048[2048]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_4096) + extern const float32_t twiddleCoef_rfft_4096[4096]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + /* floating-point bit reversal tables */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_16) + #define RISCVBITREVINDEXTABLE_16_TABLE_LENGTH ((uint16_t)20) + extern const uint16_t riscvBitRevIndexTable16[RISCVBITREVINDEXTABLE_16_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_32) + #define RISCVBITREVINDEXTABLE_32_TABLE_LENGTH ((uint16_t)48) + extern const uint16_t riscvBitRevIndexTable32[RISCVBITREVINDEXTABLE_32_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_64) + #define RISCVBITREVINDEXTABLE_64_TABLE_LENGTH ((uint16_t)56) + extern const uint16_t riscvBitRevIndexTable64[RISCVBITREVINDEXTABLE_64_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_128) + #define RISCVBITREVINDEXTABLE_128_TABLE_LENGTH ((uint16_t)208) + extern const uint16_t riscvBitRevIndexTable128[RISCVBITREVINDEXTABLE_128_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_256) + #define RISCVBITREVINDEXTABLE_256_TABLE_LENGTH ((uint16_t)440) + extern const uint16_t riscvBitRevIndexTable256[RISCVBITREVINDEXTABLE_256_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_512) + #define RISCVBITREVINDEXTABLE_512_TABLE_LENGTH ((uint16_t)448) + extern const uint16_t riscvBitRevIndexTable512[RISCVBITREVINDEXTABLE_512_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_1024) + #define RISCVBITREVINDEXTABLE_1024_TABLE_LENGTH ((uint16_t)1800) + extern const uint16_t riscvBitRevIndexTable1024[RISCVBITREVINDEXTABLE_1024_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_2048) + #define RISCVBITREVINDEXTABLE_2048_TABLE_LENGTH ((uint16_t)3808) + extern const uint16_t riscvBitRevIndexTable2048[RISCVBITREVINDEXTABLE_2048_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_4096) + #define RISCVBITREVINDEXTABLE_4096_TABLE_LENGTH ((uint16_t)4032) + extern const uint16_t riscvBitRevIndexTable4096[RISCVBITREVINDEXTABLE_4096_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + + /* fixed-point bit reversal tables */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_16) + #define RISCVBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH ((uint16_t)12) + extern const uint16_t riscvBitRevIndexTable_fixed_16[RISCVBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_32) + #define RISCVBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH ((uint16_t)24) + extern const uint16_t riscvBitRevIndexTable_fixed_32[RISCVBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_64) + #define RISCVBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH ((uint16_t)56) + extern const uint16_t riscvBitRevIndexTable_fixed_64[RISCVBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_128) + #define RISCVBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH ((uint16_t)112) + extern const uint16_t riscvBitRevIndexTable_fixed_128[RISCVBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_256) + #define RISCVBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH ((uint16_t)240) + extern const uint16_t riscvBitRevIndexTable_fixed_256[RISCVBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_512) + #define RISCVBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH ((uint16_t)480) + extern const uint16_t riscvBitRevIndexTable_fixed_512[RISCVBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_1024) + #define RISCVBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH ((uint16_t)992) + extern const uint16_t riscvBitRevIndexTable_fixed_1024[RISCVBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_2048) + #define RISCVBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH ((uint16_t)1984) + extern const uint16_t riscvBitRevIndexTable_fixed_2048[RISCVBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_4096) + #define RISCVBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH ((uint16_t)4032) + extern const uint16_t riscvBitRevIndexTable_fixed_4096[RISCVBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_REALCOEF_F32) + extern const float32_t realCoefA[8192]; + extern const float32_t realCoefB[8192]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_REALCOEF_Q31) + extern const q31_t realCoefAQ31[8192]; + extern const q31_t realCoefBQ31[8192]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_REALCOEF_Q15) + extern const q15_t realCoefAQ15[8192]; + extern const q15_t realCoefBQ15[8192]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_F32_128) + extern const float32_t Weights_128[256]; + extern const float32_t cos_factors_128[128]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_F32_512) + extern const float32_t Weights_512[1024]; + extern const float32_t cos_factors_512[512]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_F32_2048) + extern const float32_t Weights_2048[4096]; + extern const float32_t cos_factors_2048[2048]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_F32_8192) + extern const float32_t Weights_8192[16384]; + extern const float32_t cos_factors_8192[8192]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q15_128) + extern const q15_t WeightsQ15_128[256]; + extern const q15_t cos_factorsQ15_128[128]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q15_512) + extern const q15_t WeightsQ15_512[1024]; + extern const q15_t cos_factorsQ15_512[512]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q15_2048) + extern const q15_t WeightsQ15_2048[4096]; + extern const q15_t cos_factorsQ15_2048[2048]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q15_8192) + extern const q15_t WeightsQ15_8192[16384]; + extern const q15_t cos_factorsQ15_8192[8192]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q31_128) + extern const q31_t WeightsQ31_128[256]; + extern const q31_t cos_factorsQ31_128[128]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q31_512) + extern const q31_t WeightsQ31_512[1024]; + extern const q31_t cos_factorsQ31_512[512]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q31_2048) + extern const q31_t WeightsQ31_2048[4096]; + extern const q31_t cos_factorsQ31_2048[2048]; + #endif + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q31_8192) + extern const q31_t WeightsQ31_8192[16384]; + extern const q31_t cos_factorsQ31_8192[8192]; + #endif + +#endif /* if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_FFT_TABLES) */ + +#if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_FAST_ALLOW_TABLES) + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_RECIP_Q15) + extern const q15_t riscvRecipTableQ15[64]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) defined(RISCV_ALL_FAST_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_RECIP_Q31) + extern const q31_t riscvRecipTableQ31[64]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) defined(RISCV_ALL_FAST_TABLES) */ + + /* Tables for Fast Math Sine and Cosine */ + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_F32) + extern const float32_t sinTable_f32[FAST_MATH_TABLE_SIZE + 1]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) defined(RISCV_ALL_FAST_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_Q31) + extern const q31_t sinTable_q31[FAST_MATH_TABLE_SIZE + 1]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) defined(RISCV_ALL_FAST_TABLES) */ + + #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_Q15) + extern const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1]; + #endif /* !defined(RISCV_DSP_CONFIG_TABLES) defined(RISCV_ALL_FAST_TABLES) */ + +#endif /* if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_FAST_TABLES) */ + +#endif /* RISCV_COMMON_TABLES_H */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_const_structs.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_const_structs.h new file mode 100644 index 00000000..471baef7 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_const_structs.h @@ -0,0 +1,67 @@ +/* ---------------------------------------------------------------------- + * Project: NMSIS DSP Library + * Title: riscv_const_structs.h + * Description: Constant structs that are initialized for user convenience. + * For example, some can be given as arguments to the riscv_cfft_f32() function. + * + * $Date: 27. January 2017 + * $Revision: V.1.5.1 + * + * Target Processor: RISC-V Cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _RISCV_CONST_STRUCTS_H +#define _RISCV_CONST_STRUCTS_H + +#include "riscv_math.h" +#include "riscv_common_tables.h" + + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len16; + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len32; + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len64; + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len128; + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len256; + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len512; + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len1024; + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len2048; + extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len4096; + + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len16; + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len32; + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len64; + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len128; + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len256; + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len512; + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len1024; + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len2048; + extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len4096; + + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len16; + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len32; + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len64; + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len128; + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len256; + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len512; + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len1024; + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len2048; + extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len4096; + +#endif diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_math.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_math.h new file mode 100644 index 00000000..d561d102 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_math.h @@ -0,0 +1,7386 @@ +/****************************************************************************** + * @file riscv_math.h + * @brief Public header file for NMSIS DSP Library + * @version V1.6.0 + * @date 18. March 2019 + ******************************************************************************/ +/* + * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + \mainpage NMSIS DSP Software Library + * + * Introduction + * ------------ + * + * This user manual describes the NMSIS DSP software library, + * a suite of common signal processing functions for use on Nuclei N/NX processor based devices. + * + * The library is divided into a number of functions each covering a specific category: + * - Basic math functions + * - Fast math functions + * - Complex math functions + * - Filters + * - Matrix functions + * - Transform functions + * - Motor control functions + * - Statistical functions + * - Support functions + * - Interpolation functions + * + * The library has separate functions for operating on 8-bit integers, 16-bit integers, + * 32-bit integer and 32-bit floating-point values. + * + * The library functions are declared in the public file riscv_math.h which is placed in the Include folder. + * Simply include this file and link the appropriate library in the application and begin calling the library functions. + * The Library supports single public header file riscv_math.h for Nuclei N cores with little endian. + * Same header file will be used for floating point unit(FPU) variants. + * + * \note Please refer to [NMSIS-DSP](../../../dsp/index.html) + * + * Examples + * -------- + * + * The library ships with a number of examples which demonstrate how to use the library functions. + * + * Toolchain Support + * ----------------- + * + * The library has been developed and tested with nuclei riscv gcc toolchain. + * + * Building the Library + * -------------------- + * + * In NMSIS repo, it contains a Makefile to rebuild libraries on nuclei riscv gcc toolchain in the NMSIS/ folder. + * * In *NMSIS* folder, you can run `make gen_dsp_lib` to build and install DSP library into **NMSIS/Library/DSP/GCC** folder. + * + * Preprocessor Macros + * ------------------- + * + * Each library project have different preprocessor macros. + * + * - RISCV_MATH_MATRIX_CHECK: + * + * Define macro RISCV_MATH_MATRIX_CHECK for checking on the input and output sizes of matrices + * + * - RISCV_MATH_ROUNDING: + * + * Define macro RISCV_MATH_ROUNDING for rounding on support functions + * + * - RISCV_MATH_LOOPUNROLL: + * + * Define macro RISCV_MATH_LOOPUNROLL to enable manual loop unrolling in DSP functions + * + */ + + +/** + * @defgroup groupMath Basic Math Functions + */ + +/** + * @defgroup groupFastMath Fast Math Functions + * This set of functions provides a fast approximation to sine, cosine, and square root. + * As compared to most of the other functions in the NMSIS math library, the fast math functions + * operate on individual values and not arrays. + * There are separate functions for Q15, Q31, and floating-point data. + * + */ + +/** + * @defgroup groupCmplxMath Complex Math Functions + * This set of functions operates on complex data vectors. + * The data in the complex arrays is stored in an interleaved fashion + * (real, imag, real, imag, ...). + * In the API functions, the number of samples in a complex array refers + * to the number of complex values; the array contains twice this number of + * real values. + */ + +/** + * @defgroup groupFilters Filtering Functions + */ + +/** + * @defgroup groupMatrix Matrix Functions + * + * This set of functions provides basic matrix math operations. + * The functions operate on matrix data structures. For example, + * the type + * definition for the floating-point matrix structure is shown + * below: + *
+ *     typedef struct
+ *     {
+ *       uint16_t numRows;     // number of rows of the matrix.
+ *       uint16_t numCols;     // number of columns of the matrix.
+ *       float32_t *pData;     // points to the data of the matrix.
+ *     } riscv_matrix_instance_f32;
+ * 
+ * There are similar definitions for Q15 and Q31 data types. + * + * The structure specifies the size of the matrix and then points to + * an array of data. The array is of size numRows X numCols + * and the values are arranged in row order. That is, the + * matrix element (i, j) is stored at: + *
+ *     pData[i*numCols + j]
+ * 
+ * + * \par Init Functions + * There is an associated initialization function for each type of matrix + * data structure. + * The initialization function sets the values of the internal structure fields. + * Refer to \ref riscv_mat_init_f32(), \ref riscv_mat_init_q31() and \ref riscv_mat_init_q15() + * for floating-point, Q31 and Q15 types, respectively. + * + * \par + * Use of the initialization function is optional. However, if initialization function is used + * then the instance structure cannot be placed into a const data section. + * To place the instance structure in a const data + * section, manually initialize the data structure. For example: + *
+ * riscv_matrix_instance_f32 S = {nRows, nColumns, pData};
+ * riscv_matrix_instance_q31 S = {nRows, nColumns, pData};
+ * riscv_matrix_instance_q15 S = {nRows, nColumns, pData};
+ * 
+ * where nRows specifies the number of rows, nColumns + * specifies the number of columns, and pData points to the + * data array. + * + * \par Size Checking + * By default all of the matrix functions perform size checking on the input and + * output matrices. For example, the matrix addition function verifies that the + * two input matrices and the output matrix all have the same number of rows and + * columns. If the size check fails the functions return: + *
+ *     RISCV_MATH_SIZE_MISMATCH
+ * 
+ * Otherwise the functions return + *
+ *     RISCV_MATH_SUCCESS
+ * 
+ * There is some overhead associated with this matrix size checking. + * The matrix size checking is enabled via the \#define + *
+ *     RISCV_MATH_MATRIX_CHECK
+ * 
+ * within the library project settings. By default this macro is defined + * and size checking is enabled. By changing the project settings and + * undefining this macro size checking is eliminated and the functions + * run a bit faster. With size checking disabled the functions always + * return RISCV_MATH_SUCCESS. + */ + +/** + * @defgroup groupTransforms Transform Functions + */ + +/** + * @defgroup groupController Controller Functions + */ + +/** + * @defgroup groupStats Statistics Functions + */ + +/** + * @defgroup groupSupport Support Functions + */ + +/** + * @defgroup groupInterpolation Interpolation Functions + * These functions perform 1- and 2-dimensional interpolation of data. + * Linear interpolation is used for 1-dimensional data and + * bilinear interpolation is used for 2-dimensional data. + */ + +/** + * @defgroup groupExamples Examples + */ + +#ifndef _RISCV_MATH_H +#define _RISCV_MATH_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* Compiler specific diagnostic adjustment */ +#if defined ( __CC_ARM ) + +#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 ) + +#elif defined ( __GNUC__ ) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wsign-conversion" + #pragma GCC diagnostic ignored "-Wconversion" + #pragma GCC diagnostic ignored "-Wunused-parameter" + +#elif defined ( __ICCRISCV__ ) + +#elif defined ( __TI_RISCV__ ) + +#elif defined ( __CSMC__ ) + +#elif defined ( __TASKING__ ) + +#elif defined ( _MSC_VER ) + +#else + #error Unknown compiler +#endif + + +/* Included for instrinsics definitions */ +#if !defined ( _MSC_VER ) + +#define __NMSIS_GENERIC +#if (defined (__RISCV_FEATURE_DSP) && (__RISCV_FEATURE_DSP == 1)) + #define __DSP_PRESENT 1 +#endif +#include "nmsis_core.h" + + +#else +#include +#define __STATIC_FORCEINLINE static __forceinline +#define __ALIGNED(x) __declspec(align(x)) +#define LOW_OPTIMIZATION_ENTER +#define LOW_OPTIMIZATION_EXIT +#define IAR_ONLY_LOW_OPTIMIZATION_ENTER +#define IAR_ONLY_LOW_OPTIMIZATION_EXIT +#endif + +#include "string.h" +#include +#include "float.h" + +/* evaluate RISCV DSP feature */ +#if (defined (__RISCV_FEATURE_DSP) && (__RISCV_FEATURE_DSP == 1)) + #define RISCV_MATH_DSP +#endif + + /** + * @brief Macros required for reciprocal calculation in Normalized LMS + */ + +#define DELTA_Q31 (0x100) +#define DELTA_Q15 0x5 +#define INDEX_MASK 0x0000003F +#ifndef PI + #define PI 3.14159265358979f +#endif + + /** + * @brief Macros required for SINE and COSINE Fast math approximations + */ + +#define FAST_MATH_TABLE_SIZE 512 +#define FAST_MATH_Q31_SHIFT (32 - 10) +#define FAST_MATH_Q15_SHIFT (16 - 10) +#define CONTROLLER_Q31_SHIFT (32 - 9) +#define TABLE_SPACING_Q31 0x400000 +#define TABLE_SPACING_Q15 0x80 + + /** + * @brief Macros required for SINE and COSINE Controller functions + */ + /* 1.31(q31) Fixed value of 2/360 */ + /* -1 to +1 is divided into 360 values so total spacing is (2/360) */ +#define INPUT_SPACING 0xB60B61 + + + /** + * @brief Error status returned by some functions in the library. + */ + + typedef enum + { + RISCV_MATH_SUCCESS = 0, /**< No error */ + RISCV_MATH_ARGUMENT_ERROR = -1, /**< One or more arguments are incorrect */ + RISCV_MATH_LENGTH_ERROR = -2, /**< Length of data buffer is incorrect */ + RISCV_MATH_SIZE_MISMATCH = -3, /**< Size of matrices is not compatible with the operation */ + RISCV_MATH_NANINF = -4, /**< Not-a-number (NaN) or infinity is generated */ + RISCV_MATH_SINGULAR = -5, /**< Input matrix is singular and cannot be inverted */ + RISCV_MATH_TEST_FAILURE = -6 /**< Test Failed */ + } riscv_status; + + /** + * @brief 8-bit fractional data type in 1.7 format. + */ + typedef int8_t q7_t; + + /** + * @brief 16-bit fractional data type in 1.15 format. + */ + typedef int16_t q15_t; + + /** + * @brief 32-bit fractional data type in 1.31 format. + */ + typedef int32_t q31_t; + + /** + * @brief 64-bit fractional data type in 1.63 format. + */ + typedef int64_t q63_t; + + /** + * @brief 32-bit floating-point type definition. + */ + typedef float float32_t; + + /** + * @brief 64-bit floating-point type definition. + */ + typedef double float64_t; + + +/** + @brief definition to read/write two 16 bit values. + @deprecated + */ +#define __SIMD32_TYPE int32_t + +#define __SIMD32(addr) (*(__SIMD32_TYPE **) & (addr)) +#define __SIMD32_CONST(addr) ( (__SIMD32_TYPE * ) (addr)) +#define _SIMD32_OFFSET(addr) (*(__SIMD32_TYPE * ) (addr)) +#define __SIMD64(addr) (*( int64_t **) & (addr)) + +/* SIMD replacement */ + +/** + @brief Read 2 Q31 from Q31 pointer and increment pointer afterwards. + @param[in] pQ31 points to input value + @return Q63 value + */ +__STATIC_FORCEINLINE q63_t read_q31x2_ia ( + q31_t ** pQ31) +{ + q63_t val; +#ifndef RISCV_ALIGN_ACCESS +#if __RISCV_XLEN == 64 + val = __LD(*pQ31); +#else + val = *((q63_t *)*pQ31); +#endif /* __RISCV_XLEN == 64 */ +#else + memcpy((void *)(&val), (void *)(*pQ31), 8); +#endif + *pQ31 += 2; + return (val); +} + +/** + @brief Read 2 Q31 from Q31 pointer and decrement pointer afterwards. + @param[in] pQ31 points to input value + @return Q63 value + */ +__STATIC_FORCEINLINE q63_t read_q31x2_da ( + q31_t ** pQ31) +{ + q63_t val; +#ifndef RISCV_ALIGN_ACCESS +#if __RISCV_XLEN == 64 + val = __LD(*pQ31); +#else + val = *((q63_t *)*pQ31); +#endif /* __RISCV_XLEN == 64 */ +#else + memcpy((void *)(&val), (void *)(*pQ31), 8); +#endif + *pQ31 -= 2; + return (val); +} + +/** + @brief Read 2 Q31 from Q31 pointer. + @param[in] pQ31 points to input value + @return Q63 value + */ +__STATIC_FORCEINLINE q63_t read_q31x2 ( + q31_t * pQ31) +{ + q63_t val; +#ifndef RISCV_ALIGN_ACCESS +#if __RISCV_XLEN == 64 + val = __LD(pQ31); +#else + val = *((q63_t *)pQ31); +#endif /* __RISCV_XLEN == 64 */ +#else + memcpy((void *)(&val), (void *)(pQ31), 8); +#endif + return (val); +} + +/** + @brief Write 2 Q31 to Q31 pointer and increment pointer afterwards. + @param[in] pQ31 points to input value + @param[in] value Q63 value + @return none + */ +__STATIC_FORCEINLINE void write_q31x2_ia ( + q31_t ** pQ31, + q63_t value) +{ +#ifndef RISCV_ALIGN_ACCESS +#if __RISCV_XLEN == 64 + __SD(*pQ31, value); +#else + *((q63_t *)*pQ31) = value; +#endif /* __RISCV_XLEN == 64 */ +#else + memcpy((void *)(*pQ31), (void *)(&value), 8); +#endif + *pQ31 += 2; +} + +/** + @brief Write 2 Q31 to Q31 pointer. + @param[in] pQ31 points to input value + @param[in] value Q63 value + @return none + */ +__STATIC_FORCEINLINE void write_q31x2 ( + q31_t * pQ31, + q63_t value) +{ +#ifndef RISCV_ALIGN_ACCESS +#if __RISCV_XLEN == 64 + __SD(pQ31, value); +#else + *((q63_t *)pQ31) = value; +#endif /* __RISCV_XLEN == 64 */ +#else + memcpy((void *)(pQ31), (void *)(&value), 8); +#endif +} + +/** + @brief Read 2 Q15 from Q15 pointer. + @param[in] pQ15 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q15x2 ( + q15_t * pQ15) +{ + q31_t val; + +#ifndef RISCV_ALIGN_ACCESS + __ASM volatile ( + "lw %0, (%1)" + :"=r"(val) + :"r"(pQ15) + ); +#else + memcpy((void *)(&val), (void *)(pQ15), 4); +#endif + return (val); +} + +/** + @brief Read 2 Q15 from Q15 pointer and increment pointer afterwards. + @param[in] pQ15 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q15x2_ia ( + q15_t ** pQ15) +{ + q31_t val; + +#ifndef RISCV_ALIGN_ACCESS + __ASM volatile ( + "lw %0, (%1)" + :"=r"(val) + :"r"(*pQ15) + ); +#else + memcpy((void *)(&val), (void *)(*pQ15), 4); +#endif + *pQ15 += 2; + + return (val); +} + +/** + @brief Read 4 Q15 from Q15 pointer and increment pointer afterwards. + @param[in] pQ15 points to input value + @return Q63 value + */ +__STATIC_FORCEINLINE q63_t read_q15x4_ia ( + q15_t ** pQ15) +{ + q63_t val; +#ifndef RISCV_ALIGN_ACCESS + val = *((q63_t *)*pQ15); +#else + memcpy((void *)(&val), (void *)(*pQ15), 8); +#endif + *pQ15 += 4; + + return (val); +} + +/** + @brief Read 4 Q15 from Q15 pointer. + @param[in] pQ15 points to input value + @return Q63 value + */ +__STATIC_FORCEINLINE q63_t read_q15x4 ( + q15_t * pQ15) +{ + q63_t val; +#ifndef RISCV_ALIGN_ACCESS +#if __RISCV_XLEN == 64 + val = __LD(pQ15); +#else + val = *((q63_t *)pQ15); +#endif /* __RISCV_XLEN == 64 */ +#else + memcpy((void *)(&val), (void *)(pQ15), 8); +#endif + return (val); +} + +/** + @brief Read 2 Q15 from Q15 pointer and decrement pointer afterwards. + @param[in] pQ15 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q15x2_da ( + q15_t ** pQ15) +{ + q31_t val; + +#ifndef RISCV_ALIGN_ACCESS + __ASM volatile ( + "lw %0, (%1)" + :"=r"(val) + :"r"(*pQ15) + ); +#else + memcpy((void *)(&val), (void *)(*pQ15), 4); +#endif + *pQ15 -= 2; + + return (val); +} + +/** + @brief Read 4 Q15 from Q15 pointer and decrement pointer afterwards. + @param[in] pQ15 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q63_t read_q15x4_da ( + q15_t ** pQ15) +{ + q63_t val; +#ifndef RISCV_ALIGN_ACCESS + val = *((q63_t *)*pQ15); +#else + memcpy((void *)(&val), (void *)(*pQ15), 8); +#endif + *pQ15 -= 4; + + return (val); +} + +/** + @brief Write 2 Q15 to Q15 pointer and increment pointer afterwards. + @param[in] pQ15 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q15x2_ia ( + q15_t ** pQ15, + q31_t value) +{ +#ifndef RISCV_ALIGN_ACCESS + __ASM volatile ( + "sw %0, (%1)" + : + :"r"(value), "r"(*pQ15) + :"memory" + ); +#else + memcpy((void *)(*pQ15), (void *)(&value), 4); +#endif + *pQ15 += 2; +} + +/** + @brief Write 4 Q15 to Q15 pointer and increment pointer afterwards. + @param[in] pQ15 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q15x4_ia ( + q15_t ** pQ15, + q63_t value) +{ +#ifndef RISCV_ALIGN_ACCESS + *((q63_t *)*pQ15) = value; +#else + memcpy((void *)(*pQ15), (void *)(&value), 8); +#endif + *pQ15 += 4; +} + +/** + @brief Write 4 Q15 to Q15 pointer and decrement pointer afterwards. + @param[in] pQ15 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q15x4_da ( + q15_t ** pQ15, + q63_t value) +{ +#ifndef RISCV_ALIGN_ACCESS + *((q63_t *)*pQ15) = value; +#else + memcpy((void *)(*pQ15), (void *)(&value), 8); +#endif + *pQ15 -= 4; +} + +/** + @brief Write 2 Q15 to Q15 pointer. + @param[in] pQ15 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q15x2 ( + q15_t * pQ15, + q31_t value) +{ +#ifndef RISCV_ALIGN_ACCESS + __ASM volatile ( + "sw %0, (%1)" + : + :"r"(value), "r"(pQ15) + :"memory" + ); +#else + memcpy((void *)(pQ15), (void *)(&value), 4); +#endif + +} + +/** + @brief Write 4 Q15 to Q15 pointer. + @param[in] pQ15 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q15x4 ( + q15_t * pQ15, + q63_t value) +{ +#ifndef RISCV_ALIGN_ACCESS + *((q63_t *)pQ15) = value; +#else + memcpy((void *)(pQ15), (void *)(&value), 8); +#endif +} + +/** + @brief Read 8 Q7 from Q7 pointer and increment pointer afterwards. + @param[in] pQ7 points to input value + @return Q63 value + */ +__STATIC_FORCEINLINE q63_t read_q7x8_ia ( + q7_t ** pQ7) +{ + q63_t val; +#ifndef RISCV_ALIGN_ACCESS + val = *((q63_t *)*pQ7); +#else + memcpy((void *)(&val), (void *)(*pQ7), 8); +#endif + *pQ7 += 8; + + return val; +} + +/** + @brief Read 8 Q7 from Q7 pointer and decrement pointer afterwards. + @param[in] pQ7 points to input value + @return Q63 value + */ +__STATIC_FORCEINLINE q63_t read_q7x8_da ( + q7_t ** pQ7) +{ + q63_t val; +#ifndef RISCV_ALIGN_ACCESS + val = *((q63_t *)*pQ7); +#else + memcpy((void *)(&val), (void *)(*pQ7), 8); +#endif + *pQ7 -= 8; + return val; +} + +/** + @brief Read 4 Q7 from Q7 pointer and increment pointer afterwards. + @param[in] pQ7 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q7x4_ia ( + q7_t ** pQ7) +{ + q31_t val; + +#ifndef RISCV_ALIGN_ACCESS + __ASM volatile ( + "lw %0, (%1)" + :"=r"(val) + :"r"(*pQ7) + ); +#else + memcpy((void *)(&val), (void *)(*pQ7), 4); +#endif + *pQ7 += 4; + + return (val); +} + +/** + @brief Read 4 Q7 from Q7 pointer and decrement pointer afterwards. + @param[in] pQ7 points to input value + @return Q31 value + */ +__STATIC_FORCEINLINE q31_t read_q7x4_da ( + q7_t ** pQ7) +{ + q31_t val; + +#ifndef RISCV_ALIGN_ACCESS + __ASM volatile ( + "lw %0, (%1)" + :"=r"(val) + :"r"(*pQ7) + ); +#else + memcpy((void *)(&val), (void *)(*pQ7), 4); +#endif + *pQ7 -= 4; + + return (val); +} + +/** + @brief Write 8 Q7 to Q7 pointer and increment pointer afterwards. + @param[in] pQ7 points to input value + @param[in] value Q63 value + @return none + */ +__STATIC_FORCEINLINE void write_q7x8_ia ( + q7_t ** pQ7, + q63_t value) +{ +#ifndef RISCV_ALIGN_ACCESS + *((q63_t *)*pQ7) = value; +#else + memcpy((void *)(*pQ7), (void *)(&value), 8); +#endif + *pQ7 += 8; +} + +/** + @brief Write 4 Q7 to Q7 pointer and increment pointer afterwards. + @param[in] pQ7 points to input value + @param[in] value Q31 value + @return none + */ +__STATIC_FORCEINLINE void write_q7x4_ia ( + q7_t ** pQ7, + q31_t value) +{ + q31_t val = value; + +#ifndef RISCV_ALIGN_ACCESS + __ASM volatile ( + "sw %0, (%1)" + : + :"r"(value), "r"(*pQ7) + :"memory" + ); +#else + memcpy((void *)(*pQ7), (void *)(&value), 4); +#endif + *pQ7 += 4; +} + +/** +* @brief definition to pack four 8 bit values. +*/ +#define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) << 0) & (int32_t)0x000000FF) | \ + (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \ + (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \ + (((int32_t)(v3) << 24) & (int32_t)0xFF000000) ) + + + + /** + * @brief Clips Q63 to Q31 values. + */ + __STATIC_FORCEINLINE q31_t clip_q63_to_q31( + q63_t x) + { + return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ? + ((0x7FFFFFFF ^ ((q31_t) (x >> 63)))) : (q31_t) x; + } + + /** + * @brief Clips Q63 to Q15 values. + */ + __STATIC_FORCEINLINE q15_t clip_q63_to_q15( + q63_t x) + { + return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ? + ((0x7FFF ^ ((q15_t) (x >> 63)))) : (q15_t) (x >> 15); + } + + /** + * @brief Clips Q31 to Q7 values. + */ + __STATIC_FORCEINLINE q7_t clip_q31_to_q7( + q31_t x) + { + return ((q31_t) (x >> 24) != ((q31_t) x >> 23)) ? + ((0x7F ^ ((q7_t) (x >> 31)))) : (q7_t) x; + } + + /** + * @brief Clips Q31 to Q15 values. + */ + __STATIC_FORCEINLINE q15_t clip_q31_to_q15( + q31_t x) + { + return ((q31_t) (x >> 16) != ((q31_t) x >> 15)) ? + ((0x7FFF ^ ((q15_t) (x >> 31)))) : (q15_t) x; + } + + /** + * @brief Multiplies 32 X 64 and returns 32 bit result in 2.30 format. + */ + __STATIC_FORCEINLINE q63_t mult32x64( + q63_t x, + q31_t y) + { + return ((((q63_t) (x & 0x00000000FFFFFFFF) * y) >> 32) + + (((q63_t) (x >> 32) * y) ) ); + } + + /** + * @brief Function to Calculates 1/in (reciprocal) value of Q31 Data type. + */ + __STATIC_FORCEINLINE uint32_t riscv_recip_q31( + q31_t in, + q31_t * dst, + const q31_t * pRecipTable) + { + q31_t out; + uint32_t tempVal; + uint32_t index, i; + uint32_t signBits; + + if (in > 0) + { + signBits = ((uint32_t) (__CLZ( in) - 1)); + } + else + { + signBits = ((uint32_t) (__CLZ(-in) - 1)); + } + + /* Convert input sample to 1.31 format */ + in = (in << signBits); + + /* calculation of index for initial approximated Val */ + index = (uint32_t)(in >> 24); + index = (index & INDEX_MASK); + + /* 1.31 with exp 1 */ + out = pRecipTable[index]; + + /* calculation of reciprocal value */ + /* running approximation for two iterations */ + for (i = 0U; i < 2U; i++) + { + tempVal = (uint32_t) (((q63_t) in * out) >> 31); + tempVal = 0x7FFFFFFFu - tempVal; + /* 1.31 with exp 1 */ + /* out = (q31_t) (((q63_t) out * tempVal) >> 30); */ + out = clip_q63_to_q31(((q63_t) out * tempVal) >> 30); + } + + /* write output */ + *dst = out; + + /* return num of signbits of out = 1/in value */ + return (signBits + 1U); + } + + + /** + * @brief Function to Calculates 1/in (reciprocal) value of Q15 Data type. + */ + __STATIC_FORCEINLINE uint32_t riscv_recip_q15( + q15_t in, + q15_t * dst, + const q15_t * pRecipTable) + { + q15_t out = 0; + uint32_t tempVal = 0; + uint32_t index = 0, i = 0; + uint32_t signBits = 0; + + if (in > 0) + { + signBits = ((uint32_t)(__CLZ( in) - 17)); + } + else + { + signBits = ((uint32_t)(__CLZ(-in) - 17)); + } + + /* Convert input sample to 1.15 format */ + in = (in << signBits); + + /* calculation of index for initial approximated Val */ + index = (uint32_t)(in >> 8); + index = (index & INDEX_MASK); + + /* 1.15 with exp 1 */ + out = pRecipTable[index]; + + /* calculation of reciprocal value */ + /* running approximation for two iterations */ + for (i = 0U; i < 2U; i++) + { + tempVal = (uint32_t) (((q31_t) in * out) >> 15); + tempVal = 0x7FFFu - tempVal; + /* 1.15 with exp 1 */ + out = (q15_t) (((q31_t) out * tempVal) >> 14); + /* out = clip_q31_to_q15(((q31_t) out * tempVal) >> 14); */ + } + + /* write output */ + *dst = out; + + /* return num of signbits of out = 1/in value */ + return (signBits + 1); + } + + +/* + * @brief C custom defined intrinsic functions + */ +#if !defined (RISCV_MATH_DSP) + + /* + * @brief C custom defined QADD8 + */ + __STATIC_FORCEINLINE uint32_t __QADD8( + uint32_t x, + uint32_t y) + { + q31_t r, s, t, u; + + r = __SSAT(((((q31_t)x << 24) >> 24) + (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF; + s = __SSAT(((((q31_t)x << 16) >> 24) + (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF; + t = __SSAT(((((q31_t)x << 8) >> 24) + (((q31_t)y << 8) >> 24)), 8) & (int32_t)0x000000FF; + u = __SSAT(((((q31_t)x ) >> 24) + (((q31_t)y ) >> 24)), 8) & (int32_t)0x000000FF; + + return ((uint32_t)((u << 24) | (t << 16) | (s << 8) | (r ))); + } + + + /* + * @brief C custom defined QSUB8 + */ + __STATIC_FORCEINLINE uint32_t __QSUB8( + uint32_t x, + uint32_t y) + { + q31_t r, s, t, u; + + r = __SSAT(((((q31_t)x << 24) >> 24) - (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF; + s = __SSAT(((((q31_t)x << 16) >> 24) - (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF; + t = __SSAT(((((q31_t)x << 8) >> 24) - (((q31_t)y << 8) >> 24)), 8) & (int32_t)0x000000FF; + u = __SSAT(((((q31_t)x ) >> 24) - (((q31_t)y ) >> 24)), 8) & (int32_t)0x000000FF; + + return ((uint32_t)((u << 24) | (t << 16) | (s << 8) | (r ))); + } + + + /* + * @brief C custom defined QADD16 + */ + __STATIC_FORCEINLINE uint32_t __QADD16( + uint32_t x, + uint32_t y) + { +/* q31_t r, s; without initialisation 'riscv_offset_q15 test' fails but 'intrinsic' tests pass! for armCC */ + q31_t r = 0, s = 0; + + r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; + s = __SSAT(((((q31_t)x ) >> 16) + (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SHADD16 + */ + __STATIC_FORCEINLINE uint32_t __SHADD16( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = (((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF; + s = (((((q31_t)x ) >> 16) + (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined QSUB16 + */ + __STATIC_FORCEINLINE uint32_t __QSUB16( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; + s = __SSAT(((((q31_t)x ) >> 16) - (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SHSUB16 + */ + __STATIC_FORCEINLINE uint32_t __SHSUB16( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = (((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF; + s = (((((q31_t)x ) >> 16) - (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined QASX + */ + __STATIC_FORCEINLINE uint32_t __QASX( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF; + s = __SSAT(((((q31_t)x ) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SHASX + */ + __STATIC_FORCEINLINE uint32_t __SHASX( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = (((((q31_t)x << 16) >> 16) - (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF; + s = (((((q31_t)x ) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined QSAX + */ + __STATIC_FORCEINLINE uint32_t __QSAX( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y ) >> 16)), 16) & (int32_t)0x0000FFFF; + s = __SSAT(((((q31_t)x ) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SHSAX + */ + __STATIC_FORCEINLINE uint32_t __SHSAX( + uint32_t x, + uint32_t y) + { + q31_t r, s; + + r = (((((q31_t)x << 16) >> 16) + (((q31_t)y ) >> 16)) >> 1) & (int32_t)0x0000FFFF; + s = (((((q31_t)x ) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF; + + return ((uint32_t)((s << 16) | (r ))); + } + + + /* + * @brief C custom defined SMUSDX + */ + __STATIC_FORCEINLINE uint32_t __SMUSDX( + uint32_t x, + uint32_t y) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) - + ((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) )); + } + + /* + * @brief C custom defined SMUADX + */ + __STATIC_FORCEINLINE uint32_t __SMUADX( + uint32_t x, + uint32_t y) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) + + ((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) )); + } + + + /* + * @brief C custom defined QADD + */ + __STATIC_FORCEINLINE int32_t __QADD( + int32_t x, + int32_t y) + { + return ((int32_t)(clip_q63_to_q31((q63_t)x + (q31_t)y))); + } + + + /* + * @brief C custom defined QSUB + */ + __STATIC_FORCEINLINE int32_t __QSUB( + int32_t x, + int32_t y) + { + return ((int32_t)(clip_q63_to_q31((q63_t)x - (q31_t)y))); + } + + + /* + * @brief C custom defined SMLAD + */ + __STATIC_FORCEINLINE uint32_t __SMLAD( + uint32_t x, + uint32_t y, + uint32_t sum) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) + + ((((q31_t)x ) >> 16) * (((q31_t)y ) >> 16)) + + ( ((q31_t)sum ) ) )); + } + + + /* + * @brief C custom defined SMLADX + */ + __STATIC_FORCEINLINE uint32_t __SMLADX( + uint32_t x, + uint32_t y, + uint32_t sum) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) + + ((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) + + ( ((q31_t)sum ) ) )); + } + + + /* + * @brief C custom defined SMLSDX + */ + __STATIC_FORCEINLINE uint32_t __SMLSDX( + uint32_t x, + uint32_t y, + uint32_t sum) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) - + ((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) + + ( ((q31_t)sum ) ) )); + } + + + /* + * @brief C custom defined SMLALD + */ + __STATIC_FORCEINLINE uint64_t __SMLALD( + uint32_t x, + uint32_t y, + uint64_t sum) + { +/* return (sum + ((q15_t) (x >> 16) * (q15_t) (y >> 16)) + ((q15_t) x * (q15_t) y)); */ + return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) + + ((((q31_t)x ) >> 16) * (((q31_t)y ) >> 16)) + + ( ((q63_t)sum ) ) )); + } + + + /* + * @brief C custom defined SMLALDX + */ + __STATIC_FORCEINLINE uint64_t __SMLALDX( + uint32_t x, + uint32_t y, + uint64_t sum) + { +/* return (sum + ((q15_t) (x >> 16) * (q15_t) y)) + ((q15_t) x * (q15_t) (y >> 16)); */ + return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y ) >> 16)) + + ((((q31_t)x ) >> 16) * (((q31_t)y << 16) >> 16)) + + ( ((q63_t)sum ) ) )); + } + + + /* + * @brief C custom defined SMUAD + */ + __STATIC_FORCEINLINE uint32_t __SMUAD( + uint32_t x, + uint32_t y) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) + + ((((q31_t)x ) >> 16) * (((q31_t)y ) >> 16)) )); + } + + + /* + * @brief C custom defined SMUSD + */ + __STATIC_FORCEINLINE uint32_t __SMUSD( + uint32_t x, + uint32_t y) + { + return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) - + ((((q31_t)x ) >> 16) * (((q31_t)y ) >> 16)) )); + } + + + /* + * @brief C custom defined SXTB16 + */ + __STATIC_FORCEINLINE uint32_t __SXTB16( + uint32_t x) + { + return ((uint32_t)(((((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF) | + ((((q31_t)x << 8) >> 8) & (q31_t)0xFFFF0000) )); + } + + /* + * @brief C custom defined SMMLA + */ + __STATIC_FORCEINLINE int32_t __SMMLA( + int32_t x, + int32_t y, + int32_t sum) + { + return (sum + (int32_t) (((int64_t) x * y) >> 32)); + } + +#endif /* !defined (RISCV_MATH_DSP) */ + + + /** + * @brief Instance structure for the Q7 FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. */ + q7_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const q7_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + } riscv_fir_instance_q7; + + /** + * @brief Instance structure for the Q15 FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. */ + q15_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + } riscv_fir_instance_q15; + + /** + * @brief Instance structure for the Q31 FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. */ + q31_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + } riscv_fir_instance_q31; + + /** + * @brief Instance structure for the floating-point FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of filter coefficients in the filter. */ + float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + } riscv_fir_instance_f32; + + /** + * @brief Processing function for the Q7 FIR filter. + * @param[in] S points to an instance of the Q7 FIR filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_q7( + const riscv_fir_instance_q7 * S, + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q7 FIR filter. + * @param[in,out] S points to an instance of the Q7 FIR structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed. + */ + void riscv_fir_init_q7( + riscv_fir_instance_q7 * S, + uint16_t numTaps, + const q7_t * pCoeffs, + q7_t * pState, + uint32_t blockSize); + + /** + * @brief Processing function for the Q15 FIR filter. + * @param[in] S points to an instance of the Q15 FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_q15( + const riscv_fir_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the fast Q15 FIR filter (fast version). + * @param[in] S points to an instance of the Q15 FIR filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_fast_q15( + const riscv_fir_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q15 FIR filter. + * @param[in,out] S points to an instance of the Q15 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. Must be even and greater than or equal to 4. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed at a time. + * @return The function returns either + * RISCV_MATH_SUCCESS if initialization was successful or + * RISCV_MATH_ARGUMENT_ERROR if numTaps is not a supported value. + */ + riscv_status riscv_fir_init_q15( + riscv_fir_instance_q15 * S, + uint16_t numTaps, + const q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize); + + /** + * @brief Processing function for the Q31 FIR filter. + * @param[in] S points to an instance of the Q31 FIR filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_q31( + const riscv_fir_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the fast Q31 FIR filter (fast version). + * @param[in] S points to an instance of the Q31 FIR filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_fast_q31( + const riscv_fir_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q31 FIR filter. + * @param[in,out] S points to an instance of the Q31 FIR structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed at a time. + */ + void riscv_fir_init_q31( + riscv_fir_instance_q31 * S, + uint16_t numTaps, + const q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize); + + /** + * @brief Processing function for the floating-point FIR filter. + * @param[in] S points to an instance of the floating-point FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_f32( + const riscv_fir_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the floating-point FIR filter. + * @param[in,out] S points to an instance of the floating-point FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of samples that are processed at a time. + */ + void riscv_fir_init_f32( + riscv_fir_instance_f32 * S, + uint16_t numTaps, + const float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize); + + /** + * @brief Instance structure for the Q15 Biquad cascade filter. + */ + typedef struct + { + int8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + q15_t *pState; /**< Points to the array of state coefficients. The array is of length 4*numStages. */ + const q15_t *pCoeffs; /**< Points to the array of coefficients. The array is of length 5*numStages. */ + int8_t postShift; /**< Additional shift, in bits, applied to each output sample. */ + } riscv_biquad_casd_df1_inst_q15; + + /** + * @brief Instance structure for the Q31 Biquad cascade filter. + */ + typedef struct + { + uint32_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + q31_t *pState; /**< Points to the array of state coefficients. The array is of length 4*numStages. */ + const q31_t *pCoeffs; /**< Points to the array of coefficients. The array is of length 5*numStages. */ + uint8_t postShift; /**< Additional shift, in bits, applied to each output sample. */ + } riscv_biquad_casd_df1_inst_q31; + + /** + * @brief Instance structure for the floating-point Biquad cascade filter. + */ + typedef struct + { + uint32_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float32_t *pState; /**< Points to the array of state coefficients. The array is of length 4*numStages. */ + const float32_t *pCoeffs; /**< Points to the array of coefficients. The array is of length 5*numStages. */ + } riscv_biquad_casd_df1_inst_f32; + + /** + * @brief Processing function for the Q15 Biquad cascade filter. + * @param[in] S points to an instance of the Q15 Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cascade_df1_q15( + const riscv_biquad_casd_df1_inst_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q15 Biquad cascade filter. + * @param[in,out] S points to an instance of the Q15 Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] postShift Shift to be applied to the output. Varies according to the coefficients format + */ + void riscv_biquad_cascade_df1_init_q15( + riscv_biquad_casd_df1_inst_q15 * S, + uint8_t numStages, + const q15_t * pCoeffs, + q15_t * pState, + int8_t postShift); + + /** + * @brief Fast but less precise processing function for the Q15 Biquad cascade filter for RISC-V Core with DSP enabled. + * @param[in] S points to an instance of the Q15 Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cascade_df1_fast_q15( + const riscv_biquad_casd_df1_inst_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the Q31 Biquad cascade filter + * @param[in] S points to an instance of the Q31 Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cascade_df1_q31( + const riscv_biquad_casd_df1_inst_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Fast but less precise processing function for the Q31 Biquad cascade filter for RISC-V Core with DSP enabled. + * @param[in] S points to an instance of the Q31 Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cascade_df1_fast_q31( + const riscv_biquad_casd_df1_inst_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the Q31 Biquad cascade filter. + * @param[in,out] S points to an instance of the Q31 Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] postShift Shift to be applied to the output. Varies according to the coefficients format + */ + void riscv_biquad_cascade_df1_init_q31( + riscv_biquad_casd_df1_inst_q31 * S, + uint8_t numStages, + const q31_t * pCoeffs, + q31_t * pState, + int8_t postShift); + + /** + * @brief Processing function for the floating-point Biquad cascade filter. + * @param[in] S points to an instance of the floating-point Biquad cascade structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cascade_df1_f32( + const riscv_biquad_casd_df1_inst_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + /** + * @brief Initialization function for the floating-point Biquad cascade filter. + * @param[in,out] S points to an instance of the floating-point Biquad cascade structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void riscv_biquad_cascade_df1_init_f32( + riscv_biquad_casd_df1_inst_f32 * S, + uint8_t numStages, + const float32_t * pCoeffs, + float32_t * pState); + + /** + * @brief Instance structure for the floating-point matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + float32_t *pData; /**< points to the data of the matrix. */ + } riscv_matrix_instance_f32; + + + /** + * @brief Instance structure for the floating-point matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + float64_t *pData; /**< points to the data of the matrix. */ + } riscv_matrix_instance_f64; + + /** + * @brief Instance structure for the Q15 matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + q15_t *pData; /**< points to the data of the matrix. */ + } riscv_matrix_instance_q15; + + /** + * @brief Instance structure for the Q31 matrix structure. + */ + typedef struct + { + uint16_t numRows; /**< number of rows of the matrix. */ + uint16_t numCols; /**< number of columns of the matrix. */ + q31_t *pData; /**< points to the data of the matrix. */ + } riscv_matrix_instance_q31; + + /** + * @brief Floating-point matrix addition. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_add_f32( + const riscv_matrix_instance_f32 * pSrcA, + const riscv_matrix_instance_f32 * pSrcB, + riscv_matrix_instance_f32 * pDst); + + /** + * @brief Q15 matrix addition. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_add_q15( + const riscv_matrix_instance_q15 * pSrcA, + const riscv_matrix_instance_q15 * pSrcB, + riscv_matrix_instance_q15 * pDst); + + /** + * @brief Q31 matrix addition. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_add_q31( + const riscv_matrix_instance_q31 * pSrcA, + const riscv_matrix_instance_q31 * pSrcB, + riscv_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point, complex, matrix multiplication. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_cmplx_mult_f32( + const riscv_matrix_instance_f32 * pSrcA, + const riscv_matrix_instance_f32 * pSrcB, + riscv_matrix_instance_f32 * pDst); + + /** + * @brief Q15, complex, matrix multiplication. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_cmplx_mult_q15( + const riscv_matrix_instance_q15 * pSrcA, + const riscv_matrix_instance_q15 * pSrcB, + riscv_matrix_instance_q15 * pDst, + q15_t * pScratch); + + /** + * @brief Q31, complex, matrix multiplication. + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_cmplx_mult_q31( + const riscv_matrix_instance_q31 * pSrcA, + const riscv_matrix_instance_q31 * pSrcB, + riscv_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either RISCV_MATH_SIZE_MISMATCH + * or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_trans_f32( + const riscv_matrix_instance_f32 * pSrc, + riscv_matrix_instance_f32 * pDst); + + /** + * @brief Q15 matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either RISCV_MATH_SIZE_MISMATCH + * or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_trans_q15( + const riscv_matrix_instance_q15 * pSrc, + riscv_matrix_instance_q15 * pDst); + + /** + * @brief Q31 matrix transpose. + * @param[in] pSrc points to the input matrix + * @param[out] pDst points to the output matrix + * @return The function returns either RISCV_MATH_SIZE_MISMATCH + * or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_trans_q31( + const riscv_matrix_instance_q31 * pSrc, + riscv_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_mult_f32( + const riscv_matrix_instance_f32 * pSrcA, + const riscv_matrix_instance_f32 * pSrcB, + riscv_matrix_instance_f32 * pDst); + + /** + * @brief Q15 matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @param[in] pState points to the array for storing intermediate results + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_mult_q15( + const riscv_matrix_instance_q15 * pSrcA, + const riscv_matrix_instance_q15 * pSrcB, + riscv_matrix_instance_q15 * pDst, + q15_t * pState); + + /** + * @brief Q15 matrix multiplication (fast variant) for RISC-V Core with DSP enabled + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @param[in] pState points to the array for storing intermediate results + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_mult_fast_q15( + const riscv_matrix_instance_q15 * pSrcA, + const riscv_matrix_instance_q15 * pSrcB, + riscv_matrix_instance_q15 * pDst, + q15_t * pState); + + /** + * @brief Q31 matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_mult_q31( + const riscv_matrix_instance_q31 * pSrcA, + const riscv_matrix_instance_q31 * pSrcB, + riscv_matrix_instance_q31 * pDst); + + /** + * @brief Q31 matrix multiplication (fast variant) for RISC-V Core with DSP enabled + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_mult_fast_q31( + const riscv_matrix_instance_q31 * pSrcA, + const riscv_matrix_instance_q31 * pSrcB, + riscv_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point matrix subtraction + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_sub_f32( + const riscv_matrix_instance_f32 * pSrcA, + const riscv_matrix_instance_f32 * pSrcB, + riscv_matrix_instance_f32 * pDst); + + /** + * @brief Q15 matrix subtraction + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_sub_q15( + const riscv_matrix_instance_q15 * pSrcA, + const riscv_matrix_instance_q15 * pSrcB, + riscv_matrix_instance_q15 * pDst); + + /** + * @brief Q31 matrix subtraction + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_sub_q31( + const riscv_matrix_instance_q31 * pSrcA, + const riscv_matrix_instance_q31 * pSrcB, + riscv_matrix_instance_q31 * pDst); + + /** + * @brief Floating-point matrix scaling. + * @param[in] pSrc points to the input matrix + * @param[in] scale scale factor + * @param[out] pDst points to the output matrix + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_scale_f32( + const riscv_matrix_instance_f32 * pSrc, + float32_t scale, + riscv_matrix_instance_f32 * pDst); + + /** + * @brief Q15 matrix scaling. + * @param[in] pSrc points to input matrix + * @param[in] scaleFract fractional portion of the scale factor + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to output matrix + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_scale_q15( + const riscv_matrix_instance_q15 * pSrc, + q15_t scaleFract, + int32_t shift, + riscv_matrix_instance_q15 * pDst); + + /** + * @brief Q31 matrix scaling. + * @param[in] pSrc points to input matrix + * @param[in] scaleFract fractional portion of the scale factor + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to output matrix structure + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + */ +riscv_status riscv_mat_scale_q31( + const riscv_matrix_instance_q31 * pSrc, + q31_t scaleFract, + int32_t shift, + riscv_matrix_instance_q31 * pDst); + + /** + * @brief Q31 matrix initialization. + * @param[in,out] S points to an instance of the floating-point matrix structure. + * @param[in] nRows number of rows in the matrix. + * @param[in] nColumns number of columns in the matrix. + * @param[in] pData points to the matrix data array. + */ +void riscv_mat_init_q31( + riscv_matrix_instance_q31 * S, + uint16_t nRows, + uint16_t nColumns, + q31_t * pData); + + /** + * @brief Q15 matrix initialization. + * @param[in,out] S points to an instance of the floating-point matrix structure. + * @param[in] nRows number of rows in the matrix. + * @param[in] nColumns number of columns in the matrix. + * @param[in] pData points to the matrix data array. + */ +void riscv_mat_init_q15( + riscv_matrix_instance_q15 * S, + uint16_t nRows, + uint16_t nColumns, + q15_t * pData); + + /** + * @brief Floating-point matrix initialization. + * @param[in,out] S points to an instance of the floating-point matrix structure. + * @param[in] nRows number of rows in the matrix. + * @param[in] nColumns number of columns in the matrix. + * @param[in] pData points to the matrix data array. + */ +void riscv_mat_init_f32( + riscv_matrix_instance_f32 * S, + uint16_t nRows, + uint16_t nColumns, + float32_t * pData); + + + /** + * @brief Instance structure for the Q15 PID Control. + */ + typedef struct + { + q15_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */ +#if !defined (RISCV_MATH_DSP) + q15_t A1; + q15_t A2; +#else + q31_t A1; /**< The derived gain A1 = -Kp - 2Kd | Kd.*/ +#endif + q15_t state[3]; /**< The state array of length 3. */ + q15_t Kp; /**< The proportional gain. */ + q15_t Ki; /**< The integral gain. */ + q15_t Kd; /**< The derivative gain. */ + } riscv_pid_instance_q15; + + /** + * @brief Instance structure for the Q31 PID Control. + */ + typedef struct + { + q31_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */ + q31_t A1; /**< The derived gain, A1 = -Kp - 2Kd. */ + q31_t A2; /**< The derived gain, A2 = Kd . */ + q31_t state[3]; /**< The state array of length 3. */ + q31_t Kp; /**< The proportional gain. */ + q31_t Ki; /**< The integral gain. */ + q31_t Kd; /**< The derivative gain. */ + } riscv_pid_instance_q31; + + /** + * @brief Instance structure for the floating-point PID Control. + */ + typedef struct + { + float32_t A0; /**< The derived gain, A0 = Kp + Ki + Kd . */ + float32_t A1; /**< The derived gain, A1 = -Kp - 2Kd. */ + float32_t A2; /**< The derived gain, A2 = Kd . */ + float32_t state[3]; /**< The state array of length 3. */ + float32_t Kp; /**< The proportional gain. */ + float32_t Ki; /**< The integral gain. */ + float32_t Kd; /**< The derivative gain. */ + } riscv_pid_instance_f32; + + + + /** + * @brief Initialization function for the floating-point PID Control. + * @param[in,out] S points to an instance of the PID structure. + * @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state. + */ + void riscv_pid_init_f32( + riscv_pid_instance_f32 * S, + int32_t resetStateFlag); + + + /** + * @brief Reset function for the floating-point PID Control. + * @param[in,out] S is an instance of the floating-point PID Control structure + */ + void riscv_pid_reset_f32( + riscv_pid_instance_f32 * S); + + + /** + * @brief Initialization function for the Q31 PID Control. + * @param[in,out] S points to an instance of the Q15 PID structure. + * @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state. + */ + void riscv_pid_init_q31( + riscv_pid_instance_q31 * S, + int32_t resetStateFlag); + + + /** + * @brief Reset function for the Q31 PID Control. + * @param[in,out] S points to an instance of the Q31 PID Control structure + */ + + void riscv_pid_reset_q31( + riscv_pid_instance_q31 * S); + + + /** + * @brief Initialization function for the Q15 PID Control. + * @param[in,out] S points to an instance of the Q15 PID structure. + * @param[in] resetStateFlag flag to reset the state. 0 = no change in state 1 = reset the state. + */ + void riscv_pid_init_q15( + riscv_pid_instance_q15 * S, + int32_t resetStateFlag); + + + /** + * @brief Reset function for the Q15 PID Control. + * @param[in,out] S points to an instance of the q15 PID Control structure + */ + void riscv_pid_reset_q15( + riscv_pid_instance_q15 * S); + + + /** + * @brief Instance structure for the floating-point Linear Interpolate function. + */ + typedef struct + { + uint32_t nValues; /**< nValues */ + float32_t x1; /**< x1 */ + float32_t xSpacing; /**< xSpacing */ + float32_t *pYData; /**< pointer to the table of Y values */ + } riscv_linear_interp_instance_f32; + + /** + * @brief Instance structure for the floating-point bilinear interpolation function. + */ + typedef struct + { + uint16_t numRows; /**< number of rows in the data table. */ + uint16_t numCols; /**< number of columns in the data table. */ + float32_t *pData; /**< points to the data table. */ + } riscv_bilinear_interp_instance_f32; + + /** + * @brief Instance structure for the Q31 bilinear interpolation function. + */ + typedef struct + { + uint16_t numRows; /**< number of rows in the data table. */ + uint16_t numCols; /**< number of columns in the data table. */ + q31_t *pData; /**< points to the data table. */ + } riscv_bilinear_interp_instance_q31; + + /** + * @brief Instance structure for the Q15 bilinear interpolation function. + */ + typedef struct + { + uint16_t numRows; /**< number of rows in the data table. */ + uint16_t numCols; /**< number of columns in the data table. */ + q15_t *pData; /**< points to the data table. */ + } riscv_bilinear_interp_instance_q15; + + /** + * @brief Instance structure for the Q15 bilinear interpolation function. + */ + typedef struct + { + uint16_t numRows; /**< number of rows in the data table. */ + uint16_t numCols; /**< number of columns in the data table. */ + q7_t *pData; /**< points to the data table. */ + } riscv_bilinear_interp_instance_q7; + + + /** + * @brief Q7 vector multiplication. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_mult_q7( + const q7_t * pSrcA, + const q7_t * pSrcB, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q15 vector multiplication. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_mult_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q31 vector multiplication. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_mult_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Floating-point vector multiplication. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_mult_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q15 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const q15_t *pTwiddle; /**< points to the Sin twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + } riscv_cfft_radix2_instance_q15; + +/* Deprecated */ + riscv_status riscv_cfft_radix2_init_q15( + riscv_cfft_radix2_instance_q15 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void riscv_cfft_radix2_q15( + const riscv_cfft_radix2_instance_q15 * S, + q15_t * pSrc); + + + /** + * @brief Instance structure for the Q15 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const q15_t *pTwiddle; /**< points to the twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + } riscv_cfft_radix4_instance_q15; + +/* Deprecated */ + riscv_status riscv_cfft_radix4_init_q15( + riscv_cfft_radix4_instance_q15 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void riscv_cfft_radix4_q15( + const riscv_cfft_radix4_instance_q15 * S, + q15_t * pSrc); + + /** + * @brief Instance structure for the Radix-2 Q31 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const q31_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + } riscv_cfft_radix2_instance_q31; + +/* Deprecated */ + riscv_status riscv_cfft_radix2_init_q31( + riscv_cfft_radix2_instance_q31 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void riscv_cfft_radix2_q31( + const riscv_cfft_radix2_instance_q31 * S, + q31_t * pSrc); + + /** + * @brief Instance structure for the Q31 CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const q31_t *pTwiddle; /**< points to the twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + } riscv_cfft_radix4_instance_q31; + +/* Deprecated */ + void riscv_cfft_radix4_q31( + const riscv_cfft_radix4_instance_q31 * S, + q31_t * pSrc); + +/* Deprecated */ + riscv_status riscv_cfft_radix4_init_q31( + riscv_cfft_radix4_instance_q31 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const float32_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + float32_t onebyfftLen; /**< value of 1/fftLen. */ + } riscv_cfft_radix2_instance_f32; + +/* Deprecated */ + riscv_status riscv_cfft_radix2_init_f32( + riscv_cfft_radix2_instance_f32 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void riscv_cfft_radix2_f32( + const riscv_cfft_radix2_instance_f32 * S, + float32_t * pSrc); + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */ + uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */ + const float32_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */ + float32_t onebyfftLen; /**< value of 1/fftLen. */ + } riscv_cfft_radix4_instance_f32; + +/* Deprecated */ + riscv_status riscv_cfft_radix4_init_f32( + riscv_cfft_radix4_instance_f32 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + +/* Deprecated */ + void riscv_cfft_radix4_f32( + const riscv_cfft_radix4_instance_f32 * S, + float32_t * pSrc); + + /** + * @brief Instance structure for the fixed-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + const q15_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t bitRevLength; /**< bit reversal table length. */ + } riscv_cfft_instance_q15; + +void riscv_cfft_q15( + const riscv_cfft_instance_q15 * S, + q15_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + /** + * @brief Instance structure for the fixed-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + const q31_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t bitRevLength; /**< bit reversal table length. */ + } riscv_cfft_instance_q31; + +void riscv_cfft_q31( + const riscv_cfft_instance_q31 * S, + q31_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + /** + * @brief Instance structure for the floating-point CFFT/CIFFT function. + */ + typedef struct + { + uint16_t fftLen; /**< length of the FFT. */ + const float32_t *pTwiddle; /**< points to the Twiddle factor table. */ + const uint16_t *pBitRevTable; /**< points to the bit reversal table. */ + uint16_t bitRevLength; /**< bit reversal table length. */ + } riscv_cfft_instance_f32; + + void riscv_cfft_f32( + const riscv_cfft_instance_f32 * S, + float32_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag); + + /** + * @brief Instance structure for the Q15 RFFT/RIFFT function. + */ + typedef struct + { + uint32_t fftLenReal; /**< length of the real FFT. */ + uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */ + uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */ + uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + const q15_t *pTwiddleAReal; /**< points to the real twiddle factor table. */ + const q15_t *pTwiddleBReal; /**< points to the imag twiddle factor table. */ + const riscv_cfft_instance_q15 *pCfft; /**< points to the complex FFT instance. */ + } riscv_rfft_instance_q15; + + riscv_status riscv_rfft_init_q15( + riscv_rfft_instance_q15 * S, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag); + + void riscv_rfft_q15( + const riscv_rfft_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst); + + /** + * @brief Instance structure for the Q31 RFFT/RIFFT function. + */ + typedef struct + { + uint32_t fftLenReal; /**< length of the real FFT. */ + uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */ + uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */ + uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + const q31_t *pTwiddleAReal; /**< points to the real twiddle factor table. */ + const q31_t *pTwiddleBReal; /**< points to the imag twiddle factor table. */ + const riscv_cfft_instance_q31 *pCfft; /**< points to the complex FFT instance. */ + } riscv_rfft_instance_q31; + + riscv_status riscv_rfft_init_q31( + riscv_rfft_instance_q31 * S, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag); + + void riscv_rfft_q31( + const riscv_rfft_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst); + + /** + * @brief Instance structure for the floating-point RFFT/RIFFT function. + */ + typedef struct + { + uint32_t fftLenReal; /**< length of the real FFT. */ + uint16_t fftLenBy2; /**< length of the complex FFT. */ + uint8_t ifftFlagR; /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */ + uint8_t bitReverseFlagR; /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */ + uint32_t twidCoefRModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */ + const float32_t *pTwiddleAReal; /**< points to the real twiddle factor table. */ + const float32_t *pTwiddleBReal; /**< points to the imag twiddle factor table. */ + riscv_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */ + } riscv_rfft_instance_f32; + + riscv_status riscv_rfft_init_f32( + riscv_rfft_instance_f32 * S, + riscv_cfft_radix4_instance_f32 * S_CFFT, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag); + + void riscv_rfft_f32( + const riscv_rfft_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst); + + /** + * @brief Instance structure for the floating-point RFFT/RIFFT function. + */ +typedef struct + { + riscv_cfft_instance_f32 Sint; /**< Internal CFFT structure. */ + uint16_t fftLenRFFT; /**< length of the real sequence */ + const float32_t * pTwiddleRFFT; /**< Twiddle factors real stage */ + } riscv_rfft_fast_instance_f32 ; + +riscv_status riscv_rfft_fast_init_f32 ( + riscv_rfft_fast_instance_f32 * S, + uint16_t fftLen); + +riscv_status riscv_rfft_32_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S ); + +riscv_status riscv_rfft_64_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S ); + +riscv_status riscv_rfft_128_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S ); + +riscv_status riscv_rfft_256_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S ); + +riscv_status riscv_rfft_512_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S ); + +riscv_status riscv_rfft_1024_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S ); + +riscv_status riscv_rfft_2048_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S ); + +riscv_status riscv_rfft_4096_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S ); + + + void riscv_rfft_fast_f32( + riscv_rfft_fast_instance_f32 * S, + float32_t * p, float32_t * pOut, + uint8_t ifftFlag); + + /** + * @brief Instance structure for the floating-point DCT4/IDCT4 function. + */ + typedef struct + { + uint16_t N; /**< length of the DCT4. */ + uint16_t Nby2; /**< half of the length of the DCT4. */ + float32_t normalize; /**< normalizing factor. */ + const float32_t *pTwiddle; /**< points to the twiddle factor table. */ + const float32_t *pCosFactor; /**< points to the cosFactor table. */ + riscv_rfft_instance_f32 *pRfft; /**< points to the real FFT instance. */ + riscv_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */ + } riscv_dct4_instance_f32; + + + /** + * @brief Initialization function for the floating-point DCT4/IDCT4. + * @param[in,out] S points to an instance of floating-point DCT4/IDCT4 structure. + * @param[in] S_RFFT points to an instance of floating-point RFFT/RIFFT structure. + * @param[in] S_CFFT points to an instance of floating-point CFFT/CIFFT structure. + * @param[in] N length of the DCT4. + * @param[in] Nby2 half of the length of the DCT4. + * @param[in] normalize normalizing factor. + * @return riscv_status function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_ARGUMENT_ERROR if fftLenReal is not a supported transform length. + */ + riscv_status riscv_dct4_init_f32( + riscv_dct4_instance_f32 * S, + riscv_rfft_instance_f32 * S_RFFT, + riscv_cfft_radix4_instance_f32 * S_CFFT, + uint16_t N, + uint16_t Nby2, + float32_t normalize); + + + /** + * @brief Processing function for the floating-point DCT4/IDCT4. + * @param[in] S points to an instance of the floating-point DCT4/IDCT4 structure. + * @param[in] pState points to state buffer. + * @param[in,out] pInlineBuffer points to the in-place input and output buffer. + */ + void riscv_dct4_f32( + const riscv_dct4_instance_f32 * S, + float32_t * pState, + float32_t * pInlineBuffer); + + + /** + * @brief Instance structure for the Q31 DCT4/IDCT4 function. + */ + typedef struct + { + uint16_t N; /**< length of the DCT4. */ + uint16_t Nby2; /**< half of the length of the DCT4. */ + q31_t normalize; /**< normalizing factor. */ + const q31_t *pTwiddle; /**< points to the twiddle factor table. */ + const q31_t *pCosFactor; /**< points to the cosFactor table. */ + riscv_rfft_instance_q31 *pRfft; /**< points to the real FFT instance. */ + riscv_cfft_radix4_instance_q31 *pCfft; /**< points to the complex FFT instance. */ + } riscv_dct4_instance_q31; + + + /** + * @brief Initialization function for the Q31 DCT4/IDCT4. + * @param[in,out] S points to an instance of Q31 DCT4/IDCT4 structure. + * @param[in] S_RFFT points to an instance of Q31 RFFT/RIFFT structure + * @param[in] S_CFFT points to an instance of Q31 CFFT/CIFFT structure + * @param[in] N length of the DCT4. + * @param[in] Nby2 half of the length of the DCT4. + * @param[in] normalize normalizing factor. + * @return riscv_status function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_ARGUMENT_ERROR if N is not a supported transform length. + */ + riscv_status riscv_dct4_init_q31( + riscv_dct4_instance_q31 * S, + riscv_rfft_instance_q31 * S_RFFT, + riscv_cfft_radix4_instance_q31 * S_CFFT, + uint16_t N, + uint16_t Nby2, + q31_t normalize); + + + /** + * @brief Processing function for the Q31 DCT4/IDCT4. + * @param[in] S points to an instance of the Q31 DCT4 structure. + * @param[in] pState points to state buffer. + * @param[in,out] pInlineBuffer points to the in-place input and output buffer. + */ + void riscv_dct4_q31( + const riscv_dct4_instance_q31 * S, + q31_t * pState, + q31_t * pInlineBuffer); + + + /** + * @brief Instance structure for the Q15 DCT4/IDCT4 function. + */ + typedef struct + { + uint16_t N; /**< length of the DCT4. */ + uint16_t Nby2; /**< half of the length of the DCT4. */ + q15_t normalize; /**< normalizing factor. */ + const q15_t *pTwiddle; /**< points to the twiddle factor table. */ + const q15_t *pCosFactor; /**< points to the cosFactor table. */ + riscv_rfft_instance_q15 *pRfft; /**< points to the real FFT instance. */ + riscv_cfft_radix4_instance_q15 *pCfft; /**< points to the complex FFT instance. */ + } riscv_dct4_instance_q15; + + + /** + * @brief Initialization function for the Q15 DCT4/IDCT4. + * @param[in,out] S points to an instance of Q15 DCT4/IDCT4 structure. + * @param[in] S_RFFT points to an instance of Q15 RFFT/RIFFT structure. + * @param[in] S_CFFT points to an instance of Q15 CFFT/CIFFT structure. + * @param[in] N length of the DCT4. + * @param[in] Nby2 half of the length of the DCT4. + * @param[in] normalize normalizing factor. + * @return riscv_status function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_ARGUMENT_ERROR if N is not a supported transform length. + */ + riscv_status riscv_dct4_init_q15( + riscv_dct4_instance_q15 * S, + riscv_rfft_instance_q15 * S_RFFT, + riscv_cfft_radix4_instance_q15 * S_CFFT, + uint16_t N, + uint16_t Nby2, + q15_t normalize); + + + /** + * @brief Processing function for the Q15 DCT4/IDCT4. + * @param[in] S points to an instance of the Q15 DCT4 structure. + * @param[in] pState points to state buffer. + * @param[in,out] pInlineBuffer points to the in-place input and output buffer. + */ + void riscv_dct4_q15( + const riscv_dct4_instance_q15 * S, + q15_t * pState, + q15_t * pInlineBuffer); + + + /** + * @brief Floating-point vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_add_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q7 vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_add_q7( + const q7_t * pSrcA, + const q7_t * pSrcB, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q15 vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_add_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q31 vector addition. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_add_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Floating-point vector subtraction. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_sub_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q7 vector subtraction. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_sub_q7( + const q7_t * pSrcA, + const q7_t * pSrcB, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q15 vector subtraction. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_sub_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q31 vector subtraction. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in each vector + */ + void riscv_sub_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Multiplies a floating-point vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scale scale factor to be applied + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_scale_f32( + const float32_t * pSrc, + float32_t scale, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Multiplies a Q7 vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scaleFract fractional portion of the scale value + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_scale_q7( + const q7_t * pSrc, + q7_t scaleFract, + int8_t shift, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Multiplies a Q15 vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scaleFract fractional portion of the scale value + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_scale_q15( + const q15_t * pSrc, + q15_t scaleFract, + int8_t shift, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Multiplies a Q31 vector by a scalar. + * @param[in] pSrc points to the input vector + * @param[in] scaleFract fractional portion of the scale value + * @param[in] shift number of bits to shift the result by + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_scale_q31( + const q31_t * pSrc, + q31_t scaleFract, + int8_t shift, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q7 vector absolute value. + * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void riscv_abs_q7( + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Floating-point vector absolute value. + * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void riscv_abs_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q15 vector absolute value. + * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void riscv_abs_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Q31 vector absolute value. + * @param[in] pSrc points to the input buffer + * @param[out] pDst points to the output buffer + * @param[in] blockSize number of samples in each vector + */ + void riscv_abs_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Dot product of floating-point vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void riscv_dot_prod_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + uint32_t blockSize, + float32_t * result); + + + /** + * @brief Dot product of Q7 vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void riscv_dot_prod_q7( + const q7_t * pSrcA, + const q7_t * pSrcB, + uint32_t blockSize, + q31_t * result); + + + /** + * @brief Dot product of Q15 vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void riscv_dot_prod_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + uint32_t blockSize, + q63_t * result); + + + /** + * @brief Dot product of Q31 vectors. + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] blockSize number of samples in each vector + * @param[out] result output result returned here + */ + void riscv_dot_prod_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + uint32_t blockSize, + q63_t * result); + + + /** + * @brief Shifts the elements of a Q7 vector a specified number of bits. + * @param[in] pSrc points to the input vector + * @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right. + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_shift_q7( + const q7_t * pSrc, + int8_t shiftBits, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Shifts the elements of a Q15 vector a specified number of bits. + * @param[in] pSrc points to the input vector + * @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right. + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_shift_q15( + const q15_t * pSrc, + int8_t shiftBits, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Shifts the elements of a Q31 vector a specified number of bits. + * @param[in] pSrc points to the input vector + * @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right. + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_shift_q31( + const q31_t * pSrc, + int8_t shiftBits, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Adds a constant offset to a floating-point vector. + * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_offset_f32( + const float32_t * pSrc, + float32_t offset, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Adds a constant offset to a Q7 vector. + * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_offset_q7( + const q7_t * pSrc, + q7_t offset, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Adds a constant offset to a Q15 vector. + * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_offset_q15( + const q15_t * pSrc, + q15_t offset, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Adds a constant offset to a Q31 vector. + * @param[in] pSrc points to the input vector + * @param[in] offset is the offset to be added + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_offset_q31( + const q31_t * pSrc, + q31_t offset, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Negates the elements of a floating-point vector. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_negate_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Negates the elements of a Q7 vector. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_negate_q7( + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Negates the elements of a Q15 vector. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_negate_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Negates the elements of a Q31 vector. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] blockSize number of samples in the vector + */ + void riscv_negate_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Copies the elements of a floating-point vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_copy_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Copies the elements of a Q7 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_copy_q7( + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Copies the elements of a Q15 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_copy_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Copies the elements of a Q31 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_copy_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Fills a constant value into a floating-point vector. + * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_fill_f32( + float32_t value, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Fills a constant value into a Q7 vector. + * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_fill_q7( + q7_t value, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Fills a constant value into a Q15 vector. + * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_fill_q15( + q15_t value, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Fills a constant value into a Q31 vector. + * @param[in] value input value to be filled + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_fill_q31( + q31_t value, + q31_t * pDst, + uint32_t blockSize); + + +/** + * @brief Convolution of floating-point sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + */ + void riscv_conv_f32( + const float32_t * pSrcA, + uint32_t srcALen, + const float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst); + + + /** + * @brief Convolution of Q15 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + * @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + */ + void riscv_conv_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2); + + +/** + * @brief Convolution of Q15 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + */ + void riscv_conv_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst); + + + /** + * @brief Convolution of Q15 sequences (fast version) for RISC-V Core with DSP enabled + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + */ + void riscv_conv_fast_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst); + + + /** + * @brief Convolution of Q15 sequences (fast version) for RISC-V Core with DSP enabled + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + * @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + */ + void riscv_conv_fast_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Convolution of Q31 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + */ + void riscv_conv_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst); + + + /** + * @brief Convolution of Q31 sequences (fast version) for RISC-V Core with DSP enabled + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + */ + void riscv_conv_fast_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst); + + + /** + * @brief Convolution of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + * @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + */ + void riscv_conv_opt_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Convolution of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length srcALen+srcBLen-1. + */ + void riscv_conv_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst); + + + /** + * @brief Partial convolution of floating-point sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_f32( + const float32_t * pSrcA, + uint32_t srcALen, + const float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q15 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Partial convolution of Q15 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q15 sequences (fast version) for RISC-V Core with DSP enabled + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_fast_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q15 sequences (fast version) for RISC-V Core with DSP enabled + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen). + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_fast_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Partial convolution of Q31 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q31 sequences (fast version) for RISC-V Core with DSP enabled + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_fast_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Partial convolution of Q7 sequences + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_opt_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints, + q15_t * pScratch1, + q15_t * pScratch2); + + +/** + * @brief Partial convolution of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data + * @param[in] firstIndex is the first output sample to start with. + * @param[in] numPoints is the number of output points to be computed. + * @return Returns either RISCV_MATH_SUCCESS if the function completed correctly or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. + */ + riscv_status riscv_conv_partial_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + uint32_t firstIndex, + uint32_t numPoints); + + + /** + * @brief Instance structure for the Q15 FIR decimator. + */ + typedef struct + { + uint8_t M; /**< decimation factor. */ + uint16_t numTaps; /**< number of coefficients in the filter. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + q15_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + } riscv_fir_decimate_instance_q15; + + /** + * @brief Instance structure for the Q31 FIR decimator. + */ + typedef struct + { + uint8_t M; /**< decimation factor. */ + uint16_t numTaps; /**< number of coefficients in the filter. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + q31_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + } riscv_fir_decimate_instance_q31; + +/** + @brief Instance structure for floating-point FIR decimator. + */ +typedef struct + { + uint8_t M; /**< decimation factor. */ + uint16_t numTaps; /**< number of coefficients in the filter. */ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + } riscv_fir_decimate_instance_f32; + + +/** + @brief Processing function for floating-point FIR decimator. + @param[in] S points to an instance of the floating-point FIR decimator structure + @param[in] pSrc points to the block of input data + @param[out] pDst points to the block of output data + @param[in] blockSize number of samples to process + */ +void riscv_fir_decimate_f32( + const riscv_fir_decimate_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + +/** + @brief Initialization function for the floating-point FIR decimator. + @param[in,out] S points to an instance of the floating-point FIR decimator structure + @param[in] numTaps number of coefficients in the filter + @param[in] M decimation factor + @param[in] pCoeffs points to the filter coefficients + @param[in] pState points to the state buffer + @param[in] blockSize number of input samples to process per call + @return execution status + - \ref RISCV_MATH_SUCCESS : Operation successful + - \ref RISCV_MATH_LENGTH_ERROR : blockSize is not a multiple of M + */ +riscv_status riscv_fir_decimate_init_f32( + riscv_fir_decimate_instance_f32 * S, + uint16_t numTaps, + uint8_t M, + const float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q15 FIR decimator. + * @param[in] S points to an instance of the Q15 FIR decimator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_decimate_q15( + const riscv_fir_decimate_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q15 FIR decimator (fast variant) for RISC-V Core with DSP enabled. + * @param[in] S points to an instance of the Q15 FIR decimator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_decimate_fast_q15( + const riscv_fir_decimate_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q15 FIR decimator. + * @param[in,out] S points to an instance of the Q15 FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + */ + riscv_status riscv_fir_decimate_init_q15( + riscv_fir_decimate_instance_q15 * S, + uint16_t numTaps, + uint8_t M, + const q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q31 FIR decimator. + * @param[in] S points to an instance of the Q31 FIR decimator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_decimate_q31( + const riscv_fir_decimate_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + /** + * @brief Processing function for the Q31 FIR decimator (fast variant) for RISC-V Core with DSP enabled. + * @param[in] S points to an instance of the Q31 FIR decimator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_decimate_fast_q31( + const riscv_fir_decimate_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 FIR decimator. + * @param[in,out] S points to an instance of the Q31 FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if + * blockSize is not a multiple of M. + */ + riscv_status riscv_fir_decimate_init_q31( + riscv_fir_decimate_instance_q31 * S, + uint16_t numTaps, + uint8_t M, + const q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q15 FIR interpolator. + */ + typedef struct + { + uint8_t L; /**< upsample factor. */ + uint16_t phaseLength; /**< length of each polyphase filter component. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length L*phaseLength. */ + q15_t *pState; /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */ + } riscv_fir_interpolate_instance_q15; + + /** + * @brief Instance structure for the Q31 FIR interpolator. + */ + typedef struct + { + uint8_t L; /**< upsample factor. */ + uint16_t phaseLength; /**< length of each polyphase filter component. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length L*phaseLength. */ + q31_t *pState; /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */ + } riscv_fir_interpolate_instance_q31; + + /** + * @brief Instance structure for the floating-point FIR interpolator. + */ + typedef struct + { + uint8_t L; /**< upsample factor. */ + uint16_t phaseLength; /**< length of each polyphase filter component. */ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length L*phaseLength. */ + float32_t *pState; /**< points to the state variable array. The array is of length phaseLength+numTaps-1. */ + } riscv_fir_interpolate_instance_f32; + + + /** + * @brief Processing function for the Q15 FIR interpolator. + * @param[in] S points to an instance of the Q15 FIR interpolator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_interpolate_q15( + const riscv_fir_interpolate_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q15 FIR interpolator. + * @param[in,out] S points to an instance of the Q15 FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficient buffer. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + */ + riscv_status riscv_fir_interpolate_init_q15( + riscv_fir_interpolate_instance_q15 * S, + uint8_t L, + uint16_t numTaps, + const q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q31 FIR interpolator. + * @param[in] S points to an instance of the Q15 FIR interpolator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_interpolate_q31( + const riscv_fir_interpolate_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 FIR interpolator. + * @param[in,out] S points to an instance of the Q31 FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficient buffer. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + */ + riscv_status riscv_fir_interpolate_init_q31( + riscv_fir_interpolate_instance_q31 * S, + uint8_t L, + uint16_t numTaps, + const q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the floating-point FIR interpolator. + * @param[in] S points to an instance of the floating-point FIR interpolator structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_interpolate_f32( + const riscv_fir_interpolate_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the floating-point FIR interpolator. + * @param[in,out] S points to an instance of the floating-point FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] pCoeffs points to the filter coefficient buffer. + * @param[in] pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if + * the filter length numTaps is not a multiple of the interpolation factor L. + */ + riscv_status riscv_fir_interpolate_init_f32( + riscv_fir_interpolate_instance_f32 * S, + uint8_t L, + uint16_t numTaps, + const float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize); + + + /** + * @brief Instance structure for the high precision Q31 Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + q63_t *pState; /**< points to the array of state coefficients. The array is of length 4*numStages. */ + const q31_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + uint8_t postShift; /**< additional shift, in bits, applied to each output sample. */ + } riscv_biquad_cas_df1_32x64_ins_q31; + + + /** + * @param[in] S points to an instance of the high precision Q31 Biquad cascade filter structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cas_df1_32x64_q31( + const riscv_biquad_cas_df1_32x64_ins_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @param[in,out] S points to an instance of the high precision Q31 Biquad cascade filter structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] postShift shift to be applied to the output. Varies according to the coefficients format + */ + void riscv_biquad_cas_df1_32x64_init_q31( + riscv_biquad_cas_df1_32x64_ins_q31 * S, + uint8_t numStages, + const q31_t * pCoeffs, + q63_t * pState, + uint8_t postShift); + + + /** + * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float32_t *pState; /**< points to the array of state coefficients. The array is of length 2*numStages. */ + const float32_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + } riscv_biquad_cascade_df2T_instance_f32; + + /** + * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float32_t *pState; /**< points to the array of state coefficients. The array is of length 4*numStages. */ + const float32_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + } riscv_biquad_cascade_stereo_df2T_instance_f32; + + /** + * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter. + */ + typedef struct + { + uint8_t numStages; /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */ + float64_t *pState; /**< points to the array of state coefficients. The array is of length 2*numStages. */ + float64_t *pCoeffs; /**< points to the array of coefficients. The array is of length 5*numStages. */ + } riscv_biquad_cascade_df2T_instance_f64; + + + /** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in] S points to an instance of the filter data structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cascade_df2T_f32( + const riscv_biquad_cascade_df2T_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. 2 channels + * @param[in] S points to an instance of the filter data structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cascade_stereo_df2T_f32( + const riscv_biquad_cascade_stereo_df2T_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in] S points to an instance of the filter data structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void riscv_biquad_cascade_df2T_f64( + const riscv_biquad_cascade_df2T_instance_f64 * S, + float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void riscv_biquad_cascade_df2T_init_f32( + riscv_biquad_cascade_df2T_instance_f32 * S, + uint8_t numStages, + const float32_t * pCoeffs, + float32_t * pState); + + + /** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void riscv_biquad_cascade_stereo_df2T_init_f32( + riscv_biquad_cascade_stereo_df2T_instance_f32 * S, + uint8_t numStages, + const float32_t * pCoeffs, + float32_t * pState); + + + /** + * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + * @param[in,out] S points to an instance of the filter data structure. + * @param[in] numStages number of 2nd order stages in the filter. + * @param[in] pCoeffs points to the filter coefficients. + * @param[in] pState points to the state buffer. + */ + void riscv_biquad_cascade_df2T_init_f64( + riscv_biquad_cascade_df2T_instance_f64 * S, + uint8_t numStages, + float64_t * pCoeffs, + float64_t * pState); + + + /** + * @brief Instance structure for the Q15 FIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of filter stages. */ + q15_t *pState; /**< points to the state variable array. The array is of length numStages. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numStages. */ + } riscv_fir_lattice_instance_q15; + + /** + * @brief Instance structure for the Q31 FIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of filter stages. */ + q31_t *pState; /**< points to the state variable array. The array is of length numStages. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numStages. */ + } riscv_fir_lattice_instance_q31; + + /** + * @brief Instance structure for the floating-point FIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of filter stages. */ + float32_t *pState; /**< points to the state variable array. The array is of length numStages. */ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numStages. */ + } riscv_fir_lattice_instance_f32; + + + /** + * @brief Initialization function for the Q15 FIR lattice filter. + * @param[in] S points to an instance of the Q15 FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] pState points to the state buffer. The array is of length numStages. + */ + void riscv_fir_lattice_init_q15( + riscv_fir_lattice_instance_q15 * S, + uint16_t numStages, + const q15_t * pCoeffs, + q15_t * pState); + + + /** + * @brief Processing function for the Q15 FIR lattice filter. + * @param[in] S points to an instance of the Q15 FIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_lattice_q15( + const riscv_fir_lattice_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 FIR lattice filter. + * @param[in] S points to an instance of the Q31 FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] pState points to the state buffer. The array is of length numStages. + */ + void riscv_fir_lattice_init_q31( + riscv_fir_lattice_instance_q31 * S, + uint16_t numStages, + const q31_t * pCoeffs, + q31_t * pState); + + + /** + * @brief Processing function for the Q31 FIR lattice filter. + * @param[in] S points to an instance of the Q31 FIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_lattice_q31( + const riscv_fir_lattice_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + +/** + * @brief Initialization function for the floating-point FIR lattice filter. + * @param[in] S points to an instance of the floating-point FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] pState points to the state buffer. The array is of length numStages. + */ + void riscv_fir_lattice_init_f32( + riscv_fir_lattice_instance_f32 * S, + uint16_t numStages, + const float32_t * pCoeffs, + float32_t * pState); + + + /** + * @brief Processing function for the floating-point FIR lattice filter. + * @param[in] S points to an instance of the floating-point FIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ + void riscv_fir_lattice_f32( + const riscv_fir_lattice_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q15 IIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of stages in the filter. */ + q15_t *pState; /**< points to the state variable array. The array is of length numStages+blockSize. */ + q15_t *pkCoeffs; /**< points to the reflection coefficient array. The array is of length numStages. */ + q15_t *pvCoeffs; /**< points to the ladder coefficient array. The array is of length numStages+1. */ + } riscv_iir_lattice_instance_q15; + + /** + * @brief Instance structure for the Q31 IIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of stages in the filter. */ + q31_t *pState; /**< points to the state variable array. The array is of length numStages+blockSize. */ + q31_t *pkCoeffs; /**< points to the reflection coefficient array. The array is of length numStages. */ + q31_t *pvCoeffs; /**< points to the ladder coefficient array. The array is of length numStages+1. */ + } riscv_iir_lattice_instance_q31; + + /** + * @brief Instance structure for the floating-point IIR lattice filter. + */ + typedef struct + { + uint16_t numStages; /**< number of stages in the filter. */ + float32_t *pState; /**< points to the state variable array. The array is of length numStages+blockSize. */ + float32_t *pkCoeffs; /**< points to the reflection coefficient array. The array is of length numStages. */ + float32_t *pvCoeffs; /**< points to the ladder coefficient array. The array is of length numStages+1. */ + } riscv_iir_lattice_instance_f32; + + + /** + * @brief Processing function for the floating-point IIR lattice filter. + * @param[in] S points to an instance of the floating-point IIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_iir_lattice_f32( + const riscv_iir_lattice_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the floating-point IIR lattice filter. + * @param[in] S points to an instance of the floating-point IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] pkCoeffs points to the reflection coefficient buffer. The array is of length numStages. + * @param[in] pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1. + * @param[in] pState points to the state buffer. The array is of length numStages+blockSize-1. + * @param[in] blockSize number of samples to process. + */ + void riscv_iir_lattice_init_f32( + riscv_iir_lattice_instance_f32 * S, + uint16_t numStages, + float32_t * pkCoeffs, + float32_t * pvCoeffs, + float32_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q31 IIR lattice filter. + * @param[in] S points to an instance of the Q31 IIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_iir_lattice_q31( + const riscv_iir_lattice_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 IIR lattice filter. + * @param[in] S points to an instance of the Q31 IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] pkCoeffs points to the reflection coefficient buffer. The array is of length numStages. + * @param[in] pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1. + * @param[in] pState points to the state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process. + */ + void riscv_iir_lattice_init_q31( + riscv_iir_lattice_instance_q31 * S, + uint16_t numStages, + q31_t * pkCoeffs, + q31_t * pvCoeffs, + q31_t * pState, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q15 IIR lattice filter. + * @param[in] S points to an instance of the Q15 IIR lattice structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + void riscv_iir_lattice_q15( + const riscv_iir_lattice_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + +/** + * @brief Initialization function for the Q15 IIR lattice filter. + * @param[in] S points to an instance of the fixed-point Q15 IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] pkCoeffs points to reflection coefficient buffer. The array is of length numStages. + * @param[in] pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1. + * @param[in] pState points to state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process per call. + */ + void riscv_iir_lattice_init_q15( + riscv_iir_lattice_instance_q15 * S, + uint16_t numStages, + q15_t * pkCoeffs, + q15_t * pvCoeffs, + q15_t * pState, + uint32_t blockSize); + + + /** + * @brief Instance structure for the floating-point LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + float32_t mu; /**< step size that controls filter coefficient updates. */ + } riscv_lms_instance_f32; + + + /** + * @brief Processing function for floating-point LMS filter. + * @param[in] S points to an instance of the floating-point LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void riscv_lms_f32( + const riscv_lms_instance_f32 * S, + const float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for floating-point LMS filter. + * @param[in] S points to an instance of the floating-point LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to the coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + */ + void riscv_lms_init_f32( + riscv_lms_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + float32_t mu, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q15 LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + q15_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + q15_t mu; /**< step size that controls filter coefficient updates. */ + uint32_t postShift; /**< bit shift applied to coefficients. */ + } riscv_lms_instance_q15; + + + /** + * @brief Initialization function for the Q15 LMS filter. + * @param[in] S points to an instance of the Q15 LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to the coefficient buffer. + * @param[in] pState points to the state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + */ + void riscv_lms_init_q15( + riscv_lms_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + q15_t mu, + uint32_t blockSize, + uint32_t postShift); + + + /** + * @brief Processing function for Q15 LMS filter. + * @param[in] S points to an instance of the Q15 LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void riscv_lms_q15( + const riscv_lms_instance_q15 * S, + const q15_t * pSrc, + q15_t * pRef, + q15_t * pOut, + q15_t * pErr, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q31 LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + q31_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + q31_t mu; /**< step size that controls filter coefficient updates. */ + uint32_t postShift; /**< bit shift applied to coefficients. */ + } riscv_lms_instance_q31; + + + /** + * @brief Processing function for Q31 LMS filter. + * @param[in] S points to an instance of the Q15 LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void riscv_lms_q31( + const riscv_lms_instance_q31 * S, + const q31_t * pSrc, + q31_t * pRef, + q31_t * pOut, + q31_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for Q31 LMS filter. + * @param[in] S points to an instance of the Q31 LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + */ + void riscv_lms_init_q31( + riscv_lms_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + q31_t mu, + uint32_t blockSize, + uint32_t postShift); + + + /** + * @brief Instance structure for the floating-point normalized LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + float32_t mu; /**< step size that control filter coefficient updates. */ + float32_t energy; /**< saves previous frame energy. */ + float32_t x0; /**< saves previous input sample. */ + } riscv_lms_norm_instance_f32; + + + /** + * @brief Processing function for floating-point normalized LMS filter. + * @param[in] S points to an instance of the floating-point normalized LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void riscv_lms_norm_f32( + riscv_lms_norm_instance_f32 * S, + const float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for floating-point normalized LMS filter. + * @param[in] S points to an instance of the floating-point LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + */ + void riscv_lms_norm_init_f32( + riscv_lms_norm_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + float32_t mu, + uint32_t blockSize); + + + /** + * @brief Instance structure for the Q31 normalized LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + q31_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + q31_t mu; /**< step size that controls filter coefficient updates. */ + uint8_t postShift; /**< bit shift applied to coefficients. */ + const q31_t *recipTable; /**< points to the reciprocal initial value table. */ + q31_t energy; /**< saves previous frame energy. */ + q31_t x0; /**< saves previous input sample. */ + } riscv_lms_norm_instance_q31; + + + /** + * @brief Processing function for Q31 normalized LMS filter. + * @param[in] S points to an instance of the Q31 normalized LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void riscv_lms_norm_q31( + riscv_lms_norm_instance_q31 * S, + const q31_t * pSrc, + q31_t * pRef, + q31_t * pOut, + q31_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for Q31 normalized LMS filter. + * @param[in] S points to an instance of the Q31 normalized LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + */ + void riscv_lms_norm_init_q31( + riscv_lms_norm_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + q31_t mu, + uint32_t blockSize, + uint8_t postShift); + + + /** + * @brief Instance structure for the Q15 normalized LMS filter. + */ + typedef struct + { + uint16_t numTaps; /**< Number of coefficients in the filter. */ + q15_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + q15_t mu; /**< step size that controls filter coefficient updates. */ + uint8_t postShift; /**< bit shift applied to coefficients. */ + const q15_t *recipTable; /**< Points to the reciprocal initial value table. */ + q15_t energy; /**< saves previous frame energy. */ + q15_t x0; /**< saves previous input sample. */ + } riscv_lms_norm_instance_q15; + + + /** + * @brief Processing function for Q15 normalized LMS filter. + * @param[in] S points to an instance of the Q15 normalized LMS filter structure. + * @param[in] pSrc points to the block of input data. + * @param[in] pRef points to the block of reference data. + * @param[out] pOut points to the block of output data. + * @param[out] pErr points to the block of error data. + * @param[in] blockSize number of samples to process. + */ + void riscv_lms_norm_q15( + riscv_lms_norm_instance_q15 * S, + const q15_t * pSrc, + q15_t * pRef, + q15_t * pOut, + q15_t * pErr, + uint32_t blockSize); + + + /** + * @brief Initialization function for Q15 normalized LMS filter. + * @param[in] S points to an instance of the Q15 normalized LMS filter structure. + * @param[in] numTaps number of filter coefficients. + * @param[in] pCoeffs points to coefficient buffer. + * @param[in] pState points to state buffer. + * @param[in] mu step size that controls filter coefficient updates. + * @param[in] blockSize number of samples to process. + * @param[in] postShift bit shift applied to coefficients. + */ + void riscv_lms_norm_init_q15( + riscv_lms_norm_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + q15_t mu, + uint32_t blockSize, + uint8_t postShift); + + + /** + * @brief Correlation of floating-point sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ + void riscv_correlate_f32( + const float32_t * pSrcA, + uint32_t srcALen, + const float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst); + + +/** + @brief Correlation of Q15 sequences + @param[in] pSrcA points to the first input sequence + @param[in] srcALen length of the first input sequence + @param[in] pSrcB points to the second input sequence + @param[in] srcBLen length of the second input sequence + @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + @param[in] pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. +*/ +void riscv_correlate_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch); + + +/** + @brief Correlation of Q15 sequences. + @param[in] pSrcA points to the first input sequence + @param[in] srcALen length of the first input sequence + @param[in] pSrcB points to the second input sequence + @param[in] srcBLen length of the second input sequence + @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ + void riscv_correlate_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst); + + +/** + @brief Correlation of Q15 sequences (fast version). + @param[in] pSrcA points to the first input sequence + @param[in] srcALen length of the first input sequence + @param[in] pSrcB points to the second input sequence + @param[in] srcBLen length of the second input sequence + @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. + @return none + */ +void riscv_correlate_fast_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst); + +/** + @brief Correlation of Q15 sequences (fast version). + @param[in] pSrcA points to the first input sequence. + @param[in] srcALen length of the first input sequence. + @param[in] pSrcB points to the second input sequence. + @param[in] srcBLen length of the second input sequence. + @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + @param[in] pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + */ +void riscv_correlate_fast_opt_q15( + const q15_t * pSrcA, + uint32_t srcALen, + const q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst, + q15_t * pScratch); + + + /** + * @brief Correlation of Q31 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ + void riscv_correlate_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst); + + +/** + @brief Correlation of Q31 sequences (fast version). + @param[in] pSrcA points to the first input sequence + @param[in] srcALen length of the first input sequence + @param[in] pSrcB points to the second input sequence + @param[in] srcBLen length of the second input sequence + @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ +void riscv_correlate_fast_q31( + const q31_t * pSrcA, + uint32_t srcALen, + const q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst); + + + /** + * @brief Correlation of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + * @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. + * @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). + */ + void riscv_correlate_opt_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst, + q15_t * pScratch1, + q15_t * pScratch2); + + + /** + * @brief Correlation of Q7 sequences. + * @param[in] pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] pDst points to the block of output data Length 2 * max(srcALen, srcBLen) - 1. + */ + void riscv_correlate_q7( + const q7_t * pSrcA, + uint32_t srcALen, + const q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst); + + + /** + * @brief Instance structure for the floating-point sparse FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + uint16_t stateIndex; /**< state buffer index. Points to the oldest sample in the state buffer. */ + float32_t *pState; /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */ + const float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + uint16_t maxDelay; /**< maximum offset specified by the pTapDelay array. */ + int32_t *pTapDelay; /**< points to the array of delay values. The array is of length numTaps. */ + } riscv_fir_sparse_instance_f32; + + /** + * @brief Instance structure for the Q31 sparse FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + uint16_t stateIndex; /**< state buffer index. Points to the oldest sample in the state buffer. */ + q31_t *pState; /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */ + const q31_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + uint16_t maxDelay; /**< maximum offset specified by the pTapDelay array. */ + int32_t *pTapDelay; /**< points to the array of delay values. The array is of length numTaps. */ + } riscv_fir_sparse_instance_q31; + + /** + * @brief Instance structure for the Q15 sparse FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + uint16_t stateIndex; /**< state buffer index. Points to the oldest sample in the state buffer. */ + q15_t *pState; /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */ + const q15_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + uint16_t maxDelay; /**< maximum offset specified by the pTapDelay array. */ + int32_t *pTapDelay; /**< points to the array of delay values. The array is of length numTaps. */ + } riscv_fir_sparse_instance_q15; + + /** + * @brief Instance structure for the Q7 sparse FIR filter. + */ + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + uint16_t stateIndex; /**< state buffer index. Points to the oldest sample in the state buffer. */ + q7_t *pState; /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */ + const q7_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps.*/ + uint16_t maxDelay; /**< maximum offset specified by the pTapDelay array. */ + int32_t *pTapDelay; /**< points to the array of delay values. The array is of length numTaps. */ + } riscv_fir_sparse_instance_q7; + + + /** + * @brief Processing function for the floating-point sparse FIR filter. + * @param[in] S points to an instance of the floating-point sparse FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_sparse_f32( + riscv_fir_sparse_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + float32_t * pScratchIn, + uint32_t blockSize); + + + /** + * @brief Initialization function for the floating-point sparse FIR filter. + * @param[in,out] S points to an instance of the floating-point sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] pCoeffs points to the array of filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + */ + void riscv_fir_sparse_init_f32( + riscv_fir_sparse_instance_f32 * S, + uint16_t numTaps, + const float32_t * pCoeffs, + float32_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q31 sparse FIR filter. + * @param[in] S points to an instance of the Q31 sparse FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_sparse_q31( + riscv_fir_sparse_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + q31_t * pScratchIn, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q31 sparse FIR filter. + * @param[in,out] S points to an instance of the Q31 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] pCoeffs points to the array of filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + */ + void riscv_fir_sparse_init_q31( + riscv_fir_sparse_instance_q31 * S, + uint16_t numTaps, + const q31_t * pCoeffs, + q31_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q15 sparse FIR filter. + * @param[in] S points to an instance of the Q15 sparse FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] pScratchIn points to a temporary buffer of size blockSize. + * @param[in] pScratchOut points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_sparse_q15( + riscv_fir_sparse_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + q15_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q15 sparse FIR filter. + * @param[in,out] S points to an instance of the Q15 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] pCoeffs points to the array of filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + */ + void riscv_fir_sparse_init_q15( + riscv_fir_sparse_instance_q15 * S, + uint16_t numTaps, + const q15_t * pCoeffs, + q15_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize); + + + /** + * @brief Processing function for the Q7 sparse FIR filter. + * @param[in] S points to an instance of the Q7 sparse FIR structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] pScratchIn points to a temporary buffer of size blockSize. + * @param[in] pScratchOut points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + */ + void riscv_fir_sparse_q7( + riscv_fir_sparse_instance_q7 * S, + const q7_t * pSrc, + q7_t * pDst, + q7_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize); + + + /** + * @brief Initialization function for the Q7 sparse FIR filter. + * @param[in,out] S points to an instance of the Q7 sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] pCoeffs points to the array of filter coefficients. + * @param[in] pState points to the state buffer. + * @param[in] pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + */ + void riscv_fir_sparse_init_q7( + riscv_fir_sparse_instance_q7 * S, + uint16_t numTaps, + const q7_t * pCoeffs, + q7_t * pState, + int32_t * pTapDelay, + uint16_t maxDelay, + uint32_t blockSize); + + + /** + * @brief Floating-point sin_cos function. + * @param[in] theta input value in degrees + * @param[out] pSinVal points to the processed sine output. + * @param[out] pCosVal points to the processed cos output. + */ + void riscv_sin_cos_f32( + float32_t theta, + float32_t * pSinVal, + float32_t * pCosVal); + + + /** + * @brief Q31 sin_cos function. + * @param[in] theta scaled input value in degrees + * @param[out] pSinVal points to the processed sine output. + * @param[out] pCosVal points to the processed cosine output. + */ + void riscv_sin_cos_q31( + q31_t theta, + q31_t * pSinVal, + q31_t * pCosVal); + + + /** + * @brief Floating-point complex conjugate. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void riscv_cmplx_conj_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t numSamples); + + /** + * @brief Q31 complex conjugate. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void riscv_cmplx_conj_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q15 complex conjugate. + * @param[in] pSrc points to the input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void riscv_cmplx_conj_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t numSamples); + + + /** + * @brief Floating-point complex magnitude squared + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void riscv_cmplx_mag_squared_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q31 complex magnitude squared + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void riscv_cmplx_mag_squared_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q15 complex magnitude squared + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void riscv_cmplx_mag_squared_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t numSamples); + + + /** + * @ingroup groupController + */ + + /** + * @defgroup PID PID Motor Control + * + * A Proportional Integral Derivative (PID) controller is a generic feedback control + * loop mechanism widely used in industrial control systems. + * A PID controller is the most commonly used type of feedback controller. + * + * This set of functions implements (PID) controllers + * for Q15, Q31, and floating-point data types. The functions operate on a single sample + * of data and each call to the function returns a single processed value. + * S points to an instance of the PID control data structure. in + * is the input sample value. The functions return the output value. + * + * \par Algorithm: + *
+   *    y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]
+   *    A0 = Kp + Ki + Kd
+   *    A1 = (-Kp ) - (2 * Kd )
+   *    A2 = Kd
+   * 
+ * + * \par + * where \c Kp is proportional constant, \c Ki is Integral constant and \c Kd is Derivative constant + * + * \par + * \image html PID.png "Proportional Integral Derivative Controller" + * + * \par + * The PID controller calculates an "error" value as the difference between + * the measured output and the reference input. + * The controller attempts to minimize the error by adjusting the process control inputs. + * The proportional value determines the reaction to the current error, + * the integral value determines the reaction based on the sum of recent errors, + * and the derivative value determines the reaction based on the rate at which the error has been changing. + * + * \par Instance Structure + * The Gains A0, A1, A2 and state variables for a PID controller are stored together in an instance data structure. + * A separate instance structure must be defined for each PID Controller. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Reset Functions + * There is also an associated reset function for each data type which clears the state array. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Initializes the Gains A0, A1, A2 from Kp,Ki, Kd gains. + * - Zeros out the values in the state buffer. + * + * \par + * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function. + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the PID Controller functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + + /** + * @addtogroup PID + * @{ + */ + + /** + * @brief Process function for the floating-point PID Control. + * @param[in,out] S is an instance of the floating-point PID Control structure + * @param[in] in input sample to process + * @return processed output sample. + */ + __STATIC_FORCEINLINE float32_t riscv_pid_f32( + riscv_pid_instance_f32 * S, + float32_t in) + { + float32_t out; + + /* y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2] */ + out = (S->A0 * in) + + (S->A1 * S->state[0]) + (S->A2 * S->state[1]) + (S->state[2]); + + /* Update state */ + S->state[1] = S->state[0]; + S->state[0] = in; + S->state[2] = out; + + /* return to application */ + return (out); + + } + +/** + @brief Process function for the Q31 PID Control. + @param[in,out] S points to an instance of the Q31 PID Control structure + @param[in] in input sample to process + @return processed output sample. + + \par Scaling and Overflow Behavior + The function is implemented using an internal 64-bit accumulator. + The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + Thus, if the accumulator result overflows it wraps around rather than clip. + In order to avoid overflows completely the input signal must be scaled down by 2 bits as there are four additions. + After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format. + */ +__STATIC_FORCEINLINE q31_t riscv_pid_q31( + riscv_pid_instance_q31 * S, + q31_t in) + { + q63_t acc; + q31_t out; + + /* acc = A0 * x[n] */ + acc = (q63_t) S->A0 * in; + + /* acc += A1 * x[n-1] */ + acc += (q63_t) S->A1 * S->state[0]; + + /* acc += A2 * x[n-2] */ + acc += (q63_t) S->A2 * S->state[1]; + + /* convert output to 1.31 format to add y[n-1] */ + out = (q31_t) (acc >> 31U); + + /* out += y[n-1] */ + out += S->state[2]; + + /* Update state */ + S->state[1] = S->state[0]; + S->state[0] = in; + S->state[2] = out; + + /* return to application */ + return (out); + } + + +/** + @brief Process function for the Q15 PID Control. + @param[in,out] S points to an instance of the Q15 PID Control structure + @param[in] in input sample to process + @return processed output sample. + + \par Scaling and Overflow Behavior + The function is implemented using a 64-bit internal accumulator. + Both Gains and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + Lastly, the accumulator is saturated to yield a result in 1.15 format. + */ +__STATIC_FORCEINLINE q15_t riscv_pid_q15( + riscv_pid_instance_q15 * S, + q15_t in) + { + q63_t acc; + q15_t out; + +#if defined (RISCV_MATH_DSP) + /* Implementation of PID controller */ + + /* acc = A0 * x[n] */ + acc = (q31_t) __RV_KMDA((uint32_t)S->A0, (uint32_t)in); + + /* acc += A1 * x[n-1] + A2 * x[n-2] */ + acc = (q63_t)__RV_SMALDA((uint64_t)acc, (uint32_t)S->A1, (uint32_t)read_q15x2 (S->state)); +#else + /* acc = A0 * x[n] */ + acc = ((q31_t) S->A0) * in; + + /* acc += A1 * x[n-1] + A2 * x[n-2] */ + acc += (q31_t) S->A1 * S->state[0]; + acc += (q31_t) S->A2 * S->state[1]; +#endif + + /* acc += y[n-1] */ + acc += (q31_t) S->state[2] << 15; + + /* saturate the output */ + out = (q15_t) (__SSAT((acc >> 15), 16)); + + /* Update state */ + S->state[1] = S->state[0]; + S->state[0] = in; + S->state[2] = out; + + /* return to application */ + return (out); + } + + /** + * @} end of PID group + */ + + + /** + * @brief Floating-point matrix inverse. + * @param[in] src points to the instance of the input floating-point matrix structure. + * @param[out] dst points to the instance of the output floating-point matrix structure. + * @return The function returns RISCV_MATH_SIZE_MISMATCH, if the dimensions do not match. + * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status RISCV_MATH_SINGULAR. + */ + riscv_status riscv_mat_inverse_f32( + const riscv_matrix_instance_f32 * src, + riscv_matrix_instance_f32 * dst); + + + /** + * @brief Floating-point matrix inverse. + * @param[in] src points to the instance of the input floating-point matrix structure. + * @param[out] dst points to the instance of the output floating-point matrix structure. + * @return The function returns RISCV_MATH_SIZE_MISMATCH, if the dimensions do not match. + * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status RISCV_MATH_SINGULAR. + */ + riscv_status riscv_mat_inverse_f64( + const riscv_matrix_instance_f64 * src, + riscv_matrix_instance_f64 * dst); + + + + /** + * @ingroup groupController + */ + + /** + * @defgroup clarke Vector Clarke Transform + * Forward Clarke transform converts the instantaneous stator phases into a two-coordinate time invariant vector. + * Generally the Clarke transform uses three-phase currents Ia, Ib and Ic to calculate currents + * in the two-phase orthogonal stator axis Ialpha and Ibeta. + * When Ialpha is superposed with Ia as shown in the figure below + * \image html clarke.png Stator current space vector and its components in (a,b). + * and Ia + Ib + Ic = 0, in this condition Ialpha and Ibeta + * can be calculated using only Ia and Ib. + * + * The function operates on a single sample of data and each call to the function returns the processed output. + * The library provides separate functions for Q31 and floating-point data types. + * \par Algorithm + * \image html clarkeFormula.png + * where Ia and Ib are the instantaneous stator phases and + * pIalpha and pIbeta are the two coordinates of time invariant vector. + * \par Fixed-Point Behavior + * Care must be taken when using the Q31 version of the Clarke transform. + * In particular, the overflow and saturation behavior of the accumulator used must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + + /** + * @addtogroup clarke + * @{ + */ + + /** + * + * @brief Floating-point Clarke transform + * @param[in] Ia input three-phase coordinate a + * @param[in] Ib input three-phase coordinate b + * @param[out] pIalpha points to output two-phase orthogonal vector axis alpha + * @param[out] pIbeta points to output two-phase orthogonal vector axis beta + * @return none + */ + __STATIC_FORCEINLINE void riscv_clarke_f32( + float32_t Ia, + float32_t Ib, + float32_t * pIalpha, + float32_t * pIbeta) + { + /* Calculate pIalpha using the equation, pIalpha = Ia */ + *pIalpha = Ia; + + /* Calculate pIbeta using the equation, pIbeta = (1/sqrt(3)) * Ia + (2/sqrt(3)) * Ib */ + *pIbeta = ((float32_t) 0.57735026919 * Ia + (float32_t) 1.15470053838 * Ib); + } + + +/** + @brief Clarke transform for Q31 version + @param[in] Ia input three-phase coordinate a + @param[in] Ib input three-phase coordinate b + @param[out] pIalpha points to output two-phase orthogonal vector axis alpha + @param[out] pIbeta points to output two-phase orthogonal vector axis beta + @return none + + \par Scaling and Overflow Behavior + The function is implemented using an internal 32-bit accumulator. + The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format. + There is saturation on the addition, hence there is no risk of overflow. + */ +__STATIC_FORCEINLINE void riscv_clarke_q31( + q31_t Ia, + q31_t Ib, + q31_t * pIalpha, + q31_t * pIbeta) + { + q31_t product1, product2; /* Temporary variables used to store intermediate results */ + + /* Calculating pIalpha from Ia by equation pIalpha = Ia */ + *pIalpha = Ia; + + /* Intermediate product is calculated by (1/(sqrt(3)) * Ia) */ + product1 = (q31_t) (((q63_t) Ia * 0x24F34E8B) >> 30); + + /* Intermediate product is calculated by (2/sqrt(3) * Ib) */ + product2 = (q31_t) (((q63_t) Ib * 0x49E69D16) >> 30); + + /* pIbeta is calculated by adding the intermediate products */ + *pIbeta = __QADD(product1, product2); + } + + /** + * @} end of clarke group + */ + + + /** + * @ingroup groupController + */ + + /** + * @defgroup inv_clarke Vector Inverse Clarke Transform + * Inverse Clarke transform converts the two-coordinate time invariant vector into instantaneous stator phases. + * + * The function operates on a single sample of data and each call to the function returns the processed output. + * The library provides separate functions for Q31 and floating-point data types. + * \par Algorithm + * \image html clarkeInvFormula.png + * where pIa and pIb are the instantaneous stator phases and + * Ialpha and Ibeta are the two coordinates of time invariant vector. + * \par Fixed-Point Behavior + * Care must be taken when using the Q31 version of the Clarke transform. + * In particular, the overflow and saturation behavior of the accumulator used must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + + /** + * @addtogroup inv_clarke + * @{ + */ + + /** + * @brief Floating-point Inverse Clarke transform + * @param[in] Ialpha input two-phase orthogonal vector axis alpha + * @param[in] Ibeta input two-phase orthogonal vector axis beta + * @param[out] pIa points to output three-phase coordinate a + * @param[out] pIb points to output three-phase coordinate b + * @return none + */ + __STATIC_FORCEINLINE void riscv_inv_clarke_f32( + float32_t Ialpha, + float32_t Ibeta, + float32_t * pIa, + float32_t * pIb) + { + /* Calculating pIa from Ialpha by equation pIa = Ialpha */ + *pIa = Ialpha; + + /* Calculating pIb from Ialpha and Ibeta by equation pIb = -(1/2) * Ialpha + (sqrt(3)/2) * Ibeta */ + *pIb = -0.5f * Ialpha + 0.8660254039f * Ibeta; + } + + +/** + @brief Inverse Clarke transform for Q31 version + @param[in] Ialpha input two-phase orthogonal vector axis alpha + @param[in] Ibeta input two-phase orthogonal vector axis beta + @param[out] pIa points to output three-phase coordinate a + @param[out] pIb points to output three-phase coordinate b + @return none + + \par Scaling and Overflow Behavior + The function is implemented using an internal 32-bit accumulator. + The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format. + There is saturation on the subtraction, hence there is no risk of overflow. + */ +__STATIC_FORCEINLINE void riscv_inv_clarke_q31( + q31_t Ialpha, + q31_t Ibeta, + q31_t * pIa, + q31_t * pIb) + { + q31_t product1, product2; /* Temporary variables used to store intermediate results */ + + /* Calculating pIa from Ialpha by equation pIa = Ialpha */ + *pIa = Ialpha; + + /* Intermediate product is calculated by (1/(2*sqrt(3)) * Ia) */ + product1 = (q31_t) (((q63_t) (Ialpha) * (0x40000000)) >> 31); + + /* Intermediate product is calculated by (1/sqrt(3) * pIb) */ + product2 = (q31_t) (((q63_t) (Ibeta) * (0x6ED9EBA1)) >> 31); + + /* pIb is calculated by subtracting the products */ + *pIb = __QSUB(product2, product1); + } + + /** + * @} end of inv_clarke group + */ + + + + /** + * @ingroup groupController + */ + + /** + * @defgroup park Vector Park Transform + * + * Forward Park transform converts the input two-coordinate vector to flux and torque components. + * The Park transform can be used to realize the transformation of the Ialpha and the Ibeta currents + * from the stationary to the moving reference frame and control the spatial relationship between + * the stator vector current and rotor flux vector. + * If we consider the d axis aligned with the rotor flux, the diagram below shows the + * current vector and the relationship from the two reference frames: + * \image html park.png "Stator current space vector and its component in (a,b) and in the d,q rotating reference frame" + * + * The function operates on a single sample of data and each call to the function returns the processed output. + * The library provides separate functions for Q31 and floating-point data types. + * \par Algorithm + * \image html parkFormula.png + * where Ialpha and Ibeta are the stator vector components, + * pId and pIq are rotor vector components and cosVal and sinVal are the + * cosine and sine values of theta (rotor flux position). + * \par Fixed-Point Behavior + * Care must be taken when using the Q31 version of the Park transform. + * In particular, the overflow and saturation behavior of the accumulator used must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + + /** + * @addtogroup park + * @{ + */ + + /** + * @brief Floating-point Park transform + * @param[in] Ialpha input two-phase vector coordinate alpha + * @param[in] Ibeta input two-phase vector coordinate beta + * @param[out] pId points to output rotor reference frame d + * @param[out] pIq points to output rotor reference frame q + * @param[in] sinVal sine value of rotation angle theta + * @param[in] cosVal cosine value of rotation angle theta + * @return none + * + * The function implements the forward Park transform. + * + */ + __STATIC_FORCEINLINE void riscv_park_f32( + float32_t Ialpha, + float32_t Ibeta, + float32_t * pId, + float32_t * pIq, + float32_t sinVal, + float32_t cosVal) + { + /* Calculate pId using the equation, pId = Ialpha * cosVal + Ibeta * sinVal */ + *pId = Ialpha * cosVal + Ibeta * sinVal; + + /* Calculate pIq using the equation, pIq = - Ialpha * sinVal + Ibeta * cosVal */ + *pIq = -Ialpha * sinVal + Ibeta * cosVal; + } + + +/** + @brief Park transform for Q31 version + @param[in] Ialpha input two-phase vector coordinate alpha + @param[in] Ibeta input two-phase vector coordinate beta + @param[out] pId points to output rotor reference frame d + @param[out] pIq points to output rotor reference frame q + @param[in] sinVal sine value of rotation angle theta + @param[in] cosVal cosine value of rotation angle theta + @return none + + \par Scaling and Overflow Behavior + The function is implemented using an internal 32-bit accumulator. + The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format. + There is saturation on the addition and subtraction, hence there is no risk of overflow. + */ +__STATIC_FORCEINLINE void riscv_park_q31( + q31_t Ialpha, + q31_t Ibeta, + q31_t * pId, + q31_t * pIq, + q31_t sinVal, + q31_t cosVal) + { + q31_t product1, product2; /* Temporary variables used to store intermediate results */ + q31_t product3, product4; /* Temporary variables used to store intermediate results */ + + /* Intermediate product is calculated by (Ialpha * cosVal) */ + product1 = (q31_t) (((q63_t) (Ialpha) * (cosVal)) >> 31); + + /* Intermediate product is calculated by (Ibeta * sinVal) */ + product2 = (q31_t) (((q63_t) (Ibeta) * (sinVal)) >> 31); + + + /* Intermediate product is calculated by (Ialpha * sinVal) */ + product3 = (q31_t) (((q63_t) (Ialpha) * (sinVal)) >> 31); + + /* Intermediate product is calculated by (Ibeta * cosVal) */ + product4 = (q31_t) (((q63_t) (Ibeta) * (cosVal)) >> 31); + + /* Calculate pId by adding the two intermediate products 1 and 2 */ + *pId = __QADD(product1, product2); + + /* Calculate pIq by subtracting the two intermediate products 3 from 4 */ + *pIq = __QSUB(product4, product3); + } + + /** + * @} end of park group + */ + + + /** + * @ingroup groupController + */ + + /** + * @defgroup inv_park Vector Inverse Park transform + * Inverse Park transform converts the input flux and torque components to two-coordinate vector. + * + * The function operates on a single sample of data and each call to the function returns the processed output. + * The library provides separate functions for Q31 and floating-point data types. + * \par Algorithm + * \image html parkInvFormula.png + * where pIalpha and pIbeta are the stator vector components, + * Id and Iq are rotor vector components and cosVal and sinVal are the + * cosine and sine values of theta (rotor flux position). + * \par Fixed-Point Behavior + * Care must be taken when using the Q31 version of the Park transform. + * In particular, the overflow and saturation behavior of the accumulator used must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + + /** + * @addtogroup inv_park + * @{ + */ + + /** + * @brief Floating-point Inverse Park transform + * @param[in] Id input coordinate of rotor reference frame d + * @param[in] Iq input coordinate of rotor reference frame q + * @param[out] pIalpha points to output two-phase orthogonal vector axis alpha + * @param[out] pIbeta points to output two-phase orthogonal vector axis beta + * @param[in] sinVal sine value of rotation angle theta + * @param[in] cosVal cosine value of rotation angle theta + * @return none + */ + __STATIC_FORCEINLINE void riscv_inv_park_f32( + float32_t Id, + float32_t Iq, + float32_t * pIalpha, + float32_t * pIbeta, + float32_t sinVal, + float32_t cosVal) + { + /* Calculate pIalpha using the equation, pIalpha = Id * cosVal - Iq * sinVal */ + *pIalpha = Id * cosVal - Iq * sinVal; + + /* Calculate pIbeta using the equation, pIbeta = Id * sinVal + Iq * cosVal */ + *pIbeta = Id * sinVal + Iq * cosVal; + } + + +/** + @brief Inverse Park transform for Q31 version + @param[in] Id input coordinate of rotor reference frame d + @param[in] Iq input coordinate of rotor reference frame q + @param[out] pIalpha points to output two-phase orthogonal vector axis alpha + @param[out] pIbeta points to output two-phase orthogonal vector axis beta + @param[in] sinVal sine value of rotation angle theta + @param[in] cosVal cosine value of rotation angle theta + @return none + + @par Scaling and Overflow Behavior + The function is implemented using an internal 32-bit accumulator. + The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format. + There is saturation on the addition, hence there is no risk of overflow. + */ +__STATIC_FORCEINLINE void riscv_inv_park_q31( + q31_t Id, + q31_t Iq, + q31_t * pIalpha, + q31_t * pIbeta, + q31_t sinVal, + q31_t cosVal) + { + q31_t product1, product2; /* Temporary variables used to store intermediate results */ + q31_t product3, product4; /* Temporary variables used to store intermediate results */ + + /* Intermediate product is calculated by (Id * cosVal) */ + product1 = (q31_t) (((q63_t) (Id) * (cosVal)) >> 31); + + /* Intermediate product is calculated by (Iq * sinVal) */ + product2 = (q31_t) (((q63_t) (Iq) * (sinVal)) >> 31); + + + /* Intermediate product is calculated by (Id * sinVal) */ + product3 = (q31_t) (((q63_t) (Id) * (sinVal)) >> 31); + + /* Intermediate product is calculated by (Iq * cosVal) */ + product4 = (q31_t) (((q63_t) (Iq) * (cosVal)) >> 31); + + /* Calculate pIalpha by using the two intermediate products 1 and 2 */ + *pIalpha = __QSUB(product1, product2); + + /* Calculate pIbeta by using the two intermediate products 3 and 4 */ + *pIbeta = __QADD(product4, product3); + } + + /** + * @} end of Inverse park group + */ + + + /** + * @ingroup groupInterpolation + */ + + /** + * @defgroup LinearInterpolate Linear Interpolation + * + * Linear interpolation is a method of curve fitting using linear polynomials. + * Linear interpolation works by effectively drawing a straight line between two neighboring samples and returning the appropriate point along that line + * + * \par + * \image html LinearInterp.png "Linear interpolation" + * + * \par + * A Linear Interpolate function calculates an output value(y), for the input(x) + * using linear interpolation of the input values x0, x1( nearest input values) and the output values y0 and y1(nearest output values) + * + * \par Algorithm: + *
+   *       y = y0 + (x - x0) * ((y1 - y0)/(x1-x0))
+   *       where x0, x1 are nearest values of input x
+   *             y0, y1 are nearest values to output y
+   * 
+ * + * \par + * This set of functions implements Linear interpolation process + * for Q7, Q15, Q31, and floating-point data types. The functions operate on a single + * sample of data and each call to the function returns a single processed value. + * S points to an instance of the Linear Interpolate function data structure. + * x is the input sample value. The functions returns the output value. + * + * \par + * if x is outside of the table boundary, Linear interpolation returns first value of the table + * if x is below input range and returns last value of table if x is above range. + */ + + /** + * @addtogroup LinearInterpolate + * @{ + */ + + /** + * @brief Process function for the floating-point Linear Interpolation Function. + * @param[in,out] S is an instance of the floating-point Linear Interpolation structure + * @param[in] x input sample to process + * @return y processed output sample. + * + */ + __STATIC_FORCEINLINE float32_t riscv_linear_interp_f32( + riscv_linear_interp_instance_f32 * S, + float32_t x) + { + float32_t y; + float32_t x0, x1; /* Nearest input values */ + float32_t y0, y1; /* Nearest output values */ + float32_t xSpacing = S->xSpacing; /* spacing between input values */ + int32_t i; /* Index variable */ + float32_t *pYData = S->pYData; /* pointer to output table */ + + /* Calculation of index */ + i = (int32_t) ((x - S->x1) / xSpacing); + + if (i < 0) + { + /* Iniatilize output for below specified range as least output value of table */ + y = pYData[0]; + } + else if ((uint32_t)i >= (S->nValues - 1)) + { + /* Iniatilize output for above specified range as last output value of table */ + y = pYData[S->nValues - 1]; + } + else + { + /* Calculation of nearest input values */ + x0 = S->x1 + i * xSpacing; + x1 = S->x1 + (i + 1) * xSpacing; + + /* Read of nearest output values */ + y0 = pYData[i]; + y1 = pYData[i + 1]; + + /* Calculation of output */ + y = y0 + (x - x0) * ((y1 - y0) / (x1 - x0)); + + } + + /* returns output value */ + return (y); + } + + + /** + * + * @brief Process function for the Q31 Linear Interpolation Function. + * @param[in] pYData pointer to Q31 Linear Interpolation table + * @param[in] x input sample to process + * @param[in] nValues number of table values + * @return y processed output sample. + * + * \par + * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part. + * This function can support maximum of table size 2^12. + * + */ + __STATIC_FORCEINLINE q31_t riscv_linear_interp_q31( + q31_t * pYData, + q31_t x, + uint32_t nValues) + { + q31_t y; /* output */ + q31_t y0, y1; /* Nearest output values */ + q31_t fract; /* fractional part */ + int32_t index; /* Index to read nearest output values */ + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + index = ((x & (q31_t)0xFFF00000) >> 20); + + if (index >= (int32_t)(nValues - 1)) + { + return (pYData[nValues - 1]); + } + else if (index < 0) + { + return (pYData[0]); + } + else + { + /* 20 bits for the fractional part */ + /* shift left by 11 to keep fract in 1.31 format */ + fract = (x & 0x000FFFFF) << 11; + + /* Read two nearest output values from the index in 1.31(q31) format */ + y0 = pYData[index]; + y1 = pYData[index + 1]; + + /* Calculation of y0 * (1-fract) and y is in 2.30 format */ + y = ((q31_t) ((q63_t) y0 * (0x7FFFFFFF - fract) >> 32)); + + /* Calculation of y0 * (1-fract) + y1 *fract and y is in 2.30 format */ + y += ((q31_t) (((q63_t) y1 * fract) >> 32)); + + /* Convert y to 1.31 format */ + return (y << 1U); + } + } + + + /** + * + * @brief Process function for the Q15 Linear Interpolation Function. + * @param[in] pYData pointer to Q15 Linear Interpolation table + * @param[in] x input sample to process + * @param[in] nValues number of table values + * @return y processed output sample. + * + * \par + * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part. + * This function can support maximum of table size 2^12. + * + */ + __STATIC_FORCEINLINE q15_t riscv_linear_interp_q15( + q15_t * pYData, + q31_t x, + uint32_t nValues) + { + q63_t y; /* output */ + q15_t y0, y1; /* Nearest output values */ + q31_t fract; /* fractional part */ + int32_t index; /* Index to read nearest output values */ + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + index = ((x & (int32_t)0xFFF00000) >> 20); + + if (index >= (int32_t)(nValues - 1)) + { + return (pYData[nValues - 1]); + } + else if (index < 0) + { + return (pYData[0]); + } + else + { + /* 20 bits for the fractional part */ + /* fract is in 12.20 format */ + fract = (x & 0x000FFFFF); + + /* Read two nearest output values from the index */ + y0 = pYData[index]; + y1 = pYData[index + 1]; + + /* Calculation of y0 * (1-fract) and y is in 13.35 format */ + y = ((q63_t) y0 * (0xFFFFF - fract)); + + /* Calculation of (y0 * (1-fract) + y1 * fract) and y is in 13.35 format */ + y += ((q63_t) y1 * (fract)); + + /* convert y to 1.15 format */ + return (q15_t) (y >> 20); + } + } + + + /** + * + * @brief Process function for the Q7 Linear Interpolation Function. + * @param[in] pYData pointer to Q7 Linear Interpolation table + * @param[in] x input sample to process + * @param[in] nValues number of table values + * @return y processed output sample. + * + * \par + * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part. + * This function can support maximum of table size 2^12. + */ + __STATIC_FORCEINLINE q7_t riscv_linear_interp_q7( + q7_t * pYData, + q31_t x, + uint32_t nValues) + { + q31_t y; /* output */ + q7_t y0, y1; /* Nearest output values */ + q31_t fract; /* fractional part */ + uint32_t index; /* Index to read nearest output values */ + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + if (x < 0) + { + return (pYData[0]); + } + index = (x >> 20) & 0xfff; + + if (index >= (nValues - 1)) + { + return (pYData[nValues - 1]); + } + else + { + /* 20 bits for the fractional part */ + /* fract is in 12.20 format */ + fract = (x & 0x000FFFFF); + + /* Read two nearest output values from the index and are in 1.7(q7) format */ + y0 = pYData[index]; + y1 = pYData[index + 1]; + + /* Calculation of y0 * (1-fract ) and y is in 13.27(q27) format */ + y = ((y0 * (0xFFFFF - fract))); + + /* Calculation of y1 * fract + y0 * (1-fract) and y is in 13.27(q27) format */ + y += (y1 * fract); + + /* convert y to 1.7(q7) format */ + return (q7_t) (y >> 20); + } + } + + /** + * @} end of LinearInterpolate group + */ + + /** + * @brief Fast approximation to the trigonometric sine function for floating-point data. + * @param[in] x input value in radians. + * @return sin(x). + */ + float32_t riscv_sin_f32( + float32_t x); + + + /** + * @brief Fast approximation to the trigonometric sine function for Q31 data. + * @param[in] x Scaled input value in radians. + * @return sin(x). + */ + q31_t riscv_sin_q31( + q31_t x); + + + /** + * @brief Fast approximation to the trigonometric sine function for Q15 data. + * @param[in] x Scaled input value in radians. + * @return sin(x). + */ + q15_t riscv_sin_q15( + q15_t x); + + + /** + * @brief Fast approximation to the trigonometric cosine function for floating-point data. + * @param[in] x input value in radians. + * @return cos(x). + */ + float32_t riscv_cos_f32( + float32_t x); + + + /** + * @brief Fast approximation to the trigonometric cosine function for Q31 data. + * @param[in] x Scaled input value in radians. + * @return cos(x). + */ + q31_t riscv_cos_q31( + q31_t x); + + + /** + * @brief Fast approximation to the trigonometric cosine function for Q15 data. + * @param[in] x Scaled input value in radians. + * @return cos(x). + */ + q15_t riscv_cos_q15( + q15_t x); + + + /** + * @ingroup groupFastMath + */ + + + /** + * @defgroup SQRT Square Root + * + * Computes the square root of a number. + * There are separate functions for Q15, Q31, and floating-point data types. + * The square root function is computed using the Newton-Raphson algorithm. + * This is an iterative algorithm of the form: + *
+   *      x1 = x0 - f(x0)/f'(x0)
+   * 
+ * where x1 is the current estimate, + * x0 is the previous estimate, and + * f'(x0) is the derivative of f() evaluated at x0. + * For the square root function, the algorithm reduces to: + *
+   *     x0 = in/2                         [initial guess]
+   *     x1 = 1/2 * ( x0 + in / x0)        [each iteration]
+   * 
+ */ + + + /** + * @addtogroup SQRT + * @{ + */ + +/** + @brief Floating-point square root function. + @param[in] in input value + @param[out] pOut square root of input value + @return execution status + - \ref RISCV_MATH_SUCCESS : input value is positive + - \ref RISCV_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0 + */ +__STATIC_FORCEINLINE riscv_status riscv_sqrt_f32( + float32_t in, + float32_t * pOut) + { + + if (in >= 0.0f) + { +#if defined ( __riscv_flen ) + __ASM volatile("fsqrt.s %0, %1" : "=f"(*pOut) : "f"(in)); +#else + *pOut = sqrtf(in); +#endif /*__riscv_flen*/ + + return (RISCV_MATH_SUCCESS); + } + else + { + *pOut = 0.0f; + return (RISCV_MATH_ARGUMENT_ERROR); + } + } + + +/** + @brief Q31 square root function. + @param[in] in input value. The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF + @param[out] pOut points to square root of input value + @return execution status + - \ref RISCV_MATH_SUCCESS : input value is positive + - \ref RISCV_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0 + */ +riscv_status riscv_sqrt_q31( + q31_t in, + q31_t * pOut); + + +/** + @brief Q15 square root function. + @param[in] in input value. The range of the input value is [0 +1) or 0x0000 to 0x7FFF + @param[out] pOut points to square root of input value + @return execution status + - \ref RISCV_MATH_SUCCESS : input value is positive + - \ref RISCV_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0 + */ +riscv_status riscv_sqrt_q15( + q15_t in, + q15_t * pOut); + + /** + * @brief Vector Floating-point square root function. + * @param[in] pIn input vector. + * @param[out] pOut vector of square roots of input elements. + * @param[in] len length of input vector. + * @return The function returns RISCV_MATH_SUCCESS if input value is positive value or RISCV_MATH_ARGUMENT_ERROR if + * in is negative value and returns zero output for negative values. + */ + void riscv_vsqrt_f32( + float32_t * pIn, + float32_t * pOut, + uint16_t len); + + void riscv_vsqrt_q31( + q31_t * pIn, + q31_t * pOut, + uint16_t len); + + void riscv_vsqrt_q15( + q15_t * pIn, + q15_t * pOut, + uint16_t len); + + /** + * @} end of SQRT group + */ + + + /** + * @brief floating-point Circular write function. + */ + __STATIC_FORCEINLINE void riscv_circularWrite_f32( + int32_t * circBuffer, + int32_t L, + uint16_t * writeOffset, + int32_t bufferInc, + const int32_t * src, + int32_t srcInc, + uint32_t blockSize) + { + uint32_t i = 0U; + int32_t wOffset; + + /* Copy the value of Index pointer that points + * to the current location where the input samples to be copied */ + wOffset = *writeOffset; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the input sample to the circular buffer */ + circBuffer[wOffset] = *src; + + /* Update the input pointer */ + src += srcInc; + + /* Circularly update wOffset. Watch out for positive and negative value */ + wOffset += bufferInc; + if (wOffset >= L) + wOffset -= L; + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *writeOffset = (uint16_t)wOffset; + } + + + + /** + * @brief floating-point Circular Read function. + */ + __STATIC_FORCEINLINE void riscv_circularRead_f32( + int32_t * circBuffer, + int32_t L, + int32_t * readOffset, + int32_t bufferInc, + int32_t * dst, + int32_t * dst_base, + int32_t dst_length, + int32_t dstInc, + uint32_t blockSize) + { + uint32_t i = 0U; + int32_t rOffset; + int32_t* dst_end; + + /* Copy the value of Index pointer that points + * to the current location from where the input samples to be read */ + rOffset = *readOffset; + dst_end = dst_base + dst_length; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the sample from the circular buffer to the destination buffer */ + *dst = circBuffer[rOffset]; + + /* Update the input pointer */ + dst += dstInc; + + if (dst == dst_end) + { + dst = dst_base; + } + + /* Circularly update rOffset. Watch out for positive and negative value */ + rOffset += bufferInc; + + if (rOffset >= L) + { + rOffset -= L; + } + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *readOffset = rOffset; + } + + + /** + * @brief Q15 Circular write function. + */ + __STATIC_FORCEINLINE void riscv_circularWrite_q15( + q15_t * circBuffer, + int32_t L, + uint16_t * writeOffset, + int32_t bufferInc, + const q15_t * src, + int32_t srcInc, + uint32_t blockSize) + { + uint32_t i = 0U; + int32_t wOffset; + + /* Copy the value of Index pointer that points + * to the current location where the input samples to be copied */ + wOffset = *writeOffset; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the input sample to the circular buffer */ + circBuffer[wOffset] = *src; + + /* Update the input pointer */ + src += srcInc; + + /* Circularly update wOffset. Watch out for positive and negative value */ + wOffset += bufferInc; + if (wOffset >= L) + wOffset -= L; + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *writeOffset = (uint16_t)wOffset; + } + + + /** + * @brief Q15 Circular Read function. + */ + __STATIC_FORCEINLINE void riscv_circularRead_q15( + q15_t * circBuffer, + int32_t L, + int32_t * readOffset, + int32_t bufferInc, + q15_t * dst, + q15_t * dst_base, + int32_t dst_length, + int32_t dstInc, + uint32_t blockSize) + { + uint32_t i = 0; + int32_t rOffset; + q15_t* dst_end; + + /* Copy the value of Index pointer that points + * to the current location from where the input samples to be read */ + rOffset = *readOffset; + + dst_end = dst_base + dst_length; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the sample from the circular buffer to the destination buffer */ + *dst = circBuffer[rOffset]; + + /* Update the input pointer */ + dst += dstInc; + + if (dst == dst_end) + { + dst = dst_base; + } + + /* Circularly update wOffset. Watch out for positive and negative value */ + rOffset += bufferInc; + + if (rOffset >= L) + { + rOffset -= L; + } + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *readOffset = rOffset; + } + + + /** + * @brief Q7 Circular write function. + */ + __STATIC_FORCEINLINE void riscv_circularWrite_q7( + q7_t * circBuffer, + int32_t L, + uint16_t * writeOffset, + int32_t bufferInc, + const q7_t * src, + int32_t srcInc, + uint32_t blockSize) + { + uint32_t i = 0U; + int32_t wOffset; + + /* Copy the value of Index pointer that points + * to the current location where the input samples to be copied */ + wOffset = *writeOffset; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the input sample to the circular buffer */ + circBuffer[wOffset] = *src; + + /* Update the input pointer */ + src += srcInc; + + /* Circularly update wOffset. Watch out for positive and negative value */ + wOffset += bufferInc; + if (wOffset >= L) + wOffset -= L; + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *writeOffset = (uint16_t)wOffset; + } + + + /** + * @brief Q7 Circular Read function. + */ + __STATIC_FORCEINLINE void riscv_circularRead_q7( + q7_t * circBuffer, + int32_t L, + int32_t * readOffset, + int32_t bufferInc, + q7_t * dst, + q7_t * dst_base, + int32_t dst_length, + int32_t dstInc, + uint32_t blockSize) + { + uint32_t i = 0; + int32_t rOffset; + q7_t* dst_end; + + /* Copy the value of Index pointer that points + * to the current location from where the input samples to be read */ + rOffset = *readOffset; + + dst_end = dst_base + dst_length; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0U) + { + /* copy the sample from the circular buffer to the destination buffer */ + *dst = circBuffer[rOffset]; + + /* Update the input pointer */ + dst += dstInc; + + if (dst == dst_end) + { + dst = dst_base; + } + + /* Circularly update rOffset. Watch out for positive and negative value */ + rOffset += bufferInc; + + if (rOffset >= L) + { + rOffset -= L; + } + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *readOffset = rOffset; + } + + + /** + * @brief Sum of the squares of the elements of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_power_q31( + const q31_t * pSrc, + uint32_t blockSize, + q63_t * pResult); + + + /** + * @brief Sum of the squares of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_power_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Sum of the squares of the elements of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_power_q15( + const q15_t * pSrc, + uint32_t blockSize, + q63_t * pResult); + + + /** + * @brief Sum of the squares of the elements of a Q7 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_power_q7( + const q7_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Mean value of a Q7 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_mean_q7( + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult); + + + /** + * @brief Mean value of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_mean_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult); + + + /** + * @brief Mean value of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_mean_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Mean value of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_mean_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Variance of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_var_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Variance of the elements of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_var_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Variance of the elements of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_var_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult); + + + /** + * @brief Root Mean Square of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_rms_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Root Mean Square of the elements of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_rms_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Root Mean Square of the elements of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_rms_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult); + + + /** + * @brief Standard deviation of the elements of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_std_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult); + + + /** + * @brief Standard deviation of the elements of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_std_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult); + + + /** + * @brief Standard deviation of the elements of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output value. + */ + void riscv_std_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult); + + + /** + * @brief Floating-point complex magnitude + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void riscv_cmplx_mag_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q31 complex magnitude + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void riscv_cmplx_mag_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q15 complex magnitude + * @param[in] pSrc points to the complex input vector + * @param[out] pDst points to the real output vector + * @param[in] numSamples number of complex samples in the input vector + */ + void riscv_cmplx_mag_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q15 complex dot product + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] numSamples number of complex samples in each vector + * @param[out] realResult real part of the result returned here + * @param[out] imagResult imaginary part of the result returned here + */ + void riscv_cmplx_dot_prod_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + uint32_t numSamples, + q31_t * realResult, + q31_t * imagResult); + + + /** + * @brief Q31 complex dot product + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] numSamples number of complex samples in each vector + * @param[out] realResult real part of the result returned here + * @param[out] imagResult imaginary part of the result returned here + */ + void riscv_cmplx_dot_prod_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + uint32_t numSamples, + q63_t * realResult, + q63_t * imagResult); + + + /** + * @brief Floating-point complex dot product + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[in] numSamples number of complex samples in each vector + * @param[out] realResult real part of the result returned here + * @param[out] imagResult imaginary part of the result returned here + */ + void riscv_cmplx_dot_prod_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + uint32_t numSamples, + float32_t * realResult, + float32_t * imagResult); + + + /** + * @brief Q15 complex-by-real multiplication + * @param[in] pSrcCmplx points to the complex input vector + * @param[in] pSrcReal points to the real input vector + * @param[out] pCmplxDst points to the complex output vector + * @param[in] numSamples number of samples in each vector + */ + void riscv_cmplx_mult_real_q15( + const q15_t * pSrcCmplx, + const q15_t * pSrcReal, + q15_t * pCmplxDst, + uint32_t numSamples); + + + /** + * @brief Q31 complex-by-real multiplication + * @param[in] pSrcCmplx points to the complex input vector + * @param[in] pSrcReal points to the real input vector + * @param[out] pCmplxDst points to the complex output vector + * @param[in] numSamples number of samples in each vector + */ + void riscv_cmplx_mult_real_q31( + const q31_t * pSrcCmplx, + const q31_t * pSrcReal, + q31_t * pCmplxDst, + uint32_t numSamples); + + + /** + * @brief Floating-point complex-by-real multiplication + * @param[in] pSrcCmplx points to the complex input vector + * @param[in] pSrcReal points to the real input vector + * @param[out] pCmplxDst points to the complex output vector + * @param[in] numSamples number of samples in each vector + */ + void riscv_cmplx_mult_real_f32( + const float32_t * pSrcCmplx, + const float32_t * pSrcReal, + float32_t * pCmplxDst, + uint32_t numSamples); + + + /** + * @brief Minimum value of a Q7 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] result is output pointer + * @param[in] index is the array index of the minimum value in the input buffer. + */ + void riscv_min_q7( + const q7_t * pSrc, + uint32_t blockSize, + q7_t * result, + uint32_t * index); + + + /** + * @brief Minimum value of a Q15 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output pointer + * @param[in] pIndex is the array index of the minimum value in the input buffer. + */ + void riscv_min_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult, + uint32_t * pIndex); + + + /** + * @brief Minimum value of a Q31 vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output pointer + * @param[out] pIndex is the array index of the minimum value in the input buffer. + */ + void riscv_min_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex); + + + /** + * @brief Minimum value of a floating-point vector. + * @param[in] pSrc is input pointer + * @param[in] blockSize is the number of samples to process + * @param[out] pResult is output pointer + * @param[out] pIndex is the array index of the minimum value in the input buffer. + */ + void riscv_min_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex); + + +/** + * @brief Maximum value of a Q7 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void riscv_max_q7( + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult, + uint32_t * pIndex); + + +/** + * @brief Maximum value of a Q15 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void riscv_max_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult, + uint32_t * pIndex); + + +/** + * @brief Maximum value of a Q31 vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void riscv_max_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex); + + +/** + * @brief Maximum value of a floating-point vector. + * @param[in] pSrc points to the input buffer + * @param[in] blockSize length of the input vector + * @param[out] pResult maximum value returned here + * @param[out] pIndex index of maximum value returned here + */ + void riscv_max_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex); + + + /** + * @brief Q15 complex-by-complex multiplication + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void riscv_cmplx_mult_cmplx_q15( + const q15_t * pSrcA, + const q15_t * pSrcB, + q15_t * pDst, + uint32_t numSamples); + + + /** + * @brief Q31 complex-by-complex multiplication + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void riscv_cmplx_mult_cmplx_q31( + const q31_t * pSrcA, + const q31_t * pSrcB, + q31_t * pDst, + uint32_t numSamples); + + + /** + * @brief Floating-point complex-by-complex multiplication + * @param[in] pSrcA points to the first input vector + * @param[in] pSrcB points to the second input vector + * @param[out] pDst points to the output vector + * @param[in] numSamples number of complex samples in each vector + */ + void riscv_cmplx_mult_cmplx_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + float32_t * pDst, + uint32_t numSamples); + + + /** + * @brief Converts the elements of the floating-point vector to Q31 vector. + * @param[in] pSrc points to the floating-point input vector + * @param[out] pDst points to the Q31 output vector + * @param[in] blockSize length of the input vector + */ + void riscv_float_to_q31( + const float32_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the floating-point vector to Q15 vector. + * @param[in] pSrc points to the floating-point input vector + * @param[out] pDst points to the Q15 output vector + * @param[in] blockSize length of the input vector + */ + void riscv_float_to_q15( + const float32_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the floating-point vector to Q7 vector. + * @param[in] pSrc points to the floating-point input vector + * @param[out] pDst points to the Q7 output vector + * @param[in] blockSize length of the input vector + */ + void riscv_float_to_q7( + const float32_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q31 vector to floating-point vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void riscv_q31_to_float( + const q31_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q31 vector to Q15 vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void riscv_q31_to_q15( + const q31_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q31 vector to Q7 vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void riscv_q31_to_q7( + const q31_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q15 vector to floating-point vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void riscv_q15_to_float( + const q15_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q15 vector to Q31 vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void riscv_q15_to_q31( + const q15_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q15 vector to Q7 vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void riscv_q15_to_q7( + const q15_t * pSrc, + q7_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q7 vector to floating-point vector. + * @param[in] pSrc is input pointer + * @param[out] pDst is output pointer + * @param[in] blockSize is the number of samples to process + */ + void riscv_q7_to_float( + const q7_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q7 vector to Q31 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_q7_to_q31( + const q7_t * pSrc, + q31_t * pDst, + uint32_t blockSize); + + + /** + * @brief Converts the elements of the Q7 vector to Q15 vector. + * @param[in] pSrc input pointer + * @param[out] pDst output pointer + * @param[in] blockSize number of samples to process + */ + void riscv_q7_to_q15( + const q7_t * pSrc, + q15_t * pDst, + uint32_t blockSize); + + + /** + * @ingroup groupInterpolation + */ + + /** + * @defgroup BilinearInterpolate Bilinear Interpolation + * + * Bilinear interpolation is an extension of linear interpolation applied to a two dimensional grid. + * The underlying function f(x, y) is sampled on a regular grid and the interpolation process + * determines values between the grid points. + * Bilinear interpolation is equivalent to two step linear interpolation, first in the x-dimension and then in the y-dimension. + * Bilinear interpolation is often used in image processing to rescale images. + * The NMSIS DSP library provides bilinear interpolation functions for Q7, Q15, Q31, and floating-point data types. + * + * Algorithm + * \par + * The instance structure used by the bilinear interpolation functions describes a two dimensional data table. + * For floating-point, the instance structure is defined as: + *
+   *   typedef struct
+   *   {
+   *     uint16_t numRows;
+   *     uint16_t numCols;
+   *     float32_t *pData;
+   * } riscv_bilinear_interp_instance_f32;
+   * 
+ * + * \par + * where numRows specifies the number of rows in the table; + * numCols specifies the number of columns in the table; + * and pData points to an array of size numRows*numCols values. + * The data table pTable is organized in row order and the supplied data values fall on integer indexes. + * That is, table element (x,y) is located at pTable[x + y*numCols] where x and y are integers. + * + * \par + * Let (x, y) specify the desired interpolation point. Then define: + *
+   *     XF = floor(x)
+   *     YF = floor(y)
+   * 
+ * \par + * The interpolated output point is computed as: + *
+   *  f(x, y) = f(XF, YF) * (1-(x-XF)) * (1-(y-YF))
+   *           + f(XF+1, YF) * (x-XF)*(1-(y-YF))
+   *           + f(XF, YF+1) * (1-(x-XF))*(y-YF)
+   *           + f(XF+1, YF+1) * (x-XF)*(y-YF)
+   * 
+ * Note that the coordinates (x, y) contain integer and fractional components. + * The integer components specify which portion of the table to use while the + * fractional components control the interpolation processor. + * + * \par + * if (x,y) are outside of the table boundary, Bilinear interpolation returns zero output. + */ + + + /** + * @addtogroup BilinearInterpolate + * @{ + */ + + /** + * @brief Floating-point bilinear interpolation. + * @param[in,out] S points to an instance of the interpolation structure. + * @param[in] X interpolation coordinate. + * @param[in] Y interpolation coordinate. + * @return out interpolated value. + */ + __STATIC_FORCEINLINE float32_t riscv_bilinear_interp_f32( + const riscv_bilinear_interp_instance_f32 * S, + float32_t X, + float32_t Y) + { + float32_t out; + float32_t f00, f01, f10, f11; + float32_t *pData = S->pData; + int32_t xIndex, yIndex, index; + float32_t xdiff, ydiff; + float32_t b1, b2, b3, b4; + + xIndex = (int32_t) X; + yIndex = (int32_t) Y; + + /* Care taken for table outside boundary */ + /* Returns zero output when values are outside table boundary */ + if (xIndex < 0 || xIndex > (S->numRows - 1) || yIndex < 0 || yIndex > (S->numCols - 1)) + { + return (0); + } + + /* Calculation of index for two nearest points in X-direction */ + index = (xIndex - 1) + (yIndex - 1) * S->numCols; + + + /* Read two nearest points in X-direction */ + f00 = pData[index]; + f01 = pData[index + 1]; + + /* Calculation of index for two nearest points in Y-direction */ + index = (xIndex - 1) + (yIndex) * S->numCols; + + + /* Read two nearest points in Y-direction */ + f10 = pData[index]; + f11 = pData[index + 1]; + + /* Calculation of intermediate values */ + b1 = f00; + b2 = f01 - f00; + b3 = f10 - f00; + b4 = f00 - f01 - f10 + f11; + + /* Calculation of fractional part in X */ + xdiff = X - xIndex; + + /* Calculation of fractional part in Y */ + ydiff = Y - yIndex; + + /* Calculation of bi-linear interpolated output */ + out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff; + + /* return to application */ + return (out); + } + + + /** + * @brief Q31 bilinear interpolation. + * @param[in,out] S points to an instance of the interpolation structure. + * @param[in] X interpolation coordinate in 12.20 format. + * @param[in] Y interpolation coordinate in 12.20 format. + * @return out interpolated value. + */ + __STATIC_FORCEINLINE q31_t riscv_bilinear_interp_q31( + riscv_bilinear_interp_instance_q31 * S, + q31_t X, + q31_t Y) + { + q31_t out; /* Temporary output */ + q31_t acc = 0; /* output */ + q31_t xfract, yfract; /* X, Y fractional parts */ + q31_t x1, x2, y1, y2; /* Nearest output values */ + int32_t rI, cI; /* Row and column indices */ + q31_t *pYData = S->pData; /* pointer to output table values */ + uint32_t nCols = S->numCols; /* num of rows */ + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + rI = ((X & (q31_t)0xFFF00000) >> 20); + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + cI = ((Y & (q31_t)0xFFF00000) >> 20); + + /* Care taken for table outside boundary */ + /* Returns zero output when values are outside table boundary */ + if (rI < 0 || rI > (S->numRows - 1) || cI < 0 || cI > (S->numCols - 1)) + { + return (0); + } + + /* 20 bits for the fractional part */ + /* shift left xfract by 11 to keep 1.31 format */ + xfract = (X & 0x000FFFFF) << 11U; + + /* Read two nearest output values from the index */ + x1 = pYData[(rI) + (int32_t)nCols * (cI) ]; + x2 = pYData[(rI) + (int32_t)nCols * (cI) + 1]; + + /* 20 bits for the fractional part */ + /* shift left yfract by 11 to keep 1.31 format */ + yfract = (Y & 0x000FFFFF) << 11U; + + /* Read two nearest output values from the index */ + y1 = pYData[(rI) + (int32_t)nCols * (cI + 1) ]; + y2 = pYData[(rI) + (int32_t)nCols * (cI + 1) + 1]; + + /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 3.29(q29) format */ + out = ((q31_t) (((q63_t) x1 * (0x7FFFFFFF - xfract)) >> 32)); + acc = ((q31_t) (((q63_t) out * (0x7FFFFFFF - yfract)) >> 32)); + + /* x2 * (xfract) * (1-yfract) in 3.29(q29) and adding to acc */ + out = ((q31_t) ((q63_t) x2 * (0x7FFFFFFF - yfract) >> 32)); + acc += ((q31_t) ((q63_t) out * (xfract) >> 32)); + + /* y1 * (1 - xfract) * (yfract) in 3.29(q29) and adding to acc */ + out = ((q31_t) ((q63_t) y1 * (0x7FFFFFFF - xfract) >> 32)); + acc += ((q31_t) ((q63_t) out * (yfract) >> 32)); + + /* y2 * (xfract) * (yfract) in 3.29(q29) and adding to acc */ + out = ((q31_t) ((q63_t) y2 * (xfract) >> 32)); + acc += ((q31_t) ((q63_t) out * (yfract) >> 32)); + + /* Convert acc to 1.31(q31) format */ + return ((q31_t)(acc << 2)); + } + + + /** + * @brief Q15 bilinear interpolation. + * @param[in,out] S points to an instance of the interpolation structure. + * @param[in] X interpolation coordinate in 12.20 format. + * @param[in] Y interpolation coordinate in 12.20 format. + * @return out interpolated value. + */ + __STATIC_FORCEINLINE q15_t riscv_bilinear_interp_q15( + riscv_bilinear_interp_instance_q15 * S, + q31_t X, + q31_t Y) + { + q63_t acc = 0; /* output */ + q31_t out; /* Temporary output */ + q15_t x1, x2, y1, y2; /* Nearest output values */ + q31_t xfract, yfract; /* X, Y fractional parts */ + int32_t rI, cI; /* Row and column indices */ + q15_t *pYData = S->pData; /* pointer to output table values */ + uint32_t nCols = S->numCols; /* num of rows */ + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + rI = ((X & (q31_t)0xFFF00000) >> 20); + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + cI = ((Y & (q31_t)0xFFF00000) >> 20); + + /* Care taken for table outside boundary */ + /* Returns zero output when values are outside table boundary */ + if (rI < 0 || rI > (S->numRows - 1) || cI < 0 || cI > (S->numCols - 1)) + { + return (0); + } + + /* 20 bits for the fractional part */ + /* xfract should be in 12.20 format */ + xfract = (X & 0x000FFFFF); + + /* Read two nearest output values from the index */ + x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) ]; + x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1]; + + /* 20 bits for the fractional part */ + /* yfract should be in 12.20 format */ + yfract = (Y & 0x000FFFFF); + + /* Read two nearest output values from the index */ + y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) ]; + y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1]; + + /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 13.51 format */ + + /* x1 is in 1.15(q15), xfract in 12.20 format and out is in 13.35 format */ + /* convert 13.35 to 13.31 by right shifting and out is in 1.31 */ + out = (q31_t) (((q63_t) x1 * (0xFFFFF - xfract)) >> 4U); + acc = ((q63_t) out * (0xFFFFF - yfract)); + + /* x2 * (xfract) * (1-yfract) in 1.51 and adding to acc */ + out = (q31_t) (((q63_t) x2 * (0xFFFFF - yfract)) >> 4U); + acc += ((q63_t) out * (xfract)); + + /* y1 * (1 - xfract) * (yfract) in 1.51 and adding to acc */ + out = (q31_t) (((q63_t) y1 * (0xFFFFF - xfract)) >> 4U); + acc += ((q63_t) out * (yfract)); + + /* y2 * (xfract) * (yfract) in 1.51 and adding to acc */ + out = (q31_t) (((q63_t) y2 * (xfract)) >> 4U); + acc += ((q63_t) out * (yfract)); + + /* acc is in 13.51 format and down shift acc by 36 times */ + /* Convert out to 1.15 format */ + return ((q15_t)(acc >> 36)); + } + + + /** + * @brief Q7 bilinear interpolation. + * @param[in,out] S points to an instance of the interpolation structure. + * @param[in] X interpolation coordinate in 12.20 format. + * @param[in] Y interpolation coordinate in 12.20 format. + * @return out interpolated value. + */ + __STATIC_FORCEINLINE q7_t riscv_bilinear_interp_q7( + riscv_bilinear_interp_instance_q7 * S, + q31_t X, + q31_t Y) + { + q63_t acc = 0; /* output */ + q31_t out; /* Temporary output */ + q31_t xfract, yfract; /* X, Y fractional parts */ + q7_t x1, x2, y1, y2; /* Nearest output values */ + int32_t rI, cI; /* Row and column indices */ + q7_t *pYData = S->pData; /* pointer to output table values */ + uint32_t nCols = S->numCols; /* num of rows */ + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + rI = ((X & (q31_t)0xFFF00000) >> 20); + + /* Input is in 12.20 format */ + /* 12 bits for the table index */ + /* Index value calculation */ + cI = ((Y & (q31_t)0xFFF00000) >> 20); + + /* Care taken for table outside boundary */ + /* Returns zero output when values are outside table boundary */ + if (rI < 0 || rI > (S->numRows - 1) || cI < 0 || cI > (S->numCols - 1)) + { + return (0); + } + + /* 20 bits for the fractional part */ + /* xfract should be in 12.20 format */ + xfract = (X & (q31_t)0x000FFFFF); + + /* Read two nearest output values from the index */ + x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) ]; + x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1]; + + /* 20 bits for the fractional part */ + /* yfract should be in 12.20 format */ + yfract = (Y & (q31_t)0x000FFFFF); + + /* Read two nearest output values from the index */ + y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) ]; + y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1]; + + /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 16.47 format */ + out = ((x1 * (0xFFFFF - xfract))); + acc = (((q63_t) out * (0xFFFFF - yfract))); + + /* x2 * (xfract) * (1-yfract) in 2.22 and adding to acc */ + out = ((x2 * (0xFFFFF - yfract))); + acc += (((q63_t) out * (xfract))); + + /* y1 * (1 - xfract) * (yfract) in 2.22 and adding to acc */ + out = ((y1 * (0xFFFFF - xfract))); + acc += (((q63_t) out * (yfract))); + + /* y2 * (xfract) * (yfract) in 2.22 and adding to acc */ + out = ((y2 * (yfract))); + acc += (((q63_t) out * (xfract))); + + /* acc in 16.47 format and down shift by 40 to convert to 1.7 format */ + return ((q7_t)(acc >> 40)); + } + + /** + * @} end of BilinearInterpolate group + */ + + +/* SMMLAR */ +#define multAcc_32x32_keep32_R(a, x, y) \ + a = (q31_t) (((((q63_t) a) << 32) + ((q63_t) x * y) + 0x80000000LL ) >> 32) + +/* SMMLSR */ +#define multSub_32x32_keep32_R(a, x, y) \ + a = (q31_t) (((((q63_t) a) << 32) - ((q63_t) x * y) + 0x80000000LL ) >> 32) + +/* SMMULR */ +#define mult_32x32_keep32_R(a, x, y) \ + a = (q31_t) (((q63_t) x * y + 0x80000000LL ) >> 32) + +/* SMMLA */ +#define multAcc_32x32_keep32(a, x, y) \ + a += (q31_t) (((q63_t) x * y) >> 32) + +/* SMMLS */ +#define multSub_32x32_keep32(a, x, y) \ + a -= (q31_t) (((q63_t) x * y) >> 32) + +/* SMMUL */ +#define mult_32x32_keep32(a, x, y) \ + a = (q31_t) (((q63_t) x * y ) >> 32) + + +#define LOW_OPTIMIZATION_ENTER \ + __attribute__(( optimize("-O1") )) +#define LOW_OPTIMIZATION_EXIT +#define IAR_ONLY_LOW_OPTIMIZATION_ENTER +#define IAR_ONLY_LOW_OPTIMIZATION_EXIT + + +#ifdef __cplusplus +} +#endif + + +#endif /* _RISCV_MATH_H */ + +/** + * + * End of file. + */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imac.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imac.a new file mode 100644 index 00000000..3f2e5918 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imac.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imacp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imacp.a new file mode 100644 index 00000000..ed8e1f08 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imacp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafc.a new file mode 100644 index 00000000..050b9148 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafc.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafcp.a new file mode 100644 index 00000000..8adfbd24 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafcp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdc.a new file mode 100644 index 00000000..262e9e4e Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdc.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdcp.a new file mode 100644 index 00000000..c08ad914 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdcp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imac.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imac.a new file mode 100644 index 00000000..04e3709b Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imac.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imacp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imacp.a new file mode 100644 index 00000000..7f2d0706 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imacp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafc.a new file mode 100644 index 00000000..577a9072 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafc.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafcp.a new file mode 100644 index 00000000..7a4731cf Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafcp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdc.a new file mode 100644 index 00000000..a77b5705 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdc.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdcp.a new file mode 100644 index 00000000..d7ebb92a Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdcp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imac.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imac.a new file mode 100644 index 00000000..0c53f849 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imac.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imacp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imacp.a new file mode 100644 index 00000000..2b4bb7ac Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imacp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafc.a new file mode 100644 index 00000000..f879b126 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafc.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafcp.a new file mode 100644 index 00000000..43aff14b Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafcp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdc.a new file mode 100644 index 00000000..4211799a Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdc.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdcp.a new file mode 100644 index 00000000..09793aea Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdcp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imac.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imac.a new file mode 100644 index 00000000..6868dbe2 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imac.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imacp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imacp.a new file mode 100644 index 00000000..42d38954 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imacp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafc.a new file mode 100644 index 00000000..e79348fb Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafc.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafcp.a new file mode 100644 index 00000000..08d9c723 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafcp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdc.a new file mode 100644 index 00000000..2eb57b6e Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdc.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdcp.a new file mode 100644 index 00000000..08360fb4 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdcp.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/mathlib/GCC/libmathlib_rv64imafdcpv.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/mathlib/GCC/libmathlib_rv64imafdcpv.a new file mode 100644 index 00000000..c6cea696 Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/mathlib/GCC/libmathlib_rv64imafdcpv.a differ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nn_tables.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nn_tables.h new file mode 100644 index 00000000..3b068f5f --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nn_tables.h @@ -0,0 +1,57 @@ +/* ---------------------------------------------------------------------- + * Project: NMSIS NN Library + * Title: riscv_nn_tables.h + * Description: Extern declaration for NN tables + * + * $Date: 17. January 2018 + * $Revision: V.1.0.0 + * + * Target Processor: RISC-V Cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _RISCV_NN_TABLES_H +#define _RISCV_NN_TABLES_H + +#include "riscv_math.h" + +/** +* @brief tables for various activation functions +* +*/ + +extern const q15_t sigmoidTable_q15[256]; +extern const q7_t sigmoidTable_q7[256]; + +extern const q7_t tanhTable_q7[256]; +extern const q15_t tanhTable_q15[256]; + + /** + * @brief 2-way tables for various activation functions + * + * 2-way table, H table for value larger than 1/4 + * L table for value smaller than 1/4, H table for remaining + * We have this only for the q15_t version. It does not make + * sense to have it for q7_t type + */ +extern const q15_t sigmoidHTable_q15[192]; +extern const q15_t sigmoidLTable_q15[128]; + +#endif /* RISCV_NN_TABLES_H */ diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnfunctions.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnfunctions.h new file mode 100644 index 00000000..4b68e417 --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnfunctions.h @@ -0,0 +1,1134 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: NMSIS NN Library + * Title: riscv_nnfunctions.h + * Description: Public header file for NMSIS NN Library + * + * $Date: 13. July 2018 + * $Revision: V.1.0.0 + * + * Target Processor: RISC-V Cores + * -------------------------------------------------------------------- */ + +/** + \mainpage NMSIS NN Software Library + * + * Introduction + * ------------ + * + * This user manual describes the NMSIS NN software library, + * a collection of efficient neural network kernels developed to maximize the + * performance and minimize the memory footprint of neural networks on Nuclei N processor cores. + * + * The library is divided into a number of functions each covering a specific category: + * - Neural Network Convolution Functions + * - Neural Network Activation Functions + * - Fully-connected Layer Functions + * - Neural Network Pooling Functions + * - Softmax Functions + * - Neural Network Support Functions + * + * The library has separate functions for operating on different weight and activation data + * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The descrition of the + * kernels are included in the function description. The implementation details are also + * described in this paper [1]. + * + * \note Please refer to [NMSIS-NN](../../../nn/index.html) + * + * Block Diagram + * -------- + * \image html NMSIS-NN-OVERVIEW.PNG + * + * Examples + * -------- + * + * The library ships with a number of examples which demonstrate how to use the library functions. + * + * Pre-processor Macros + * ------------ + * + * Each library project have differant pre-processor macros. + * + * - RISCV_MATH_DSP: + * + * Define macro RISCV_MATH_DSP, If the silicon supports DSP instructions. + * + * - RISCV_NN_TRUNCATE: + * + * Define macro RISCV_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation. + * + * + * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601 + */ + +/** + * @defgroup groupNN Neural Network Functions + * These functions perform basic operations for neural network layers. + */ + +#ifndef _RISCV_NNFUNCTIONS_H +#define _RISCV_NNFUNCTIONS_H + +#include "riscv_nnsupportfunctions.h" +#include "riscv_nn_tables.h" + +#define USE_INTRINSIC + +//#define RISCV_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */ + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @defgroup NNConv Neural Network Convolution Functions + * + * Perform convolution layer + * + * The convolution is implemented in 2 steps: im2col and GEMM + * + * im2col is a process of converting each patch of image data into + * a column. After im2col, the convolution is computed as matrix-matrix + * multiplication. + * + * To reduce the memory footprint, the im2col is performed partially. + * Each iteration, only a few column (i.e., patches) are generated and + * computed with GEMM kernels similar to NMSIS-DSP riscv_mat_mult functions. + * + */ + + /** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns RISCV_MATH_SUCCESS + * + */ + + riscv_status riscv_convolve_HWC_q7_basic(const q7_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Basic Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns RISCV_MATH_SUCCESS + */ + + riscv_status riscv_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t * Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns RISCV_MATH_SUCCESS + * + */ + + riscv_status riscv_convolve_HWC_q15_basic(const q15_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Fast Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + + riscv_status riscv_convolve_HWC_q7_fast(const q7_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Fast Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + + riscv_status riscv_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t * Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + * + * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1 + * and dim_kernel_y=1). It can be used for + * second half of MobileNets after depthwise separable convolution. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + riscv_status riscv_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t * Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Q7 version of convolution for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. This applies on the first layer of CNNs which has input + * image with RGB format. + */ + + riscv_status riscv_convolve_HWC_q7_RGB(const q7_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + + riscv_status riscv_convolve_HWC_q15_fast(const q15_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Fast Q15 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * Input dimension constraints: + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multipe of 2 + * + */ + + riscv_status + riscv_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q15_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q15_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t * Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + + riscv_status riscv_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t * Im_out, + const uint16_t dim_im_out, + q15_t * bufferA, + q7_t * bufferB); + + /** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * RISCV_MATH_SIZE_MISMATCH or RISCV_MATH_SUCCESS based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + riscv_status riscv_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t * wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t * bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t * Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t * bufferA, + q7_t * bufferB); + + +/** + * @defgroup FC Fully-connected Layer Functions + * + * Perform fully-connected layer + * + * Fully-connected layer is basically a matrix-vector multiplication + * with bias. The matrix is the weights and the input/output vectors + * are the activation values. Supported {weight, activation} precisions + * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}. + * + * Here we have two types of kernel functions. The basic function + * implements the function using regular GEMV approach. The opt functions + * operates with weights in interleaved formats. + * + */ + + /** + * @brief Q7 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns RISCV_MATH_SUCCESS + * + */ + + riscv_status riscv_fully_connected_q7(const q7_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q7_t * pOut, + q15_t * vec_buffer); + + /** + * @brief S8 basic fully-connected layer function for TF Lite + * @param[in] pInput pointer to pInput vector + * @param[in] pWeight pointer to matrix weights + * @param[in] col_dim dimension of the input vector + * @param[in] row_dim dimension of the output vector + * @param[in] nb_batches number of batches + * @param[in] input_offset + * @param[in] filter_offset + * @param[in] out_mult requantization parameter + * @param[in] out_shift requantization parameter + * @param[in] output_offset + * @param[in] pBias pointer to bias + * @param[out] pOut pointer to output vector + * @param[in] output_activation_min for clamping + * @param[in] output_activation_max for clamping + * @param[in,out] vec_buffer pointer to buffer space for pInput + * @return The function returns RISCV_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * vec_buffer size: col_dim of word16. + * + * This basic function is designed to work with regular pWeight + * matrix without interleaving. + * + */ + riscv_status + riscv_fully_connected_s8(const int8_t *pInput, + const int8_t *weight, + const uint16_t input_length, + const uint16_t num_rows, + const uint16_t nb_batches, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t output_offset, + const int8_t *bias, + int8_t *pOut, + const int32_t output_activation_min, + const int32_t output_activation_max, + q15_t *vec_buffer) ; + + /** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns RISCV_MATH_SUCCESS + * + */ + + riscv_status riscv_fully_connected_q7_opt(const q7_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q7_t * pOut, + q15_t * vec_buffer); + + /** + * @brief Q15 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns RISCV_MATH_SUCCESS + * + */ + + riscv_status riscv_fully_connected_q15(const q15_t * pV, + const q15_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t * bias, + q15_t * pOut, + q15_t * vec_buffer); + + /** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns RISCV_MATH_SUCCESS + * + */ + + riscv_status riscv_fully_connected_q15_opt(const q15_t * pV, + const q15_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t * bias, + q15_t * pOut, + q15_t * vec_buffer); + + /** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns RISCV_MATH_SUCCESS + * + */ + + riscv_status riscv_fully_connected_mat_q7_vec_q15(const q15_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q15_t * pOut, + q15_t * vec_buffer); + + /** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns RISCV_MATH_SUCCESS + * + */ + + riscv_status riscv_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, + const q7_t * pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q15_t * pOut, + q15_t * vec_buffer); + +/** + * @brief Matrix-Multiplication Kernels for Convolution + * + * These functions are used within convolution layer functions for + * matrix multiplication. + * + * The implementation is similar to NMSIS-DSP riscv_mat_mult functions + * with one Q7 and one Q15 operands. The Q15 operand is the im2col + * output which is always with 2 columns. + * + */ + + /** + * @brief Matrix-multiplication function for convolution + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + */ + + q7_t *riscv_nn_mat_mult_kernel_q7_q15(const q7_t * pA, + const q15_t * pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q7_t * pOut); + + q7_t *riscv_nn_mat_mult_kernel_q7(const q7_t * pA, + const q7_t * pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q7_t * pOut); + + /** + * @brief Matrix-multiplication function for convolution with reordered columns + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + */ + + q7_t *riscv_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA, + const q15_t * pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q7_t * pOut); + + q7_t *riscv_nn_mat_mult_kernel_q7_reordered(const q7_t * pA, + const q7_t * pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t * bias, + q7_t * pOut); + +#ifdef __cplusplus +} +#endif + +/* + * Other functions + * These layers are typically not timing critical + * Basic implementation is supported here + */ + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @defgroup Acti Neural Network Activation Functions + * + * Perform activation layers, including ReLU (Rectified Linear Unit), + * sigmoid and tanh + * + */ + + /** + * @brief Q7 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + + void riscv_relu_q7(q7_t * data, uint16_t size); + + /** + * @brief Q15 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + + void riscv_relu_q15(q15_t * data, uint16_t size); + + /** + * @brief Q7 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + */ + + void riscv_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width, + riscv_nn_activation_type type); + + /** + * @brief Q15 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + */ + + void riscv_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width, + riscv_nn_activation_type type); + +/** + * @defgroup Pooling Neural Network Pooling Functions + * + * Perform pooling functions, including max pooling and average pooling + * + */ + + /** + * @brief Q7 max pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + */ + + void riscv_maxpool_q7_HWC(q7_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t * bufferA, + q7_t * Im_out); + + /** + * @brief Q7 average pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + */ + + void riscv_avepool_q7_HWC(q7_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t * bufferA, + q7_t * Im_out); + +/** + * @defgroup Softmax Softmax Functions + * + * EXP(2) based softmax function + * + */ + + /** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * @return none. + * + */ + + void riscv_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out); + + /** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * @return none. + * + */ + + void riscv_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out); + + /** + * @brief uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier + * and input channels. Unless specified otherwise, arguments are mandatory. + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_ch Channels in input tensor + * @param[in] kernel Pointer to kernel weights + * @param[in] kernel_x Width of kernel + * @param[in] kernel_y Height of kernel + * @param[in] ch_mult Number of channel multiplier + * @param[in] pad_x Padding sizes x + * @param[in] pad_y Padding sizes y + * @param[in] stride_x Convolution stride along the width + * @param[in] stride_y Convolution stride along the height + * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement. + * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement. + * @param[in] bias Pointer to optional bias values. If no bias is + * availble, NULL is expected + * @param[in] input_offset Input tensor zero offset + * @param[in] filter_offset Kernel tensor zero offset + * @param[in] output_offset Output tensor zero offset + * @param[in,out] output Pointer to output tensor + * @param[in] output_x Width of output tensor + * @param[in] output_y Height of output tensor + * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255} + * @param[in] output_activation_max Minimum value to clamp the output to. Range : {0, 255} + * @param[in] out_shift Amount of right-shift for output + * @param[in] out_mult Output multiplier for requantization + * @return The function returns one of the following + * RISCV_MATH_SIZE_MISMATCH - Not supported dimension of tensors + * RISCV_MATH_SUCCESS - Successful operation + * RISCV_MATH_ARGUMENT_ERROR - Implementation not available + * + * Input constraints + * ch_mult is multiple of 2 + * kernel_x is multiple of 2 + * + */ + riscv_status riscv_depthwise_conv_u8_basic_ver1(const uint8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const uint8_t *kernel, + const uint16_t kernel_x, + const uint16_t kernel_y, + const int16_t ch_mult, + const int16_t pad_x, + const int16_t pad_y, + const int16_t stride_x, + const int16_t stride_y, + const int16_t dilation_x, + const int16_t dilation_y, + const int32_t *bias, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_offset, + uint8_t *output, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t out_shift, + const int32_t out_mult); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnsupportfunctions.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnsupportfunctions.h new file mode 100644 index 00000000..6061f08d --- /dev/null +++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnsupportfunctions.h @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * Copyright (c) 2019 Nuclei Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: NMSIS NN Library + * Title: riscv_nnsupportfunctions.h + * Description: Public header file of support functions for NMSIS NN Library + * + * $Date: July 2019 + * $Revision: V.1.0.0 + * + * Target Processor: RISC-V Cores + * -------------------------------------------------------------------- */ + +#ifndef _RISCV_NNSUPPORTFUNCTIONS_H_ +#define _RISCV_NNSUPPORTFUNCTIONS_H_ + +#include "riscv_math.h" +#include "riscv_common_tables.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0) +#define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift) +#define Q31_MIN (0x80000000L) +#define Q31_MAX (0x7FFFFFFFL) + +#define MAX(A,B) (A) > (B) ? (A) : (B) +#define MIN(A,B) (A) < (B) ? (A) : (B) + +/** + * @brief Union for SIMD access of q31/q15/q7 types + */ +union riscv_nnword +{ + q31_t word; + /**< q31 type */ + q15_t half_words[2]; + /**< q15 type */ + q7_t bytes[4]; + /**< q7 type */ +}; + +/** + * @brief Struct for specifying activation function types + * + */ +typedef enum +{ + RISCV_SIGMOID = 0, + /**< Sigmoid activation function */ + RISCV_TANH = 1, + /**< Tanh activation function */ +} riscv_nn_activation_type; + +/** + * @defgroup nndata_convert Neural Network Data Conversion Functions + * + * Perform data type conversion in-between neural network operations + * + */ + +/** + * @brief Converts the elements of the q7 vector to q15 vector without left-shift + * @param[in] *pSrc points to the q7 input vector + * @param[out] *pDst points to the q15 output vector + * @param[in] blockSize length of the input vector + * @return none. + * + */ +void riscv_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t blockSize); + +void riscv_q7_to_q7_no_shift(const q7_t * pSrc, q7_t * pDst, uint32_t blockSize); + +/** + * @brief Non-saturating addition of elements of a q7 vector + * @param[in] *input Pointer to the q7 input vector + * @param[out] *output Pointer to the q31 output variable. + * @param[in] block_size length of the input vector + * @return none. + * \par Description: + * + * 2^24 samples can be added without saturating the result. + * + * The equation used for the conversion process is: + * + *
+ *  sum = input[0] + input[1] + .. + input[block_size -1]
+ * 
+ * + * */ +void riscv_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size); + +/** + * @brief Converts the elements of the q7 vector to reordered q15 vector without left-shift + * @param[in] *pSrc points to the q7 input vector + * @param[out] *pDst points to the q15 output vector + * @param[in] blockSize length of the input vector + * @return none. + * + */ +void riscv_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t blockSize); + +void riscv_q7_to_q7_reordered_no_shift(const q7_t * pSrc, q7_t * pDst, uint32_t blockSize); + +/** + * @brief Converts the elements from a q7 vector to a q15 vector with an added offset + * @param[in] *src points to the q7 input vector + * @param[out] *dst points to the q15 output vector + * @param[in] block_size length of the input vector + * @param[in] offset q7 offset to be added to each input vector element. + * @return none. + * + * \par Description: + * + * The equation used for the conversion process is: + * + *
+ *  dst[n] = (q15_t) src[n] + offset;   0 <= n < block_size.
+ * 
+ * + */ +void riscv_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q7_t offset); + +#if defined (RISCV_MATH_DSP) + +/** + * @brief read and expand one q7 word into two q15 words + */ + +__STATIC_FORCEINLINE void *read_and_pad(void *source, q31_t * out1, q31_t * out2) +{ + q31_t inA = *__SIMD32(source)++; + q31_t inAbuf1 = __SXTB16(__ROR(inA, 8)); + q31_t inAbuf2 = __SXTB16(inA); + + *out2 = __PKHTB(inAbuf1, inAbuf2, 16); + *out1 = __PKHBT(inAbuf2, inAbuf1, 16); + + return source; +} + +/** + * @brief read and expand one q7 word into two q15 words with reordering + */ + +__STATIC_FORCEINLINE q7_t *read_and_pad_reordered(q7_t *source, q31_t * out1, q31_t * out2) +{ + q31_t inA = read_q7x4_ia(&source); + *out2 = __SXTB16(__ROR(inA, 8)); + *out1 = __SXTB16(inA); + + return source; +} + +/** + * @brief read and expand one q7 word into two q15 words with reordering and add an offset + */ +__STATIC_FORCEINLINE q7_t *read_and_pad_reordered_with_offset(q7_t *source, q31_t * out1, q31_t * out2,q31_t offset) +{ + q31_t inA = read_q7x4_ia(&source); + + *out2 = __SXTB16(__ROR(inA, 8)); + *out1 = __SXTB16(inA); + *out1 = __QADD16(*out1,offset); + *out2 = __QADD16(*out2,offset); + + return source; +} + + +#endif + + + +/** + * @defgroup NNBasicMath Basic Math Functions for Neural Network Computation + * + * Basic Math Functions for Neural Network Computation + * + */ + +/** + * @brief q7 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated. + */ + +void riscv_nn_mult_q15( + q15_t * pSrcA, + q15_t * pSrcB, + q15_t * pDst, + const uint16_t out_shift, + uint32_t blockSize); + +/** + * @brief q7 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable q7 range [0x80 0x7F] will be saturated. + */ + +void riscv_nn_mult_q7( + q7_t * pSrcA, + q7_t * pSrcB, + q7_t * pDst, + const uint16_t out_shift, + uint32_t blockSize); + +/** + * @brief macro for adding rounding offset + */ +#ifndef RISCV_NN_TRUNCATE + #define NN_ROUND(out_shift) ( (0x1 << out_shift) >> 1 ) +#else + #define NN_ROUND(out_shift) 0 +#endif + +/** + * @brief Saturating doubling high multiply. Result matches + * NEON instruction VQRDMULH. + * @param[in] m1 Multiplicand + * @param[in] m2 Multiplier + * @return Result of multiplication. + * + */ +__STATIC_FORCEINLINE q31_t riscv_nn_sat_doubling_high_mult(const q31_t m1, const q31_t m2) +{ + q31_t result = 0; + // Rounding offset to add for a right shift of 31 + q63_t mult = 1 << 30; + + if ((m1 < 0) ^ (m2 < 0)) + { + mult = 1 - mult; + } + // Gets resolved as a SMLAL instruction + mult = mult + (q63_t)m1 * m2; + + // Utilize all of the upper 32 bits. This is the doubling step + // as well. + result = mult / (1UL << 31); + + if ((m1 == m2) && (m1 == Q31_MIN)) + { + result = Q31_MAX; + } + return result; +} + +/** + * @brief Rounding divide by power of two. + * @param[in] dividend - Dividend + * @param[in] exponent - Divisor = power(2, exponent) + * Range: [0, 31] + * @return Rounded result of division. Midpoint is rounded away from zero. + * + */ +__STATIC_FORCEINLINE q31_t riscv_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent) +{ + q31_t result = 0; + const q31_t remainder_mask = (1l << exponent) - 1; + int32_t remainder = remainder_mask & dividend; + + // Basic division + result = dividend >> exponent; + + // Adjust 'result' for rounding (mid point away from zero) + q31_t threshold = remainder_mask >> 1; + if (result < 0) + { + threshold++; + } + if (remainder > threshold) + { + result++; + } + + return result; +} + +/** + * @brief Requantize a given value. + * @param[in] val Value to be requantized + * @param[in] multiplier multiplier + * @param[in] shift left or right shift for 'val * multiplier' + * + * @return Returns (val * multiplier)/(2 ^ shift) + * + */ +__STATIC_FORCEINLINE q31_t riscv_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift) +{ + return riscv_nn_divide_by_power_of_two(riscv_nn_sat_doubling_high_mult(val * (1 << LEFT_SHIFT(shift)), multiplier), + RIGHT_SHIFT(shift)); +} + +/** + @brief Read 2 q15 elements and post increment pointer. + @param[in] in_q15 Pointer to pointer that holds address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t riscv_nn_read_q15x2_ia(const q15_t **in_q15) +{ + q31_t val; + + memcpy(&val, *in_q15, 4); + *in_q15 += 2; + + return (val); +} + +/** + @brief Read 4 q7 from q7 pointer and post increment pointer. + @param[in] in_q7 Pointer to pointer that holds address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t riscv_nn_read_q7x4_ia(const q7_t **in_q7) +{ + q31_t val; + memcpy(&val, *in_q7, 4); + *in_q7 += 4; + + return (val); +} + +#ifdef __cplusplus +} +#endif + +#endif