diff --git a/kernel/arch/risc-v/nuclei/gcc/los_arch_context.h b/kernel/arch/risc-v/nuclei/gcc/los_arch_context.h
new file mode 100644
index 00000000..09e7ab9e
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/los_arch_context.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2013-2020, Huawei Technologies Co., Ltd. All rights reserved.
+ * Copyright (c) 2020-2021 Huawei Device Co., Ltd. All rights reserved.
+ * Copyright (c) 2021 Nuclei Limited. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ *    conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *    of conditions and the following disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors may be used
+ *    to endorse or promote products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LOS_ARCH_CONTEXT_H
+#define _LOS_ARCH_CONTEXT_H
+
+#include "los_compiler.h"
+#include "los_context.h"
+
+#ifdef __cplusplus
+#if __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+
+/**
+ * @ingroup los_hw
+ */
+typedef unsigned long STACK_TYPE;
+
+typedef struct {           /* saved task frame; field order mirrors the slot offsets used in los_dispatch.S */
+    STACK_TYPE epc;        /* epc - epc    - program counter                     */
+    STACK_TYPE ra;         /* x1  - ra     - return address for jumps            */
+    STACK_TYPE t0;         /* x5  - t0     - temporary register 0                */
+    STACK_TYPE t1;         /* x6  - t1     - temporary register 1                */
+    STACK_TYPE t2;         /* x7  - t2     - temporary register 2                */
+    STACK_TYPE s0_fp;      /* x8  - s0/fp  - saved register 0 or frame pointer   */
+    STACK_TYPE s1;         /* x9  - s1     - saved register 1                    */
+    STACK_TYPE a0;         /* x10 - a0     - return value or function argument 0 */
+    STACK_TYPE a1;         /* x11 - a1     - return value or function argument 1 */
+    STACK_TYPE a2;         /* x12 - a2     - function argument 2                 */
+    STACK_TYPE a3;         /* x13 - a3     - function argument 3                 */
+    STACK_TYPE a4;         /* x14 - a4     - function argument 4                 */
+    STACK_TYPE a5;         /* x15 - a5     - function argument 5                 */
+#ifndef __riscv_32e
+    STACK_TYPE a6;         /* x16 - a6     - function argument 6                 */
+    STACK_TYPE a7;         /* x17 - a7     - function argument 7                 */
+    STACK_TYPE s2;         /* x18 - s2     - saved register 2                    */
+    STACK_TYPE s3;         /* x19 - s3     - saved register 3                    */
+    STACK_TYPE s4;         /* x20 - s4     - saved register 4                    */
+    STACK_TYPE s5;         /* x21 - s5     - saved register 5                    */
+    STACK_TYPE s6;         /* x22 - s6     - saved register 6                    */
+    STACK_TYPE s7;         /* x23 - s7     - saved register 7                    */
+    STACK_TYPE s8;         /* x24 - s8     - saved register 8                    */
+    STACK_TYPE s9;         /* x25 - s9     - saved register 9                    */
+    STACK_TYPE s10;        /* x26 - s10    - saved register 10                   */
+    STACK_TYPE s11;        /* x27 - s11    - saved register 11                   */
+    STACK_TYPE t3;         /* x28 - t3     - temporary register 3                */
+    STACK_TYPE t4;         /* x29 - t4     - temporary register 4                */
+    STACK_TYPE t5;         /* x30 - t5     - temporary register 5                */
+    STACK_TYPE t6;         /* x31 - t6     - temporary register 6                */
+#endif
+    STACK_TYPE mstatus;    /*              - machine status register             */
+} TaskContext;
+
+extern VOID HalStartToRun(VOID);
+
+#ifdef __cplusplus
+#if __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+
+#endif /* _LOS_ARCH_CONTEXT_H */
diff --git a/kernel/arch/risc-v/nuclei/gcc/los_arch_interrupt.h b/kernel/arch/risc-v/nuclei/gcc/los_arch_interrupt.h
new file mode 100644
index 00000000..63a27c8d
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/los_arch_interrupt.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2013-2020, Huawei Technologies Co., Ltd. All rights reserved.
+ * Copyright (c) 2020-2021 Huawei Device Co., Ltd. All rights reserved.
+ * Copyright (c) 2021 Nuclei Limited. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ *    conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *    of conditions and the following disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors may be used
+ *    to endorse or promote products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LOS_HWI_H
+#define _LOS_HWI_H
+
+#include "nuclei_sdk_soc.h"
+#include "los_compiler.h"
+#include "los_config.h"
+#include "los_interrupt.h"
+#include "los_arch_context.h"
+
+#ifdef __cplusplus
+#if __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+/**
+ * @ingroup los_hwi
+ * Count of Nuclei system interrupt vector.
+ */
+#define OS_RISCV_SYS_VECTOR_CNT   19
+
+/**
+ * @ingroup los_hwi
+ * Maximum count of Nuclei interrupt vectors, which is configurable.
+ */
+#define OS_RISCV_CUSTOM_IRQ_VECTOR_CNT  SOC_INT_MAX
+
+/**
+ * @ingroup los_hwi
+ * Count of Nuclei interrupt vector.
+ */
+#define OS_RISCV_VECTOR_CNT                  (OS_RISCV_SYS_VECTOR_CNT + OS_RISCV_CUSTOM_IRQ_VECTOR_CNT)
+
+/**
+ * Maximum number of supported hardware devices that generate hardware interrupts.
+ */
+#define OS_HWI_MAX_NUM        (OS_RISCV_VECTOR_CNT-1)
+
+extern VOID HalHwiDefaultHandler(VOID);
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: Invalid interrupt number.
+ *
+ * Value: 0x02000900
+ *
+ * Solution: Ensure that the interrupt number is valid. The value range of the interrupt number applicable
+ * for a risc-v platform is [0, OS_RISCV_VECTOR_CNT].
+ */
+#define OS_ERRNO_HWI_NUM_INVALID                 LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x00)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: Null hardware interrupt handling function.
+ *
+ * Value: 0x02000901
+ *
+ * Solution: Pass in a valid non-null hardware interrupt handling function.
+ */
+// #define OS_ERRNO_HWI_PROC_FUNC_NULL              LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x01)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: Insufficient interrupt resources for hardware interrupt creation.
+ *
+ * Value: 0x02000902
+ *
+ * Solution: Increase the configured maximum number of supported hardware interrupts.
+ */
+// #define OS_ERRNO_HWI_CB_UNAVAILABLE              LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x02)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: Insufficient memory for hardware interrupt initialization.
+ *
+ * Value: 0x02000903
+ *
+ * Solution: Expand the configured memory.
+ */
+// #define OS_ERRNO_HWI_NO_MEMORY                   LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x03)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: The interrupt has already been created.
+ *
+ * Value: 0x02000904
+ *
+ * Solution: Check whether the interrupt specified by the passed-in interrupt number has already been created.
+ */
+// #define OS_ERRNO_HWI_ALREADY_CREATED             LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x04)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: Invalid interrupt priority.
+ *
+ * Value: 0x02000905
+ *
+ * Solution: Ensure that the interrupt priority is valid.
+ */
+// #define OS_ERRNO_HWI_PRIO_INVALID                LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x05)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: Incorrect interrupt creation mode.
+ *
+ * Value: 0x02000906
+ *
+ * Solution: The interrupt creation mode can be only set to ECLIC_NON_VECTOR_INTERRUPT or ECLIC_VECTOR_INTERRUPT of which the
+ * value can be 0 or 1.
+ */
+#define OS_ERRNO_HWI_MODE_INVALID                LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x06)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: The interrupt has already been created as a fast interrupt.
+ *
+ * Value: 0x02000907
+ *
+ * Solution: Check whether the interrupt specified by the passed-in interrupt number has already been created.
+ */
+// #define OS_ERRNO_HWI_FASTMODE_ALREADY_CREATED    LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x07)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code: The API is called during an interrupt, which is forbidden.
+ *
+ * Value: 0x02000908
+ *
+ * Solution: Do not call the API during an interrupt.
+ */
+// #define OS_ERRNO_HWI_INTERR LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x08)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code:the hwi support SHARED error.
+ *
+ * Value: 0x02000909
+ *
+ * * Solution:check the input params hwiMode and irqParam of HalHwiCreate or HalHwiDelete whether adapt the current
+ * hwi.
+ */
+// #define OS_ERRNO_HWI_SHARED_ERROR LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x09)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code:Invalid interrupt Arg.
+ *
+ * Value: 0x0200090a
+ *
+ * Solution: Check the interrupt Arg; Arg should only be ECLIC_LEVEL_TRIGGER, ECLIC_POSTIVE_EDGE_TRIGGER or
+ *  ECLIC_NEGTIVE_EDGE_TRIGGER.
+ */
+#define OS_ERRNO_HWI_ARG_INVALID LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x0a)
+
+/**
+ * @ingroup los_hwi
+ * Hardware interrupt error code:The interrupt corresponded to the hwi number or devid  has not been created.
+ *
+ * Value: 0x0200090b
+ *
+ * * Solution:check the hwi number or devid, make sure the hwi number or devid need to delete.
+ */
+// #define OS_ERRNO_HWI_HWINUM_UNCREATE LOS_ERRNO_OS_ERROR(LOS_MOD_HWI, 0x0b)
+
+extern UINT32 HalUnalignedAccessFix(UINTPTR mcause, UINTPTR mepc, UINTPTR mtval, VOID *sp);
+
+extern VOID DisplayTaskInfo(VOID);
+
+extern UINT32 g_intCount;
+
+__attribute__((always_inline)) static inline VOID HalIntEnter(VOID)
+{
+    g_intCount++;    /* one more entry into interrupt handling is recorded */
+}
+
+__attribute__((always_inline)) static inline VOID HalIntExit(VOID)
+{
+    g_intCount--;    /* one entry into interrupt handling has finished */
+}
+
+__attribute__((always_inline)) static inline UINT32 HalIsIntAcvive(VOID)
+{
+    return (g_intCount != 0) ? 1 : 0;    /* 1 while g_intCount is non-zero, otherwise 0 */
+}
+
+#ifdef __cplusplus
+#if __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+
+#endif /* _LOS_HWI_H */
diff --git a/kernel/arch/risc-v/nuclei/gcc/los_arch_timer.h b/kernel/arch/risc-v/nuclei/gcc/los_arch_timer.h
new file mode 100644
index 00000000..384bdd5b
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/los_arch_timer.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2013-2019 Huawei Technologies Co., Ltd. All rights reserved.
+ * Copyright (c) 2020-2021 Huawei Device Co., Ltd. All rights reserved.
+ * Copyright (c) 2021 Nuclei Limited. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ *    conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *    of conditions and the following disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors may be used
+ *    to endorse or promote products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LOS_ARCH_TIMER_H
+#define _LOS_ARCH_TIMER_H
+
+#include "los_config.h"
+#include "los_compiler.h"
+#include "los_context.h"
+
+#ifdef __cplusplus
+#if __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+
+UINT32 HalTickStart(OS_TICK_HANDLER handler);
+
+#ifdef __cplusplus
+#if __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+
+#endif /* _LOS_ARCH_TIMER_H */
+
diff --git a/kernel/arch/risc-v/nuclei/gcc/los_context.c b/kernel/arch/risc-v/nuclei/gcc/los_context.c
new file mode 100644
index 00000000..3f309ea4
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/los_context.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "los_arch_context.h"
+#include "los_arch_interrupt.h"
+#include "los_arch_timer.h"
+#include "los_task.h"
+#include "los_memory.h"
+#include "los_timer.h"
+#include "nuclei_sdk_soc.h"
+
+#define INITIAL_MSTATUS                 ( MSTATUS_MPP | MSTATUS_MPIE | MSTATUS_FS_INITIAL) /* mstatus of a new task frame */
+
+#define ALIGN_DOWN(size, align)         ((size) & ~((align) - 1)) /* round down; align must be a power of two */
+
+#ifdef __cplusplus
+#if __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+
+LITE_OS_SEC_TEXT_INIT VOID HalArchInit(VOID)
+{
+    HalHwiInit();    /* arch-level init consists solely of this HalHwiInit() call */
+}
+
+LITE_OS_SEC_TEXT_MINOR VOID HalSysExit(VOID)
+{
+    HalIntLock();    /* result is not needed: execution never leaves the loop below */
+    for (;;) {
+    }
+}
+
+LITE_OS_SEC_TEXT_INIT VOID *HalTskStackInit(UINT32 taskID, UINT32 stackSize, VOID *topStack)
+{
+    UINT32 slot;
+    UINT8 *stackEnd;
+    TaskContext *frame = NULL;
+
+    /* write the magic word at topStack */
+    *((UINT32 *)(topStack)) = OS_TASK_MAGIC_WORD;
+
+    /* NOTE(review): the start point is one STACK_TYPE above topStack + stackSize - confirm the frame stays in bounds */
+    stackEnd = ((UINT8 *)topStack) + stackSize + sizeof(STACK_TYPE);
+    stackEnd = (UINT8 *)ALIGN_DOWN((unsigned long)stackEnd, REGBYTES);
+    frame = (TaskContext *)(stackEnd - sizeof(TaskContext));
+
+    /* slot 0 (epc) is skipped here because it is assigned explicitly below */
+    for (slot = 1; slot < sizeof(TaskContext) / sizeof(STACK_TYPE); slot++) {
+        ((STACK_TYPE *)frame)[slot] = OS_TASK_STACK_INIT;
+    }
+    frame->epc     = (STACK_TYPE)OsTaskEntry;
+    frame->ra      = (STACK_TYPE)HalSysExit;
+    frame->a0      = (STACK_TYPE)taskID;
+    frame->mstatus = INITIAL_MSTATUS;
+
+    return (VOID *)frame;
+}
+
+extern BOOL g_taskScheduled;
+extern LosTask g_losTask;
+LITE_OS_SEC_TEXT_INIT UINT32 HalStartSchedule(OS_TICK_HANDLER handler)
+{
+    UINT32 tickRet;
+    __disable_irq();
+    tickRet = HalTickStart(handler);
+    if (tickRet == LOS_OK) {
+        g_taskScheduled = TRUE;
+        /* the pending new task becomes the running task */
+        g_losTask.runTask = g_losTask.newTask;
+        g_losTask.runTask->taskStatus |= OS_TASK_STATUS_RUNNING;
+        HalStartToRun();    /* never return */
+    }
+    /* the HalTickStart error code, or LOS_OK should HalStartToRun ever come back */
+    return tickRet;
+}
+
+VOID HalTaskSchedule(VOID)
+{
+    SysTimer_SetSWIRQ();    /* raise the request that HalTaskSwitch() later clears with SysTimer_ClearSWIRQ() */
+}
+
+VOID HalTaskSwitch(VOID)
+{
+    SysTimer_ClearSWIRQ();    /* counterpart of the SysTimer_SetSWIRQ() call in HalTaskSchedule() */
+    g_losTask.runTask->taskStatus &= ~OS_TASK_STATUS_RUNNING;    /* outgoing task loses the RUNNING flag */
+    /* Make newTask the running task */
+    g_losTask.runTask = g_losTask.newTask;
+    g_losTask.runTask->taskStatus |= OS_TASK_STATUS_RUNNING;
+}
+
+LITE_OS_SEC_TEXT VOID HalTaskScheduleCheck(VOID)
+{
+    /* the switch check is compiled in only when LOSCFG_BASE_CORE_TSK_MONITOR is 1 */
+#if (LOSCFG_BASE_CORE_TSK_MONITOR == 1)
+    OsTaskSwitchCheck();
+#endif
+}
+
+VOID HalEnterSleep(LOS_SysSleepEnum sleep)
+{
+    __WFI();    /* the sleep argument is not consulted; every mode maps to __WFI() */
+}
+
+#ifdef __cplusplus
+#if __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* __cplusplus */
diff --git a/kernel/arch/risc-v/nuclei/gcc/los_dispatch.S b/kernel/arch/risc-v/nuclei/gcc/los_dispatch.S
new file mode 100644
index 00000000..99123fa2
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/los_dispatch.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2021 Nuclei Limited. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ *    conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *    of conditions and the following disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors may be used
+ *    to endorse or promote products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "riscv_encoding.h"
+
+#ifndef __riscv_32e
+#define portRegNum          30
+#else
+#define portRegNum          14
+#endif
+
+#define portCONTEXT_SIZE    ( portRegNum * REGBYTES )
+
+    .section .text
+    .align 4
+
+    .type HalIntLock, %function
+    .global HalIntLock
+HalIntLock:
+    csrr    a0, mstatus           // return value: mstatus as it was before locking
+    li      t0, MSTATUS_MIE   // mask of the MIE bit
+    csrrc   zero, mstatus, t0     // clear MIE; the value read back is discarded (rd = zero)
+    ret
+
+    .type HalIntUnLock, %function
+    .global HalIntUnLock
+HalIntUnLock:
+    csrr    a0, mstatus           // return value: mstatus as it was before unlocking
+    li      t0, MSTATUS_MIE   // mask of the MIE bit
+    csrrs   zero, mstatus, t0     // set MIE; the value read back is discarded (rd = zero)
+    ret
+
+    .type HalIntRestore, %function
+    .global HalIntRestore
+HalIntRestore:
+    csrw mstatus, a0    // write the caller-supplied value in a0 into mstatus
+    ret
+
+
+/* Start the first task: store _sp in CSR_MSCRATCH as the interrupt stack,
+    then load the saved frame of the TCB that the first word of g_losTask
+    points to (mepc, mstatus and the general registers) and enter the task
+    with mret. The C caller documents this routine as never returning. */
+    .type HalStartToRun, %function
+    .global HalStartToRun
+    .align 3
+HalStartToRun:
+    /* Set up the interrupt stack.
+       The stack that was used by main()
+       before the scheduler is started is
+       no longer required after the scheduler is started.
+       Interrupt stack pointer is stored in CSR_MSCRATCH */
+    la t0, _sp
+    csrw CSR_MSCRATCH, t0
+    /* get stack pointer: g_losTask -> first word (TCB pointer) -> first TCB member */
+    la t0, g_losTask
+    LOAD t1, 0x0(t0)
+    LOAD      sp, 0(t1)
+    /* sp now holds the value read from the first TCB member */
+
+    /* Pop PC from stack and set MEPC */
+    LOAD t0,  0  * REGBYTES(sp)
+    csrw CSR_MEPC, t0
+    /* Pop mstatus from stack and set it */
+    LOAD t0,  (portRegNum - 1)  * REGBYTES(sp)
+    csrw CSR_MSTATUS, t0
+    /* Interrupts are still disabled here */
+    /* Restore Registers from Stack */
+    LOAD x1,  1  * REGBYTES(sp)    /* RA */
+    LOAD x5,  2  * REGBYTES(sp)
+    LOAD x6,  3  * REGBYTES(sp)
+    LOAD x7,  4  * REGBYTES(sp)
+    LOAD x8,  5  * REGBYTES(sp)
+    LOAD x9,  6  * REGBYTES(sp)
+    LOAD x10, 7  * REGBYTES(sp)
+    LOAD x11, 8  * REGBYTES(sp)
+    LOAD x12, 9  * REGBYTES(sp)
+    LOAD x13, 10 * REGBYTES(sp)
+    LOAD x14, 11 * REGBYTES(sp)
+    LOAD x15, 12 * REGBYTES(sp)
+#ifndef __riscv_32e
+    LOAD x16, 13 * REGBYTES(sp)
+    LOAD x17, 14 * REGBYTES(sp)
+    LOAD x18, 15 * REGBYTES(sp)
+    LOAD x19, 16 * REGBYTES(sp)
+    LOAD x20, 17 * REGBYTES(sp)
+    LOAD x21, 18 * REGBYTES(sp)
+    LOAD x22, 19 * REGBYTES(sp)
+    LOAD x23, 20 * REGBYTES(sp)
+    LOAD x24, 21 * REGBYTES(sp)
+    LOAD x25, 22 * REGBYTES(sp)
+    LOAD x26, 23 * REGBYTES(sp)
+    LOAD x27, 24 * REGBYTES(sp)
+    LOAD x28, 25 * REGBYTES(sp)
+    LOAD x29, 26 * REGBYTES(sp)
+    LOAD x30, 27 * REGBYTES(sp)
+    LOAD x31, 28 * REGBYTES(sp)
+#endif
+
+    addi sp, sp, portCONTEXT_SIZE
+
+    mret
+
+.extern HalTaskSwitch
+.align 2
+.global eclic_msip_handler
+eclic_msip_handler:    /* saves the running task's frame, calls HalTaskSwitch, restores the new task's frame */
+    addi sp, sp, -portCONTEXT_SIZE
+    STORE x1,  1  * REGBYTES(sp)    /* RA */
+    STORE x5,  2  * REGBYTES(sp)
+    STORE x6,  3  * REGBYTES(sp)
+    STORE x7,  4  * REGBYTES(sp)
+    STORE x8,  5  * REGBYTES(sp)
+    STORE x9,  6  * REGBYTES(sp)
+    STORE x10, 7  * REGBYTES(sp)
+    STORE x11, 8  * REGBYTES(sp)
+    STORE x12, 9  * REGBYTES(sp)
+    STORE x13, 10 * REGBYTES(sp)
+    STORE x14, 11 * REGBYTES(sp)
+    STORE x15, 12 * REGBYTES(sp)
+#ifndef __riscv_32e
+    STORE x16, 13 * REGBYTES(sp)
+    STORE x17, 14 * REGBYTES(sp)
+    STORE x18, 15 * REGBYTES(sp)
+    STORE x19, 16 * REGBYTES(sp)
+    STORE x20, 17 * REGBYTES(sp)
+    STORE x21, 18 * REGBYTES(sp)
+    STORE x22, 19 * REGBYTES(sp)
+    STORE x23, 20 * REGBYTES(sp)
+    STORE x24, 21 * REGBYTES(sp)
+    STORE x25, 22 * REGBYTES(sp)
+    STORE x26, 23 * REGBYTES(sp)
+    STORE x27, 24 * REGBYTES(sp)
+    STORE x28, 25 * REGBYTES(sp)
+    STORE x29, 26 * REGBYTES(sp)
+    STORE x30, 27 * REGBYTES(sp)
+    STORE x31, 28 * REGBYTES(sp)
+#endif
+    /* Push mstatus to the last slot of the frame */
+    csrr t0, CSR_MSTATUS
+    STORE t0,  (portRegNum - 1)  * REGBYTES(sp)
+
+    /* Push additional registers (none are pushed here) */
+
+    /* Store sp into the first member of the TCB that the first word of g_losTask points to */
+    la t0, g_losTask
+    LOAD t0, 0(t0)
+    STORE sp, 0(t0)
+
+    csrr t0, CSR_MEPC
+    STORE t0, 0(sp)    /* mepc goes into slot 0 of the frame */
+
+    /* Switch task context */
+    jal HalTaskSwitch
+    /* Load new task: re-read the first word of g_losTask after HalTaskSwitch */
+    la t0, g_losTask
+    LOAD t0, 0(t0)
+    LOAD sp, 0x0(t0)                /* Read sp from first TCB member */
+
+    /* Pop PC from stack and set MEPC */
+    LOAD t0,  0  * REGBYTES(sp)
+    csrw CSR_MEPC, t0
+    /* Pop additional registers (none are popped here) */
+
+    /* Pop mstatus from stack and set it */
+    LOAD t0,  (portRegNum - 1)  * REGBYTES(sp)
+    csrw CSR_MSTATUS, t0
+    /* Interrupts are still disabled here */
+    /* Restore Registers from Stack */
+    LOAD x1,  1  * REGBYTES(sp)    /* RA */
+    LOAD x5,  2  * REGBYTES(sp)
+    LOAD x6,  3  * REGBYTES(sp)
+    LOAD x7,  4  * REGBYTES(sp)
+    LOAD x8,  5  * REGBYTES(sp)
+    LOAD x9,  6  * REGBYTES(sp)
+    LOAD x10, 7  * REGBYTES(sp)
+    LOAD x11, 8  * REGBYTES(sp)
+    LOAD x12, 9  * REGBYTES(sp)
+    LOAD x13, 10 * REGBYTES(sp)
+    LOAD x14, 11 * REGBYTES(sp)
+    LOAD x15, 12 * REGBYTES(sp)
+#ifndef __riscv_32e
+    LOAD x16, 13 * REGBYTES(sp)
+    LOAD x17, 14 * REGBYTES(sp)
+    LOAD x18, 15 * REGBYTES(sp)
+    LOAD x19, 16 * REGBYTES(sp)
+    LOAD x20, 17 * REGBYTES(sp)
+    LOAD x21, 18 * REGBYTES(sp)
+    LOAD x22, 19 * REGBYTES(sp)
+    LOAD x23, 20 * REGBYTES(sp)
+    LOAD x24, 21 * REGBYTES(sp)
+    LOAD x25, 22 * REGBYTES(sp)
+    LOAD x26, 23 * REGBYTES(sp)
+    LOAD x27, 24 * REGBYTES(sp)
+    LOAD x28, 25 * REGBYTES(sp)
+    LOAD x29, 26 * REGBYTES(sp)
+    LOAD x30, 27 * REGBYTES(sp)
+    LOAD x31, 28 * REGBYTES(sp)
+#endif
+
+    addi sp, sp, portCONTEXT_SIZE
+    mret
diff --git a/kernel/arch/risc-v/nuclei/gcc/los_exc.S b/kernel/arch/risc-v/nuclei/gcc/los_exc.S
new file mode 100644
index 00000000..fea0f9ad
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/los_exc.S
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2021 Nuclei Limited. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ *    conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *    of conditions and the following disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors may be used
+ *    to endorse or promote products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LOS_EXC_S
+#define _LOS_EXC_S
+
+#include "riscv_encoding.h"
+
+.section    .text.entry
+.align 8
+
+/**
+ * \brief  Disable global interrupts
+ * \details
+ *  This macro clears the MSTATUS_MIE bit in CSR_MSTATUS.
+ * \remarks
+ *  - All the interrupt requests will be ignored by CPU.
+ */
+.macro DISABLE_MIE
+    csrc CSR_MSTATUS, MSTATUS_MIE
+.endm
+
+/**
+ * \brief  Macro for context save
+ * \details
+ * This macro saves the ABI-defined caller-saved registers (plus x4) on the stack.
+ * \remarks
+ * - This macro can be used to save the context on entry to an interrupt
+ * or exception
+ */
+/* Save caller registers; slots 11-13 stay free for the CSRs pushed by SAVE_CSR_CONTEXT */
+.macro SAVE_CONTEXT
+    csrrw sp, CSR_MSCRATCHCSWL, sp
+    /* Allocate stack space for context saving */
+#ifndef __riscv_32e
+    addi sp, sp, -20*REGBYTES
+#else
+    addi sp, sp, -14*REGBYTES
+#endif /* __riscv_32e */
+
+    STORE x1, 0*REGBYTES(sp)
+    STORE x4, 1*REGBYTES(sp)
+    STORE x5, 2*REGBYTES(sp)
+    STORE x6, 3*REGBYTES(sp)
+    STORE x7, 4*REGBYTES(sp)
+    STORE x10, 5*REGBYTES(sp)
+    STORE x11, 6*REGBYTES(sp)
+    STORE x12, 7*REGBYTES(sp)
+    STORE x13, 8*REGBYTES(sp)
+    STORE x14, 9*REGBYTES(sp)
+    STORE x15, 10*REGBYTES(sp)
+#ifndef __riscv_32e
+    STORE x16, 14*REGBYTES(sp)
+    STORE x17, 15*REGBYTES(sp)
+    STORE x28, 16*REGBYTES(sp)
+    STORE x29, 17*REGBYTES(sp)
+    STORE x30, 18*REGBYTES(sp)
+    STORE x31, 19*REGBYTES(sp)
+#endif /* __riscv_32e */
+.endm
+
+/**
+ * \brief  Macro for restoring caller registers
+ * \details
+ * This macro restores the registers stored by SAVE_CONTEXT from the stack.
+ * \remarks
+ * - You can use this macro to restore the context before returning
+ * from an interrupt or exception
+ */
+/* Restore caller registers, release the frame and swap sp through CSR_MSCRATCHCSWL again */
+.macro RESTORE_CONTEXT
+    LOAD x1, 0*REGBYTES(sp)
+    LOAD x4, 1*REGBYTES(sp)
+    LOAD x5, 2*REGBYTES(sp)
+    LOAD x6, 3*REGBYTES(sp)
+    LOAD x7, 4*REGBYTES(sp)
+    LOAD x10, 5*REGBYTES(sp)
+    LOAD x11, 6*REGBYTES(sp)
+    LOAD x12, 7*REGBYTES(sp)
+    LOAD x13, 8*REGBYTES(sp)
+    LOAD x14, 9*REGBYTES(sp)
+    LOAD x15, 10*REGBYTES(sp)
+#ifndef __riscv_32e
+    LOAD x16, 14*REGBYTES(sp)
+    LOAD x17, 15*REGBYTES(sp)
+    LOAD x28, 16*REGBYTES(sp)
+    LOAD x29, 17*REGBYTES(sp)
+    LOAD x30, 18*REGBYTES(sp)
+    LOAD x31, 19*REGBYTES(sp)
+
+    /* De-allocate the stack space */
+    addi sp, sp, 20*REGBYTES
+#else
+    /* De-allocate the stack space */
+    addi sp, sp, 14*REGBYTES
+#endif /* __riscv_32e */
+    csrrw sp, CSR_MSCRATCHCSWL, sp
+.endm
+
+/**
+ * \brief  Macro for saving necessary CSRs to the stack
+ * \details
+ * This macro stores MCAUSE, MEPC and MSUBM in stack slots 11, 12 and 13.
+ */
+.macro SAVE_CSR_CONTEXT
+    /* Store CSR mcause to stack using pushmcause */
+    csrrwi  x0, CSR_PUSHMCAUSE, 11
+    /* Store CSR mepc to stack using pushmepc */
+    csrrwi  x0, CSR_PUSHMEPC, 12
+    /* Store CSR msubm to stack using pushmsubm */
+    csrrwi  x0, CSR_PUSHMSUBM, 13
+.endm
+
+/**
+ * \brief  Macro for restoring necessary CSRs from the stack
+ * \details
+ * This macro restores MSUBM, MEPC and MCAUSE from stack slots 13, 12 and 11, using x5 as scratch.
+ */
+.macro RESTORE_CSR_CONTEXT
+    LOAD x5,  13*REGBYTES(sp)
+    csrw CSR_MSUBM, x5
+    LOAD x5,  12*REGBYTES(sp)
+    csrw CSR_MEPC, x5
+    LOAD x5,  11*REGBYTES(sp)
+    csrw CSR_MCAUSE, x5
+.endm
+
+/**
+ * \brief  Exception/NMI Entry
+ * \details
+ * This function provides the common entry for exceptions and NMIs.
+ * \remarks
+ * This function provides a default exception/NMI entry.
+ * The ABI-defined caller-saved registers and some CSR registers are
+ * saved before calling core_exception_handler and restored before returning.
+ */
+.section .text.trap
+/* In CLIC mode, the exception entry must be 64-byte aligned */
+.align 6
+.global exc_entry
+exc_entry:
+    /* Save the caller saving registers (context) */
+    SAVE_CONTEXT
+    /* Save the necessary CSR registers */
+    SAVE_CSR_CONTEXT
+
+    /*
+     * Set the exception handler function arguments
+     * argument 1: mcause value
+     * argument 2: current stack pointer (SP) value
+     */
+    csrr a0, mcause
+    mv a1, sp
+    /*
+     * TODO: Call the exception handler function
+     * By default, the function template is provided in
+     * system_Device.c, you can adjust it as you want
+     */
+    call core_exception_handler
+
+    /* Restore the necessary CSR registers */
+    RESTORE_CSR_CONTEXT
+    /* Restore the caller saving registers (context) */
+    RESTORE_CONTEXT
+
+    /* Return to regular code */
+    mret
+
+/**
+ * \brief  Non-Vector Interrupt Entry
+ * \details
+ * This function provides the common entry for handling
+ * non-vector interrupts
+ * \remarks
+ * This function provides a default non-vector interrupt entry.
+ * The ABI-defined caller-saved registers and some CSR registers need
+ * to be saved before entering the interrupt handler and restored before returning.
+ */
+.section      .text.irq
+/* In CLIC mode, the interrupt entry must be 4-byte aligned */
+.align 2
+.extern g_intCount
+.global irq_entry
+/* This label will be set to MTVT2 register */
+irq_entry:
+    /* Save the caller saving registers (context) */
+    SAVE_CONTEXT
+    /* Save the necessary CSR registers */
+    SAVE_CSR_CONTEXT
+
+    /* Increment g_intCount, then perform the special CSR read/write
+     * on CSR_JALMNXTI, which actually claims the CLIC to find its
+     * highest pending ID; if the ID is not 0, it automatically enables
+     * mstatus.MIE, jumps to its vector-entry-label and updates the link register
+     */
+    la t0, g_intCount
+    lw t1, 0(t0)
+    add t1, t1, 0x1
+    sw t1, 0(t0)
+
+    csrrw ra, CSR_JALMNXTI, ra
+
+    /* Critical section with interrupts disabled: decrement g_intCount and restore the context */
+    DISABLE_MIE
+
+    la t0, g_intCount
+    lw t1, 0(t0)
+    li t2, 0x1
+    sub t1, t1, t2
+    sw t1, 0(t0)
+
+    /* Restore the necessary CSR registers */
+    RESTORE_CSR_CONTEXT
+    /* Restore the caller saving registers (context) */
+    RESTORE_CONTEXT
+
+    /* Return to regular code */
+    mret
+
+/* Default handler for exceptions / interrupts: both labels fall into an endless self-jump */
+.global default_intexc_handler
+Undef_Handler:
+default_intexc_handler:
+1:
+    j 1b
+
+#endif /* _LOS_EXC_S */
+
diff --git a/kernel/arch/risc-v/nuclei/gcc/los_interrupt.c b/kernel/arch/risc-v/nuclei/gcc/los_interrupt.c
new file mode 100644
index 00000000..2e2f808b
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/los_interrupt.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2021 Nuclei Limited. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ *    conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *    of conditions and the following disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors may be used
+ *    to endorse or promote products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include "los_arch.h"
+#include "los_arch_interrupt.h"
+#include "los_arch_context.h"
+#include "los_task.h"
+#include "los_debug.h"
+#include "nuclei_sdk_hal.h"
+
+#ifdef __cplusplus
+#if __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+
+/*
+ * Interrupt counter: incremented by the irq_entry assembly routine before an
+ * interrupt handler is dispatched and decremented after it returns.
+ * HalTickStart resets it to 0.
+ * NOTE(review): presumably a non-zero value means "in interrupt context" - confirm against readers.
+ */
+UINT32 g_intCount = 0;
+
+// LosExcInfo g_excInfo;
+/*
+ * Hardware interrupt initialization hook.
+ * Intentionally empty in this port: no work is done here.
+ */
+LITE_OS_SEC_TEXT_INIT VOID HalHwiInit(VOID)
+{
+    // The interrupt vectors have already been set up, so nothing is left to do here.
+}
+
+/*****************************************************************************
+ Function    : HalHwiCreate
+ Description : Configure a hardware interrupt in the ECLIC and enable it
+ Input       : hwiNum       --- number of the interrupt to create
+               hwiPrio      --- priority of the interrupt
+               mode         --- interrupt mode, vector or non-vector
+               handler      --- interrupt handler; when NULL the vector table
+                                entry is left unchanged
+               arg          --- trigger mode of the interrupt
+                                Level Triggered = 0
+                                Positive/Rising Edge Triggered = 1
+                                Negative/Falling Edge Triggered = 3
+ Output      : None
+ Return      : LOS_OK on success or error code on failure
+ *****************************************************************************/
+UINT32 HalHwiCreate(HWI_HANDLE_T hwiNum, HWI_PRIOR_T hwiPrio, HWI_MODE_T mode,
+                    HWI_PROC_FUNC handler, HWI_ARG_T arg)
+{
+    /* Validate every argument before touching any ECLIC register. */
+    /* NOTE(review): hwiNum == SOC_INT_MAX is accepted; confirm it is a valid interrupt number. */
+    if (hwiNum > SOC_INT_MAX) {
+        return OS_ERRNO_HWI_NUM_INVALID;
+    }
+    if (mode > ECLIC_VECTOR_INTERRUPT) {
+        return OS_ERRNO_HWI_MODE_INVALID;
+    }
+    if (arg > ECLIC_NEGTIVE_EDGE_TRIGGER) {
+        return OS_ERRNO_HWI_ARG_INVALID;
+    }
+
+    /* Vector / non-vector processing mode. */
+    ECLIC_SetShvIRQ(hwiNum, mode);
+    /* Trigger mode and polarity. */
+    ECLIC_SetTrigIRQ(hwiNum, arg);
+    /* The interrupt level always defaults to 0. */
+    ECLIC_SetLevelIRQ(hwiNum, 0);
+    /* Interrupt priority. */
+    ECLIC_SetPriorityIRQ(hwiNum, hwiPrio);
+
+    /* Install the handler in the vector table only when one was supplied. */
+    if (handler != NULL) {
+        ECLIC_SetVector(hwiNum, (rv_csr_t)handler);
+    }
+
+    /* Finally let the interrupt through. */
+    ECLIC_EnableIRQ(hwiNum);
+    return LOS_OK;
+}
+
+/*****************************************************************************
+ Function    : HalHwiDelete
+ Description : Delete a hardware interrupt: point its vector table entry back
+               at HalHwiDefaultHandler and disable it in the ECLIC
+ Input       : hwiNum   --- number of the interrupt to delete
+ Return      : LOS_OK on success, OS_ERRNO_HWI_NUM_INVALID when hwiNum is
+               out of range
+ *****************************************************************************/
+LITE_OS_SEC_TEXT UINT32 HalHwiDelete(HWI_HANDLE_T hwiNum)
+{
+    /* Same range check as HalHwiCreate, so an invalid number never reaches the ECLIC. */
+    if (hwiNum > SOC_INT_MAX) {
+        return OS_ERRNO_HWI_NUM_INVALID;
+    }
+
+    /* Restore the default handler; the cast mirrors the one used in HalHwiCreate. */
+    ECLIC_SetVector(hwiNum, (rv_csr_t)HalHwiDefaultHandler);
+    /* Disable the interrupt. */
+    ECLIC_DisableIRQ(hwiNum);
+    return LOS_OK;
+}
+
+/* ****************************************************************************
+ Function    : HalHwiDefaultHandler
+ Description : Default handler of the hardware interrupt: reports the event
+               and then halts in an endless loop
+ Input       : None
+ Output      : None
+ Return      : None (does not return)
+ **************************************************************************** */
+LITE_OS_SEC_TEXT_INIT VOID HalHwiDefaultHandler(VOID)
+{
+    PRINT_ERR("default handler\n");
+
+    /* Park here forever once the error has been reported. */
+    for (;;) {
+    }
+}
+
+/* ****************************************************************************
+ Function    : HalDisplayTaskInfo
+ Description : Print one line (ID, priority, status, name) for every task
+               whose information can be retrieved
+ Input       : None
+ Output      : None
+ Return      : None
+ **************************************************************************** */
+VOID HalDisplayTaskInfo(VOID)
+{
+    TSK_INFO_S info;
+    UINT32 taskId;
+
+    PRINTK("ID  Pri    Status     name \r\n");
+    PRINTK("--  ---    ---------  ----\r\n");
+
+    for (taskId = 0; taskId < LOSCFG_BASE_CORE_TSK_LIMIT; taskId++) {
+        /* IDs for which LOS_TaskInfoGet fails are silently skipped. */
+        if (LOS_TaskInfoGet(taskId, &info) == LOS_OK) {
+            PRINTK("%d    %d     %s      %s \r\n",
+                   info.uwTaskID, info.usTaskPrio, OsConvertTskStatus(info.usTaskStatus), info.acName);
+        }
+    }
+}
+
+/* ****************************************************************************
+ Function    : HalUnalignedAccessFix
+ Description : Weak default: unaligned access fixes are not supported, so
+               the function only reports that and fails
+ Input       : mcause   --- unused by this default implementation
+               mepc     --- unused by this default implementation
+               mtval    --- unused by this default implementation
+               sp       --- unused by this default implementation
+ Output      : None
+ Return      : LOS_NOK always
+ **************************************************************************** */
+WEAK UINT32 HalUnalignedAccessFix(UINTPTR mcause, UINTPTR mepc, UINTPTR mtval, VOID *sp)
+{
+    /* None of the trap details are needed to report the lack of support. */
+    (VOID)mcause;
+    (VOID)mepc;
+    (VOID)mtval;
+    (VOID)sp;
+
+    PRINTK("Unaligned acess fixes are not support by default!\r\n");
+    return LOS_NOK;
+}
+#ifdef __cplusplus
+#if __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* __cplusplus */
diff --git a/kernel/arch/risc-v/nuclei/gcc/los_timer.c b/kernel/arch/risc-v/nuclei/gcc/los_timer.c
new file mode 100644
index 00000000..ae0ce335
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/los_timer.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2013-2020, Huawei Technologies Co., Ltd. All rights reserved.
+ * Copyright (c) 2020-2021 Huawei Device Co., Ltd. All rights reserved.
+ * Copyright (c) 2021 Nuclei Limited. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ *    conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *    of conditions and the following disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors may be used
+ *    to endorse or promote products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "los_tick.h"
+#include "los_config.h"
+#include "los_arch_interrupt.h"
+#include "nuclei_sdk_hal.h"
+#include "los_timer.h"
+
+#ifdef __cplusplus
+#if __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+#endif /* __cplusplus */
+
+/* Interrupt level passed to ECLIC_SetLevelIRQ for the SysTimer and SysTimerSW interrupts. */
+#define configKERNEL_INTERRUPT_PRIORITY         0
+
+/* Number of timer counts per OS tick: timer frequency divided by the tick rate. */
+#define SYSTICK_TICK_CONST  (SOC_TIMER_FREQ / LOSCFG_BASE_CORE_TICK_PER_SECOND)
+
+/* Tick callback registered by HalTickStart; NULL until then. */
+static OS_TICK_HANDLER systick_handler = (OS_TICK_HANDLER)NULL;
+
+/* Interrupt counter defined elsewhere; HalTickStart resets it to 0. */
+extern UINT32 g_intCount;
+
+/*
+ * Start the system tick.
+ * Configures the SysTimer interrupt as non-vector and the SysTimerSW interrupt
+ * as vector, resets the tick bookkeeping globals and registers the tick callback.
+ *
+ * handler: callback invoked from the tick interrupt handler.
+ * Returns LOS_OK.
+ */
+WEAK UINT32 HalTickStart(OS_TICK_HANDLER handler)
+{
+    /* Program the timer period, then reconfigure its interrupt while it is disabled. */
+    SysTick_Config(SYSTICK_TICK_CONST);
+    ECLIC_DisableIRQ(SysTimer_IRQn);
+    ECLIC_SetLevelIRQ(SysTimer_IRQn, configKERNEL_INTERRUPT_PRIORITY);
+    ECLIC_SetShvIRQ(SysTimer_IRQn, ECLIC_NON_VECTOR_INTERRUPT);
+    ECLIC_EnableIRQ(SysTimer_IRQn);
+
+    /* Set SWI interrupt level to lowest level/priority, SysTimerSW as Vector Interrupt */
+    ECLIC_SetShvIRQ(SysTimerSW_IRQn, ECLIC_VECTOR_INTERRUPT);
+    ECLIC_SetLevelIRQ(SysTimerSW_IRQn, configKERNEL_INTERRUPT_PRIORITY);
+    ECLIC_EnableIRQ(SysTimerSW_IRQn);
+    /* Reset the clock and tick bookkeeping. */
+    g_sysClock = SystemCoreClock;
+    g_cyclesPerTick = g_sysClock / LOSCFG_BASE_CORE_TICK_PER_SECOND;
+    g_intCount = 0;
+    g_ullTickCount = 0;
+
+    /* The tick interrupt handler ignores ticks until this is non-NULL. */
+    systick_handler = handler;
+
+    return LOS_OK;
+}
+
+/* The tick handler is emitted under the symbol name eclic_mtip_handler. */
+#define HalTickSysTickHandler eclic_mtip_handler
+
+/*
+ * Timer interrupt handler: reloads the timer for the next tick and calls the
+ * callback registered through HalTickStart, all with interrupts locked.
+ */
+void HalTickSysTickHandler(void)
+{
+    UINT32 lockState = LOS_IntLock();
+
+    /* Re-arm the timer for the next tick period. */
+    SysTick_Reload(SYSTICK_TICK_CONST);
+
+    /* Nothing to call until a tick callback has been registered. */
+    if (systick_handler != NULL) {
+        systick_handler();
+    }
+
+    LOS_IntRestore(lockState);
+}
+/* ****************************************************************************
+Function    : HalGetCpuCycle
+Description : Read the 64-bit cycle counter as two 32-bit halves
+Input       : none
+output      : cntHi  --- upper 32 bits of the cycle count
+              cntLo  --- lower 32 bits of the cycle count
+return      : none
+**************************************************************************** */
+LITE_OS_SEC_TEXT_MINOR VOID HalGetCpuCycle(UINT32 *cntHi, UINT32 *cntLo)
+{
+    /* Read high, low, high so that a carry out of the low half can be detected. */
+    volatile uint32_t hiBefore = __RV_CSR_READ(CSR_MCYCLEH);
+    volatile uint32_t lo = __RV_CSR_READ(CSR_MCYCLE);
+    volatile uint32_t hiAfter = __RV_CSR_READ(CSR_MCYCLEH);
+
+    /* The high half changed in between: fetch a low half that pairs with hiAfter. */
+    if (hiBefore != hiAfter) {
+        lo = __RV_CSR_READ(CSR_MCYCLE);
+    }
+
+    *cntHi = hiAfter;
+    *cntLo = lo;
+}
+
+/* Weak default stub: performs no delay and returns immediately. */
+WEAK VOID HalDelay(UINT32 ticks)
+{
+    (VOID)ticks;
+}
+
+/*
+ * Weak default stub: returns the status constant LOS_OK instead of a real tick value.
+ * NOTE(review): LOS_OK is presumably 0 - confirm against its definition.
+ */
+WEAK UINT64 HalGetExpandTick(VOID)
+{
+    return LOS_OK;
+}
+
+/* Weak default stub: leaves *usec untouched and reports LOS_OK. */
+WEAK INT32 HalGetRtcTime(UINT64 *usec)
+{
+    (VOID)usec;
+    return LOS_OK;
+}
+
+/* Weak default stub: leaves *timeZone untouched and reports LOS_OK. */
+WEAK INT32 HalGetRtcTimeZone(INT32 *timeZone)
+{
+    (VOID)timeZone;
+    return LOS_OK;
+}
+
+/* Weak default stub: ignores both arguments and reports LOS_OK. */
+WEAK INT32 HalSetRtcTime(UINT64 utcTime, UINT64 *usec)
+{
+    (VOID)utcTime;
+    (VOID)usec;
+    return LOS_OK;
+}
+
+/* Weak default stub: ignores the requested time zone and reports LOS_OK. */
+WEAK INT32 HalSetRtcTimeZone(INT32 timeZone)
+{
+    (VOID)timeZone;
+    return LOS_OK;
+}
+
+#ifdef __cplusplus
+#if __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* __cplusplus */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_compatiable.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_compatiable.h
new file mode 100644
index 00000000..316a309f
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_compatiable.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_COMPATIABLE_H__
+#define __CORE_COMPATIABLE_H__
+/*!
+ * @file     core_compatiable.h
+ * @brief    ARM compatiable function definitions header file
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* ===== ARM Compatiable Functions ===== */
+/**
+ * \defgroup NMSIS_Core_ARMCompatiable_Functions   ARM Compatiable Functions
+ * \ingroup  NMSIS_Core
+ * \brief    A few functions that are compatible with ARM CMSIS-Core.
+ * \details
+ *
+ * Here we provide a few functions that are compatible with ARM CMSIS-Core,
+ * mostly used in the DSP and NN libraries.
+ * @{
+ */
+/* Note: all three ARM-style barriers below expand to the same __RWMB() barrier. */
+/** \brief Instruction Synchronization Barrier, compatible with ARM */
+#define __ISB()                             __RWMB()
+
+/** \brief Data Synchronization Barrier, compatible with ARM */
+#define __DSB()                             __RWMB()
+
+/** \brief Data Memory Barrier, compatible with ARM */
+#define __DMB()                             __RWMB()
+
+/** \brief LDRBT Unprivileged load (8 bit), ARM compatible; expands to __LB */
+#define __LDRBT(ptr)                        __LB((ptr))
+/** \brief LDRHT Unprivileged load (16 bit), ARM compatible; expands to __LH */
+#define __LDRHT(ptr)                        __LH((ptr))
+/** \brief LDRT Unprivileged load (32 bit), ARM compatible; expands to __LW */
+#define __LDRT(ptr)                         __LW((ptr))
+
+/* Note: the store macros take (val, ptr) but forward them as (ptr, val). */
+/** \brief STRBT Unprivileged store (8 bit), ARM compatible; expands to __SB */
+#define __STRBT(val, ptr)                   __SB((ptr), (val))
+/** \brief STRHT Unprivileged store (16 bit), ARM compatible; expands to __SH */
+#define __STRHT(val, ptr)                   __SH((ptr), (val))
+/** \brief STRT Unprivileged store (32 bit), ARM compatible; expands to __SW */
+#define __STRT(val, ptr)                    __SW((ptr), (val))
+
+/* ===== Saturation Operations ===== */
+/**
+ * \brief   Signed Saturate
+ * \details Saturates a signed value to the range of a signed integer that is
+ *          \c sat bits wide, i.e. [-2^(sat-1), 2^(sat-1) - 1].
+ * \param [in]    val  Value to be saturated
+ * \param [in]    sat  Bit position to saturate to (1..32)
+ * \return             Saturated value. The software fallback returns \c val
+ *                     unchanged when \c sat is outside 1..32.
+ */
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+/* Both arguments are parenthesized so that expression arguments expand correctly. */
+#define __SSAT(val, sat)          __RV_SCLIP32((val), ((sat) - 1))
+#else
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+    if ((sat >= 1U) && (sat <= 32U)) {
+        /* Largest and smallest values representable in sat bits. */
+        const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+        const int32_t min = -1 - max ;
+        if (val > max) {
+            return max;
+        } else if (val < min) {
+            return min;
+        }
+    }
+    return val;
+}
+#endif
+
+/**
+ * \brief   Unsigned Saturate
+ * \details Saturates a signed input to the unsigned range [0, 2^sat - 1].
+ * \param [in]    val  Value to be saturated
+ * \param [in]    sat  Bit position to saturate to (0..31)
+ * \return             Saturated value. The software fallback returns \c val
+ *                     converted to unsigned when \c sat is greater than 31.
+ */
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+#define __USAT(val, sat)        __RV_UCLIP32((val), (sat))
+#else
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+    uint32_t limit;
+
+    /* Out-of-range bit positions leave the value untouched. */
+    if (sat > 31U) {
+        return (uint32_t)val;
+    }
+
+    limit = (1U << sat) - 1U;
+    if (val > (int32_t)limit) {
+        return limit;
+    }
+    return (val < 0) ? 0U : (uint32_t)val;
+}
+#endif
+
+/* ===== Data Processing Operations ===== */
+/**
+ * \brief   Reverse byte order (32 bit)
+ * \details Reverses the order of the four bytes of an unsigned integer value.
+ * For example, 0x12345678 becomes 0x78563412.
+ * \param [in]    value  Value to reverse
+ * \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV(uint32_t value)
+{
+    /* Move each byte to its mirrored position: 3->0, 2->1, 1->2, 0->3. */
+    const uint32_t byte3to0 = value >> 24;
+    const uint32_t byte2to1 = (value >> 8) & 0x0000ff00U;
+    const uint32_t byte1to2 = (value << 8) & 0x00ff0000U;
+    const uint32_t byte0to3 = value << 24;
+
+    return byte3to0 | byte2to1 | byte1to2 | byte0to3;
+}
+
+/**
+ * \brief   Reverse byte order (16 bit)
+ * \details Reverses the byte order within each halfword of a word.
+ * For example, 0x12345678 becomes 0x34127856.
+ * \param [in]    value  Value to reverse
+ * \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value)
+{
+    /* Swap the two bytes of each halfword: the upper byte of every halfword
+     * moves down by 8 bits and the lower byte moves up by 8 bits. */
+    return ((value & 0xff00ff00U) >> 8) | ((value & 0x00ff00ffU) << 8);
+}
+
+/**
+ * \brief   Reverse byte order (16 bit, signed result)
+ * \details Swaps the two bytes of a 16-bit value
+ * and returns the result as a signed 16-bit value.
+ * For example, 0x0080 becomes 0x8000.
+ * \param [in]    value  Value to reverse
+ * \return               Reversed value
+ */
+__STATIC_FORCEINLINE int16_t __REVSH(int16_t value)
+{
+    /* Isolate each byte, then put it in the other byte's position. */
+    const int upperToLower = (value & 0xff00) >> 8;
+    const int lowerToUpper = (value & 0x00ff) << 8;
+    int16_t swapped = upperToLower | lowerToUpper;
+
+    return swapped;
+}
+
+/**
+ * \brief   Rotate Right in unsigned value (32 bit)
+ * \details Returns \c op1 rotated right by \c op2 bits; only the low five
+ * bits of \c op2 are used.
+ * \param [in]    op1  Value to rotate
+ * \param [in]    op2  Number of bits to rotate (0-31)
+ * \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+    const uint32_t shift = op2 & 0x1F;
+
+    /* A zero rotation is returned directly, which avoids shifting by 32 bits. */
+    return (shift == 0U) ? op1 : ((op1 >> shift) | (op1 << (32U - shift)));
+}
+
+/**
+ * \brief   Reverse bit order of value
+ * \details Reverses the bit order of the given 32-bit value.
+ * \param [in]    value  Value to reverse
+ * \return               Reversed value
+ */
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+#define __RBIT(value)           __RV_BITREVI((value), 31)
+#else
+__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value)
+{
+    uint32_t reversed = value;                  /* starts out holding the LSB of value */
+    uint32_t padding = (4U * 8U) - 1U;          /* shift still owed once value runs out of set bits */
+
+    value >>= 1U;
+    while (value != 0U) {
+        /* Append the next input bit below the bits collected so far. */
+        reversed = (reversed << 1U) | (value & 1U);
+        padding--;
+        value >>= 1U;
+    }
+
+    /* Account for the leading zero bits of the input that were never visited. */
+    return reversed << padding;
+}
+#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */
+
+/**
+ * \brief   Count leading zeros
+ * \details Counts the number of leading zero bits of a 32-bit data value;
+ *          the software fallback yields 32 for an input of 0.
+ * \param [in]  data  Value to count the leading zeros
+ * \return             number of leading zeros in value
+ */
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+#define __CLZ(data)         __RV_CLZ32(data)
+#else
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data)
+{
+    uint8_t zeros = 0;
+    uint32_t probe;
+
+    /* Walk a single-bit mask down from the MSB until it meets a set bit. */
+    for (probe = 0x80000000U; (probe != 0U) && ((data & probe) == 0U); probe >>= 1) {
+        zeros++;
+    }
+    return zeros;
+}
+#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_ARMCompatiable_Functions */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __CORE_COMPATIABLE_H__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_base.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_base.h
new file mode 100644
index 00000000..5f351a33
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_base.h
@@ -0,0 +1,1177 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CORE_FEATURE_BASE__
+#define __CORE_FEATURE_BASE__
+/*!
+ * @file     core_feature_base.h
+ * @brief    Base core feature API for Nuclei N/NX Core
+ */
+#include <stdint.h>
+#include "riscv_encoding.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/**
+ * \defgroup NMSIS_Core_Registers     Register Define and Type Definitions
+ * \brief   Type definitions and defines for core registers.
+ *
+ * @{
+ */
+#ifndef __RISCV_XLEN
+  /**
+   * \brief Width of an integer register in bits (either 32 or 64).
+   * Taken from the __riscv_xlen macro when it is defined; defaults to 32 otherwise.
+   */
+  #ifndef __riscv_xlen
+    #define __RISCV_XLEN    32
+  #else
+    #define __RISCV_XLEN    __riscv_xlen
+  #endif
+#endif /* __RISCV_XLEN */
+
+/**
+ * \brief Type of a Control and Status Register (CSR); its width follows __RISCV_XLEN.
+ * Any __RISCV_XLEN value other than 32 or 64 falls back to the 32-bit type.
+ */
+#if __RISCV_XLEN == 32
+  typedef uint32_t rv_csr_t;
+#elif __RISCV_XLEN == 64
+  typedef uint64_t rv_csr_t;
+#else
+  typedef uint32_t rv_csr_t;
+#endif
+/** @} */ /* End of Doxygen Group NMSIS_Core_Registers */
+/**
+ * \defgroup NMSIS_Core_Base_Registers     Base Register Define and Type Definitions
+ * \ingroup NMSIS_Core_Registers
+ * \brief   Type definitions and defines for base core registers.
+ *
+ * @{
+ */
+/**
+ * \brief  Union type to access MISA register.
+ * \details Member \c b gives access to the individual bit fields, member \c d
+ * to the whole register value. The bit positions listed below assume that the
+ * compiler allocates bit fields starting from the least significant bit.
+ */
+typedef union {
+    struct {
+        rv_csr_t a:1;                           /*!< bit:     0  Atomic extension */
+        rv_csr_t b:1;                           /*!< bit:     1  Tentatively reserved for Bit-Manipulation extension */
+        rv_csr_t c:1;                           /*!< bit:     2  Compressed extension */
+        rv_csr_t d:1;                           /*!< bit:     3  Double-precision floating-point extension */
+        rv_csr_t e:1;                           /*!< bit:     4  RV32E base ISA */
+        rv_csr_t f:1;                           /*!< bit:     5  Single-precision floating-point extension */
+        rv_csr_t g:1;                           /*!< bit:     6  Additional standard extensions present */
+        rv_csr_t h:1;                           /*!< bit:     7  Hypervisor extension */
+        rv_csr_t i:1;                           /*!< bit:     8  RV32I/64I/128I base ISA */
+        rv_csr_t j:1;                           /*!< bit:     9  Tentatively reserved for Dynamically Translated Languages extension */
+        rv_csr_t _reserved1:1;                  /*!< bit:     10 Reserved  */
+        rv_csr_t l:1;                           /*!< bit:     11 Tentatively reserved for Decimal Floating-Point extension  */
+        rv_csr_t m:1;                           /*!< bit:     12 Integer Multiply/Divide extension */
+        rv_csr_t n:1;                           /*!< bit:     13 User-level interrupts supported  */
+        rv_csr_t _reserved2:1;                  /*!< bit:     14 Reserved  */
+        rv_csr_t p:1;                           /*!< bit:     15 Tentatively reserved for Packed-SIMD extension  */
+        rv_csr_t q:1;                           /*!< bit:     16 Quad-precision floating-point extension  */
+        rv_csr_t _resreved3:1;                  /*!< bit:     17 Reserved (note: the member name is spelled "_resreved3")  */
+        rv_csr_t s:1;                           /*!< bit:     18 Supervisor mode implemented  */
+        rv_csr_t t:1;                           /*!< bit:     19 Tentatively reserved for Transactional Memory extension  */
+        rv_csr_t u:1;                           /*!< bit:     20 User mode implemented  */
+        rv_csr_t v:1;                           /*!< bit:     21 Tentatively reserved for Vector extension  */
+        rv_csr_t _reserved4:1;                  /*!< bit:     22 Reserved  */
+        rv_csr_t x:1;                           /*!< bit:     23 Non-standard extensions present  */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved5:38;                 /*!< bit:     24..61 Reserved  */
+        rv_csr_t mxl:2;                         /*!< bit:     62..63 Machine XLEN  */
+#else
+        rv_csr_t _reserved5:6;                  /*!< bit:     24..29 Reserved  */
+        rv_csr_t mxl:2;                         /*!< bit:     30..31 Machine XLEN  */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MISA_Type;
+
+/**
+ * \brief  Union type to access MSTATUS configure register.
+ * \details Member \c b gives access to the individual bit fields, member \c d
+ * to the whole register value. The bit positions listed below follow from the
+ * field widths, assuming allocation from the least significant bit.
+ * NOTE(review): the 64-bit layout does not name the sie/spie/sum fields that
+ * the 32-bit layout has - confirm whether that is intended.
+ */
+typedef union {
+    struct {
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved0:3;                  /*!< bit:     0..2  Reserved */
+        rv_csr_t mie:1;                         /*!< bit:     3  Machine mode interrupt enable flag */
+        rv_csr_t _reserved1:3;                  /*!< bit:     4..6  Reserved */
+        rv_csr_t mpie:1;                        /*!< bit:     7  mirror of MIE flag */
+        rv_csr_t _reserved2:3;                  /*!< bit:     8..10  Reserved */
+        rv_csr_t mpp:2;                         /*!< bit:     11..12 mirror of Privilege Mode */
+        rv_csr_t fs:2;                          /*!< bit:     13..14 FS status flag */
+        rv_csr_t xs:2;                          /*!< bit:     15..16 XS status flag */
+        rv_csr_t mprv:1;                        /*!< bit:     17 Machine mode PMP */
+        rv_csr_t _reserved3:14;                 /*!< bit:     18..31 Reserved */
+        rv_csr_t uxl:2;                         /*!< bit:     32..33 user mode xlen */
+        rv_csr_t _reserved6:29;                 /*!< bit:     34..62 Reserved  */
+        rv_csr_t sd:1;                          /*!< bit:     63 Dirty status for XS or FS */
+#else
+        rv_csr_t _reserved0:1;                  /*!< bit:     0  Reserved */
+        rv_csr_t sie:1;                         /*!< bit:     1  supervisor interrupt enable flag */
+        rv_csr_t _reserved1:1;                  /*!< bit:     2  Reserved */
+        rv_csr_t mie:1;                         /*!< bit:     3  Machine mode interrupt enable flag */
+        rv_csr_t _reserved2:1;                  /*!< bit:     4  Reserved */
+        rv_csr_t spie:1;                        /*!< bit:     5  Supervisor Privilege mode interrupt enable flag */
+        rv_csr_t _reserved3:1;                  /*!< bit:     6  Reserved */
+        rv_csr_t mpie:1;                        /*!< bit:     7  mirror of MIE flag */
+        rv_csr_t _reserved4:3;                  /*!< bit:     8..10  Reserved */
+        rv_csr_t mpp:2;                         /*!< bit:     11..12 mirror of Privilege Mode */
+        rv_csr_t fs:2;                          /*!< bit:     13..14 FS status flag */
+        rv_csr_t xs:2;                          /*!< bit:     15..16 XS status flag */
+        rv_csr_t mprv:1;                        /*!< bit:     17 Machine mode PMP */
+        rv_csr_t sum:1;                         /*!< bit:     18 Supervisor Mode load and store protection */
+        rv_csr_t _reserved6:12;                 /*!< bit:     19..30 Reserved  */
+        rv_csr_t sd:1;                          /*!< bit:     31 Dirty status for XS or FS */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MSTATUS_Type;
+
+/**
+ * \brief  Union type to access MTVEC configure register.
+ * \details Member \c b splits the register into a 6-bit mode field and the
+ * address field that fills the remaining bits; member \c d is the whole
+ * register value.
+ */
+typedef union {
+    struct {
+        rv_csr_t mode:6;                        /*!< bit:     0..5   interrupt mode control */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t addr:58;                       /*!< bit:     6..63  mtvec address */
+#else
+        rv_csr_t addr:26;                       /*!< bit:     6..31  mtvec address */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MTVEC_Type;
+
+/**
+ * \brief  Union type to access MCAUSE configure register.
+ * \details Member \c b gives access to the individual bit fields, member \c d
+ * to the whole register value. The \c interrupt flag is always the most
+ * significant bit (bit 31 or bit 63, depending on __RISCV_XLEN).
+ */
+typedef union {
+    struct {
+        rv_csr_t exccode:12;                    /*!< bit:     11..0  exception or interrupt code */
+        rv_csr_t _reserved0:4;                  /*!< bit:     15..12  Reserved */
+        rv_csr_t mpil:8;                        /*!< bit:     23..16  Previous interrupt level */
+        rv_csr_t _reserved1:3;                  /*!< bit:     26..24  Reserved */
+        rv_csr_t mpie:1;                        /*!< bit:     27  Interrupt enable flag before entering the interrupt */
+        rv_csr_t mpp:2;                         /*!< bit:     29..28  Privilege mode flag before entering the interrupt */
+        rv_csr_t minhv:1;                       /*!< bit:     30  Machine interrupt vector table */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved2:32;                 /*!< bit:     31..62  Reserved */
+        rv_csr_t interrupt:1;                   /*!< bit:     63  trap type. 0 means exception and 1 means interrupt */
+#else
+        rv_csr_t interrupt:1;                   /*!< bit:     31  trap type. 0 means exception and 1 means interrupt */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MCAUSE_Type;
+
+/**
+ * \brief  Union type to access MCOUNTINHIBIT configure register.
+ * \details Member \c b gives access to the two counter-disable flags, member
+ * \c d to the whole register value.
+ */
+typedef union {
+    struct {
+        rv_csr_t cy:1;                          /*!< bit:     0     1 means disable mcycle counter */
+        rv_csr_t _reserved0:1;                  /*!< bit:     1     Reserved */
+        rv_csr_t ir:1;                          /*!< bit:     2     1 means disable minstret counter */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved1:61;                 /*!< bit:     3..63 Reserved */
+#else
+        rv_csr_t _reserved1:29;                 /*!< bit:     3..31 Reserved */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MCOUNTINHIBIT_Type;
+
+/**
+ * \brief  Union type to access msubm configure register.
+ * \details Member \c b gives access to the current and previous trap type
+ * fields, member \c d to the whole register value.
+ */
+typedef union {
+    struct {
+        rv_csr_t _reserved0:6;                  /*!< bit:     0..5   Reserved */
+        rv_csr_t typ:2;                         /*!< bit:     6..7   current trap type */
+        rv_csr_t ptyp:2;                        /*!< bit:     8..9   previous trap type */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved1:54;                 /*!< bit:     10..63 Reserved */
+#else
+        rv_csr_t _reserved1:22;                 /*!< bit:     10..31 Reserved */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MSUBM_Type;
+
+/**
+ * \brief  Union type to access MMISC_CTRL configure register.
+ * \details Member \c b gives access to the individual bit fields, member \c d
+ * to the whole register value.
+ */
+typedef union {
+    struct {
+        rv_csr_t _reserved0:3;                  /*!< bit:     0..2  Reserved */
+        rv_csr_t bpu:1;                         /*!< bit:     3     dynamic prediction enable flag */
+        rv_csr_t _reserved1:2;                  /*!< bit:     4..5  Reserved */
+        rv_csr_t misalign:1;                    /*!< bit:     6     misaligned access support flag */
+        rv_csr_t _reserved2:2;                  /*!< bit:     7..8  Reserved */
+        rv_csr_t nmi_cause:1;                   /*!< bit:     9     mnvec control and NMI mcause exccode */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved3:54;                 /*!< bit:     10..63 Reserved */
+#else
+        rv_csr_t _reserved3:22;                 /*!< bit:     10..31 Reserved */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MMISCCTRL_Type;
+
+
+/**
+ * \brief  Union type to access the MSAVESTATUS CSR either as named bit fields (b) or as one raw value (w).
+ */
+typedef union {
+    struct {
+        rv_csr_t mpie1:1;                       /*!< bit:     0     Interrupt enable flag of first level NMI/exception nesting */
+        rv_csr_t mpp1:2;                        /*!< bit:     1..2  Privilege mode of first level NMI/exception nesting */
+        rv_csr_t _reserved0:3;                  /*!< bit:     3..5  Reserved */
+        rv_csr_t ptyp1:2;                       /*!< bit:     6..7  NMI/exception type before first nesting */
+        rv_csr_t mpie2:1;                       /*!< bit:     8     Interrupt enable flag of second level NMI/exception nesting */
+        rv_csr_t mpp2:2;                        /*!< bit:     9..10 Privilege mode of second level NMI/exception nesting */
+        rv_csr_t _reserved1:3;                  /*!< bit:     11..13     Reserved */
+        rv_csr_t ptyp2:2;                       /*!< bit:     14..15     NMI/exception type before second nesting */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved2:48;                 /*!< bit:     16..63 Reserved (fields total 64 bits when __RISCV_XLEN is 64) */
+#else
+        rv_csr_t _reserved2:16;                 /*!< bit:     16..31 Reserved (fields total 32 bits otherwise) */
+#endif
+    } b;                                        /*!< Structure used for bit-field access */
+    rv_csr_t w;                                 /*!< Raw value used for whole-CSR data access (this member is named w, not d) */
+} CSR_MSAVESTATUS_Type;
+/** @} */ /* End of Doxygen Group NMSIS_Core_Base_Registers */
+
+/* ###########################  Core Function Access  ########################### */
+/**
+ * \defgroup NMSIS_Core_CSR_Register_Access    Core CSR Register Access
+ * \ingroup  NMSIS_Core
+ * \brief    Functions to access the Core CSR Registers
+ * \details
+ *
+ * The following functions or macros provide access to Core CSR registers.
+ * - \ref NMSIS_Core_CSR_Encoding
+ * - \ref NMSIS_Core_CSR_Registers
+ *   @{
+ */
+
+
+#ifndef __ASSEMBLY__
+
+/**
+ * \brief CSR operation Macro for csrrw instruction.
+ * \details
+ * Exchange a CSR with val using a single csrrw: the previous CSR content is
+ * read into __v while val is written into the csr register, then __v is returned
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   value to store into the CSR register (converted to rv_csr_t)
+ * \return the CSR register value before written
+ */
+#define __RV_CSR_SWAP(csr, val)                                 \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrrw %0, " STRINGIFY(csr) ", %1"       \
+                     : "=r"(__v)                                \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+        __v;                                                    \
+    })
+
+/**
+ * \brief CSR operation Macro for csrr instruction.
+ * \details
+ * Read the content of csr register into __v with one csrr and yield __v as the macro value
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \return the CSR register value
+ */
+#define __RV_CSR_READ(csr)                                      \
+    ({                                                          \
+        register rv_csr_t __v;                                  \
+        __ASM volatile("csrr %0, " STRINGIFY(csr)               \
+                     : "=r"(__v)                                \
+                     :                                          \
+                     : "memory");                               \
+        __v;                                                    \
+    })
+
+/**
+ * \brief CSR operation Macro for csrw instruction.
+ * \details
+ * Write the content of val (converted to rv_csr_t) to csr register; the macro yields no value
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   value to store into the CSR register
+ */
+#define __RV_CSR_WRITE(csr, val)                                \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrw " STRINGIFY(csr) ", %0"            \
+                     :                                          \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+    })
+
+/**
+ * \brief CSR operation Macro for csrrs instruction.
+ * \details
+ * Read the content of csr register to __v, then set csr register
+ * to be __v | val (every bit set in val becomes 1), then return __v
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   Mask value to be used with csrrs instruction
+ * \return the CSR register value before written
+ */
+#define __RV_CSR_READ_SET(csr, val)                             \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrrs %0, " STRINGIFY(csr) ", %1"       \
+                     : "=r"(__v)                                \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+        __v;                                                    \
+    })
+
+/**
+ * \brief CSR operation Macro for csrs instruction.
+ * \details
+ * Set csr register to be csr_content | val; the previous content is not returned
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   Mask value to be used with csrs instruction
+ */
+#define __RV_CSR_SET(csr, val)                                  \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrs " STRINGIFY(csr) ", %0"            \
+                     :                                          \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+    })
+
+/**
+ * \brief CSR operation Macro for csrrc instruction.
+ * \details
+ * Read the content of csr register to __v, then set csr register
+ * to be __v & ~val (every bit set in val becomes 0), then return __v
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   Mask value to be used with csrrc instruction
+ * \return the CSR register value before written
+ */
+#define __RV_CSR_READ_CLEAR(csr, val)                           \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrrc %0, " STRINGIFY(csr) ", %1"       \
+                     : "=r"(__v)                                \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+        __v;                                                    \
+    })
+
+/**
+ * \brief CSR operation Macro for csrc instruction.
+ * \details
+ * Set csr register to be csr_content & ~val; the previous content is not returned
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   Mask value to be used with csrc instruction
+ */
+#define __RV_CSR_CLEAR(csr, val)                                \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrc " STRINGIFY(csr) ", %0"            \
+                     :                                          \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+    })
+#endif /* __ASSEMBLY__ */
+
+/**
+ * \brief   Enable IRQ Interrupts
+ * \details Enables IRQ interrupts by setting the MIE-bit (MSTATUS_MIE) in the MSTATUS Register via __RV_CSR_SET.
+ * \remarks
+ *          Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_irq(void)
+{
+    __RV_CSR_SET(CSR_MSTATUS, MSTATUS_MIE);
+}
+
+/**
+ * \brief   Disable IRQ Interrupts
+ * \details Disables IRQ interrupts by clearing the MIE-bit (MSTATUS_MIE) in the MSTATUS Register via __RV_CSR_CLEAR.
+ * \remarks
+ *          Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_irq(void)
+{
+    __RV_CSR_CLEAR(CSR_MSTATUS, MSTATUS_MIE);
+}
+
+/**
+ * \brief   Read whole 64 bits value of mcycle counter
+ * \details Returns the MCYCLE counter as one 64 bits value
+ * \return  The whole 64 bits value of MCYCLE
+ * \remarks On RV32 the value is assembled from MCYCLEH and MCYCLE; on other XLEN MCYCLE is read once
+ */
+__STATIC_FORCEINLINE uint64_t __get_rv_cycle(void)
+{
+#if __RISCV_XLEN == 32
+    volatile uint32_t hi_first, lo_word, hi_second;
+
+    /* Sample high, low, high so a change of the upper half can be noticed */
+    hi_first = __RV_CSR_READ(CSR_MCYCLEH);
+    lo_word = __RV_CSR_READ(CSR_MCYCLE);
+    hi_second = __RV_CSR_READ(CSR_MCYCLEH);
+    if (hi_first != hi_second) {
+        /* Upper half moved between the two samples: take the lower half again */
+        lo_word = __RV_CSR_READ(CSR_MCYCLE);
+    }
+    return (((uint64_t)hi_second) << 32) | lo_word;
+#elif __RISCV_XLEN == 64
+    return (uint64_t)__RV_CSR_READ(CSR_MCYCLE);
+#else /* TODO Need cover for XLEN=128 case in future */
+    return (uint64_t)__RV_CSR_READ(CSR_MCYCLE);
+#endif
+}
+
+/**
+ * \brief   Read whole 64 bits value of machine instruction-retired counter
+ * \details Returns the MINSTRET counter as one 64 bits value
+ * \return  The whole 64 bits value of MINSTRET
+ * \remarks On RV32 the value is assembled from MINSTRETH and MINSTRET; on other XLEN MINSTRET is read once
+ */
+__STATIC_FORCEINLINE uint64_t __get_rv_instret(void)
+{
+#if __RISCV_XLEN == 32
+    volatile uint32_t hi_first, lo_word, hi_second;
+
+    /* Sample high, low, high so a change of the upper half can be noticed */
+    hi_first = __RV_CSR_READ(CSR_MINSTRETH);
+    lo_word = __RV_CSR_READ(CSR_MINSTRET);
+    hi_second = __RV_CSR_READ(CSR_MINSTRETH);
+    if (hi_first != hi_second) {
+        /* Upper half moved between the two samples: take the lower half again */
+        lo_word = __RV_CSR_READ(CSR_MINSTRET);
+    }
+    return (((uint64_t)hi_second) << 32) | lo_word;
+#elif __RISCV_XLEN == 64
+    return (uint64_t)__RV_CSR_READ(CSR_MINSTRET);
+#else /* TODO Need cover for XLEN=128 case in future */
+    return (uint64_t)__RV_CSR_READ(CSR_MINSTRET);
+#endif
+}
+
+/**
+ * \brief   Read whole 64 bits value of real-time clock
+ * \details Returns the TIME CSR as one 64 bits value
+ * \return  The whole 64 bits value of TIME CSR
+ * \remarks On RV32 the value is assembled from TIMEH and TIME; on other XLEN TIME is read once
+ * \attention only available when user mode available
+ */
+__STATIC_FORCEINLINE uint64_t __get_rv_time(void)
+{
+#if __RISCV_XLEN == 32
+    volatile uint32_t hi_first, lo_word, hi_second;
+
+    /* Sample high, low, high so a change of the upper half can be noticed */
+    hi_first = __RV_CSR_READ(CSR_TIMEH);
+    lo_word = __RV_CSR_READ(CSR_TIME);
+    hi_second = __RV_CSR_READ(CSR_TIMEH);
+    if (hi_first != hi_second) {
+        /* Upper half moved between the two samples: take the lower half again */
+        lo_word = __RV_CSR_READ(CSR_TIME);
+    }
+    return (((uint64_t)hi_second) << 32) | lo_word;
+#elif __RISCV_XLEN == 64
+    return (uint64_t)__RV_CSR_READ(CSR_TIME);
+#else /* TODO Need cover for XLEN=128 case in future */
+    return (uint64_t)__RV_CSR_READ(CSR_TIME);
+#endif
+}
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_CSR_Register_Access */
+
+/* ###########################  CPU Intrinsic Functions ########################### */
+/**
+ * \defgroup NMSIS_Core_CPU_Intrinsic   Intrinsic Functions for CPU Instructions
+ * \ingroup  NMSIS_Core
+ * \brief    Functions that generate RISC-V CPU instructions.
+ * \details
+ *
+ * The following functions generate specified RISC-V instructions that cannot be directly accessed by compiler.
+ *   @{
+ */
+
+/**
+ * \brief   NOP Instruction
+ * \details
+ * Emits a single nop instruction, which performs no operation.
+ * This instruction can be used for code alignment purposes.
+ */
+__STATIC_FORCEINLINE void __NOP(void)
+{
+    __ASM volatile("nop");
+}
+
+/**
+ * \brief   Wait For Interrupt
+ * \details
+ * Wait For Interrupt is executed by clearing WFE_WFE in CSR_WFE and then issuing the WFI instruction.
+ * It suspends execution until an interrupt, NMI or Debug event happens.
+ * When the Core is woken up by an interrupt, if
+ * 1. mstatus.MIE == 1(interrupt enabled), Core will enter ISR code
+ * 2. mstatus.MIE == 0(interrupt disabled), Core will resume previous execution
+ */
+__STATIC_FORCEINLINE void __WFI(void)
+{
+    __RV_CSR_CLEAR(CSR_WFE, WFE_WFE);
+    __ASM volatile("wfi");
+}
+
+/**
+ * \brief   Wait For Event
+ * \details
+ * Wait For Event sets WFE_WFE in CSR_WFE, issues the WFI instruction, then clears WFE_WFE again.
+ * It suspends execution until an event, NMI or Debug event happens.
+ * When the Core is woken up, Core will resume previous execution
+ */
+__STATIC_FORCEINLINE void __WFE(void)
+{
+    __RV_CSR_SET(CSR_WFE, WFE_WFE);
+    __ASM volatile("wfi");
+    __RV_CSR_CLEAR(CSR_WFE, WFE_WFE);
+}
+
+/**
+ * \brief   Breakpoint Instruction
+ * \details
+ * Emits a single ebreak instruction, which causes the processor to enter Debug state.
+ * Debug tools can use this to investigate system state
+ * when the instruction at a particular address is reached.
+ */
+__STATIC_FORCEINLINE void __EBREAK(void)
+{
+    __ASM volatile("ebreak");
+}
+
+/**
+ * \brief   Environment Call Instruction
+ * \details
+ * Emits a single ecall instruction, which is used to make a service request to
+ * the execution environment. No arguments are set up by this function.
+ */
+__STATIC_FORCEINLINE void __ECALL(void)
+{
+    __ASM volatile("ecall");
+}
+
+/**
+ * \brief WFI Sleep Mode enumeration, accepted by \ref __set_wfi_sleepmode
+ */
+typedef enum WFI_SleepMode {
+    WFI_SHALLOW_SLEEP = 0,      /*!< Shallow sleep mode (value 0), the core_clk will power off */
+    WFI_DEEP_SLEEP = 1          /*!< Deep sleep mode (value 1), the core_clk and core_ano_clk will power off */
+} WFI_SleepMode_Type;
+
+/**
+ * \brief   Set Sleep mode of WFI
+ * \details
+ * Write mode into the SLEEPVALUE CSR register (CSR_SLEEPVALUE) to control the
+ * WFI Sleep mode. The whole CSR is overwritten with the enumerator value.
+ * \param[in] mode      The sleep mode to be set
+ */
+__STATIC_FORCEINLINE void __set_wfi_sleepmode(WFI_SleepMode_Type mode)
+{
+    __RV_CSR_WRITE(CSR_SLEEPVALUE, mode);
+}
+
+/**
+ * \brief   Send TX Event
+ * \details
+ * Set bit 0 (mask 0x1) of the CSR TXEVT to send a TX Event.
+ * The Core will output signal tx_evt as output event signal.
+ */
+__STATIC_FORCEINLINE void __TXEVT(void)
+{
+    __RV_CSR_SET(CSR_TXEVT, 0x1);
+}
+
+/**
+ * \brief   Enable MCYCLE counter
+ * \details
+ * Clear the CY bit (MCOUNTINHIBIT_CY) of the MCOUNTINHIBIT CSR to 0 to enable MCYCLE Counter
+ */
+__STATIC_FORCEINLINE void __enable_mcycle_counter(void)
+{
+    __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_CY);
+}
+
+/**
+ * \brief   Disable MCYCLE counter
+ * \details
+ * Set the CY bit (MCOUNTINHIBIT_CY) of the MCOUNTINHIBIT CSR to 1 to disable MCYCLE Counter
+ */
+__STATIC_FORCEINLINE void __disable_mcycle_counter(void)
+{
+    __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_CY);
+}
+
+/**
+ * \brief   Enable MINSTRET counter
+ * \details
+ * Clear the IR bit (MCOUNTINHIBIT_IR) of the MCOUNTINHIBIT CSR to 0 to enable MINSTRET Counter
+ */
+__STATIC_FORCEINLINE void __enable_minstret_counter(void)
+{
+    __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR);
+}
+
+/**
+ * \brief   Disable MINSTRET counter
+ * \details
+ * Set the IR bit (MCOUNTINHIBIT_IR) of the MCOUNTINHIBIT CSR to 1 to disable MINSTRET Counter
+ */
+__STATIC_FORCEINLINE void __disable_minstret_counter(void)
+{
+    __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR);
+}
+
+/**
+ * \brief   Enable MCYCLE & MINSTRET counter
+ * \details
+ * Clear the IR and CY bits of MCOUNTINHIBIT to 0 in one CSR access to enable MINSTRET & MCYCLE Counter
+ */
+__STATIC_FORCEINLINE void __enable_all_counter(void)
+{
+    __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR|MCOUNTINHIBIT_CY);
+}
+
+/**
+ * \brief   Disable MCYCLE & MINSTRET counter
+ * \details
+ * Set the IR and CY bits of MCOUNTINHIBIT to 1 in one CSR access to disable MINSTRET & MCYCLE Counter
+ */
+__STATIC_FORCEINLINE void __disable_all_counter(void)
+{
+    __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR|MCOUNTINHIBIT_CY);
+}
+
+/**
+ * \brief Execute fence instruction, p -> pred, s -> succ
+ * \details
+ * The FENCE instruction ensures that all memory accesses from instructions preceding
+ * the fence in program order (the `predecessor set`) appear earlier in the global memory order than
+ * memory accesses from instructions appearing after the fence in program order (the `successor set`).
+ * Both arguments are stringized into the instruction text, so pass bare tokens; see The RISC-V Instruction Set Manual
+ * \param p     predecessor set, such as iorw, rw, r, w
+ * \param s     successor set, such as iorw, rw, r, w
+ */
+#define __FENCE(p, s) __ASM volatile ("fence " #p "," #s : : : "memory")
+
+/**
+ * \brief   Fence.i Instruction
+ * \details
+ * The FENCE.I instruction is used to synchronize the instruction
+ * and data streams. The asm clobbers memory so the compiler keeps earlier stores ahead of it.
+ */
+__STATIC_FORCEINLINE void __FENCE_I(void)
+{
+    __ASM volatile("fence.i" : : : "memory");
+}
+
+/** \brief Read & Write Memory barrier, expands to fence iorw,iorw */
+#define __RWMB()        __FENCE(iorw,iorw)
+
+/** \brief Read Memory barrier, expands to fence ir,ir */
+#define __RMB()         __FENCE(ir,ir)
+
+/** \brief Write Memory barrier, expands to fence ow,ow */
+#define __WMB()         __FENCE(ow,ow)
+
+/** \brief SMP Read & Write Memory barrier, expands to fence rw,rw */
+#define __SMP_RWMB()    __FENCE(rw,rw)
+
+/** \brief SMP Read Memory barrier, expands to fence r,r */
+#define __SMP_RMB()     __FENCE(r,r)
+
+/** \brief SMP Write Memory barrier, expands to fence w,w */
+#define __SMP_WMB()     __FENCE(w,w)
+
+/** \brief CPU relax for busy loop: an empty asm with a memory clobber, i.e. a compiler-only barrier */
+#define __CPU_RELAX()   __ASM volatile ("" : : : "memory")
+
+
+/* ===== Load/Store Operations ===== */
+/**
+ * \brief  Load 8bit value from address (8 bit)
+ * \details Load 8 bit value with lb; the asm clobbers memory so the compiler keeps surrounding accesses ordered around it.
+ * \param [in]    addr  Address pointer to data
+ * \return              value of type uint8_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint8_t __LB(volatile void *addr)
+{
+    uint8_t result;
+
+    __ASM volatile ("lb %0, 0(%1)" : "=r" (result) : "r" (addr) : "memory");
+    return result;
+}
+
+/**
+ * \brief  Load 16bit value from address (16 bit)
+ * \details Load 16 bit value with lh; the asm clobbers memory so the compiler keeps surrounding accesses ordered around it.
+ * \param [in]    addr  Address pointer to data
+ * \return              value of type uint16_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint16_t __LH(volatile void *addr)
+{
+    uint16_t result;
+
+    __ASM volatile ("lh %0, 0(%1)" : "=r" (result) : "r" (addr) : "memory");
+    return result;
+}
+
+/**
+ * \brief  Load 32bit value from address (32 bit)
+ * \details Load 32 bit value with lw; the asm clobbers memory so the compiler keeps surrounding accesses ordered around it.
+ * \param [in]    addr  Address pointer to data
+ * \return              value of type uint32_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint32_t __LW(volatile void *addr)
+{
+    uint32_t result;
+
+    __ASM volatile ("lw %0, 0(%1)" : "=r" (result) : "r" (addr) : "memory");
+    return result;
+}
+
+#if __RISCV_XLEN != 32
+/**
+ * \brief  Load 64bit value from address (64 bit)
+ * \details Load 64 bit value with ld; the asm clobbers memory so the compiler keeps surrounding accesses ordered around it.
+ * \param [in]    addr  Address pointer to data
+ * \return              value of type uint64_t at (*addr)
+ * \remarks RV64 only function (compiled only when __RISCV_XLEN != 32)
+ */
+__STATIC_FORCEINLINE uint64_t __LD(volatile void *addr)
+{
+    uint64_t result;
+    __ASM volatile ("ld %0, 0(%1)" : "=r" (result) : "r" (addr) : "memory");
+    return result;
+}
+#endif
+
+/**
+ * \brief  Write 8bit value to address (8 bit)
+ * \details Write 8 bit value with sb; the asm clobbers memory so the compiler knows the target object changed.
+ * \param [in]    addr  Address pointer to data
+ * \param [in]    val   Value to set
+ */
+__STATIC_FORCEINLINE void __SB(volatile void *addr, uint8_t val)
+{
+    __ASM volatile ("sb %0, 0(%1)" : : "r" (val), "r" (addr) : "memory");
+}
+
+/**
+ * \brief  Write 16bit value to address (16 bit)
+ * \details Write 16 bit value with sh; the asm clobbers memory so the compiler knows the target object changed.
+ * \param [in]    addr  Address pointer to data
+ * \param [in]    val   Value to set
+ */
+__STATIC_FORCEINLINE void __SH(volatile void *addr, uint16_t val)
+{
+    __ASM volatile ("sh %0, 0(%1)" : : "r" (val), "r" (addr) : "memory");
+}
+
+/**
+ * \brief  Write 32bit value to address (32 bit)
+ * \details Write 32 bit value with sw; the asm clobbers memory so the compiler knows the target object changed.
+ * \param [in]    addr  Address pointer to data
+ * \param [in]    val   Value to set
+ */
+__STATIC_FORCEINLINE void __SW(volatile void *addr, uint32_t val)
+{
+    __ASM volatile ("sw %0, 0(%1)" : : "r" (val), "r" (addr) : "memory");
+}
+
+#if __RISCV_XLEN != 32
+/**
+ * \brief  Write 64bit value to address (64 bit)
+ * \details Write 64 bit value with sd; the asm clobbers memory so the compiler knows the target object changed.
+ * \param [in]    addr  Address pointer to data
+ * \param [in]    val   Value to set
+ */
+__STATIC_FORCEINLINE void __SD(volatile void *addr, uint64_t val)
+{
+    __ASM volatile ("sd %0, 0(%1)" : : "r" (val), "r" (addr) : "memory");
+}
+#endif
+
+/**
+ * \brief  Compare and Swap 32bit value using LR and SC
+ * \details Load-reserve the word at addr with lr.w; if it differs from oldval, leave
+ * memory untouched, otherwise store newval with sc.w and retry from the load whenever
+ * the sc.w status (rc) is non-zero. Return the initial value in memory.
+ * Success is indicated by the return value being equal to oldval.
+ * \param [in]    addr      Address pointer to data, address need to be 4byte aligned
+ * \param [in]    oldval    Old value of the data in address
+ * \param [in]    newval    New value to be stored into the address
+ * \return  return the initial value in memory
+ */
+__STATIC_FORCEINLINE uint32_t __CAS_W(volatile uint32_t *addr, uint32_t oldval, uint32_t newval)
+{
+    register uint32_t result;
+    register uint32_t rc;
+
+    __ASM volatile (                                \
+            "0:     lr.w %0, %2      \n"            \
+            "       bne  %0, %z3, 1f \n"            \
+            "       sc.w %1, %z4, %2 \n"            \
+            "       bnez %1, 0b      \n"            \
+            "1:\n"                                  \
+            : "=&r"(result), "=&r"(rc), "+A"(*addr) \
+            : "r"(oldval), "r"(newval)              \
+            : "memory");
+    return result;
+}
+
+/**
+ * \brief  Atomic Swap 32bit value into memory
+ * \details Exchange the 32bit word at addr with newval in one amoswap.w instruction.
+ * \param [in]    addr      Address pointer to data, address need to be 4byte aligned
+ * \param [in]    newval    New value to be stored into the address
+ * \return  the value that was in memory before the swap
+ */
+__STATIC_FORCEINLINE uint32_t __AMOSWAP_W(volatile uint32_t *addr, uint32_t newval)
+{
+    register uint32_t previous;
+
+    __ASM volatile ("amoswap.w %0, %2, %1"
+                    : "=r"(previous), "+A"(*addr) : "r"(newval) : "memory");
+    return previous;
+}
+
+/**
+ * \brief  Atomic Add with 32bit value
+ * \details Atomically ADD 32bit value with value in memory using amoadd.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be ADDed
+ * \return  memory value + add value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int32_t __AMOADD_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amoadd.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (int32_t)((uint32_t)result + (uint32_t)value);
+}
+
+/**
+ * \brief  Atomic And with 32bit value
+ * \details Atomically AND 32bit value with value in memory using amoand.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be ANDed
+ * \return  memory value & and value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int32_t __AMOAND_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amoand.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return result & value;
+}
+
+/**
+ * \brief  Atomic OR with 32bit value
+ * \details Atomically OR 32bit value with value in memory using amoor.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be ORed
+ * \return  memory value | or value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int32_t __AMOOR_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amoor.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return result | value;
+}
+
+/**
+ * \brief  Atomic XOR with 32bit value
+ * \details Atomically XOR 32bit value with value in memory using amoxor.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be XORed
+ * \return  memory value ^ xor value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int32_t __AMOXOR_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amoxor.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return result ^ value;
+}
+
+/**
+ * \brief  Atomic unsigned MAX with 32bit value
+ * \details Atomically unsigned max compare 32bit value with value in memory using amomaxu.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be compared
+ * \return  the bigger value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE uint32_t __AMOMAXU_W(volatile uint32_t *addr, uint32_t value)
+{
+    register uint32_t result;
+    __ASM volatile ("amomaxu.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (result > value) ? result : value;
+}
+
+/**
+ * \brief  Atomic signed MAX with 32bit value
+ * \details Atomically signed max compare 32bit value with value in memory using amomax.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be compared
+ * \return the bigger value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int32_t __AMOMAX_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amomax.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (result > value) ? result : value;
+}
+
+/**
+ * \brief  Atomic unsigned MIN with 32bit value
+ * \details Atomically unsigned min compare 32bit value with value in memory using amominu.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be compared
+ * \return the smaller value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE uint32_t __AMOMINU_W(volatile uint32_t *addr, uint32_t value)
+{
+    register uint32_t result;
+    __ASM volatile ("amominu.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (result < value) ? result : value;
+}
+
+/**
+ * \brief  Atomic signed MIN with 32bit value
+ * \details Atomically signed min compare 32bit value with value in memory using amomin.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be compared
+ * \return  the smaller value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int32_t __AMOMIN_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amomin.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (result < value) ? result : value;
+}
+
+#if __RISCV_XLEN == 64
+/**
+ * \brief  Compare and Swap 64bit value using LR and SC
+ * \details Load-reserve the doubleword at addr with lr.d; if it differs from oldval, leave
+ * memory untouched, otherwise store newval with sc.d and retry from the load whenever
+ * the sc.d status (rc) is non-zero. Return the initial value in memory.
+ * Success is indicated by the return value being equal to oldval.
+ * \param [in]    addr      Address pointer to data, address need to be 8byte aligned
+ * \param [in]    oldval    Old value of the data in address
+ * \param [in]    newval    New value to be stored into the address
+ * \return  return the initial value in memory
+ */
+__STATIC_FORCEINLINE uint64_t __CAS_D(volatile uint64_t *addr, uint64_t oldval, uint64_t newval)
+{
+    register uint64_t result;
+    register uint64_t rc;
+
+    __ASM volatile (                                \
+            "0:     lr.d %0, %2      \n"            \
+            "       bne  %0, %z3, 1f \n"            \
+            "       sc.d %1, %z4, %2 \n"            \
+            "       bnez %1, 0b      \n"            \
+            "1:\n"                                  \
+            : "=&r"(result), "=&r"(rc), "+A"(*addr) \
+            : "r"(oldval), "r"(newval)              \
+            : "memory");
+    return result;
+}
+
+/**
+ * \brief  Atomic Swap 64bit value into memory
+ * \details Exchange the 64bit doubleword at addr with newval in one amoswap.d instruction.
+ * \param [in]    addr      Address pointer to data, address need to be 8byte aligned
+ * \param [in]    newval    New value to be stored into the address
+ * \return  the value that was in memory before the swap
+ */
+__STATIC_FORCEINLINE uint64_t __AMOSWAP_D(volatile uint64_t *addr, uint64_t newval)
+{
+    register uint64_t previous;
+
+    __ASM volatile ("amoswap.d %0, %2, %1"
+                    : "=r"(previous), "+A"(*addr) : "r"(newval) : "memory");
+    return previous;
+}
+
+/**
+ * \brief  Atomic Add with 64bit value
+ * \details Atomically ADD 64bit value with value in memory using amoadd.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be ADDed
+ * \return  memory value + add value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int64_t __AMOADD_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amoadd.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (int64_t)((uint64_t)result + (uint64_t)value);
+}
+
+/**
+ * \brief  Atomic And with 64bit value
+ * \details Atomically AND 64bit value with value in memory using amoand.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be ANDed
+ * \return  memory value & and value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int64_t __AMOAND_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amoand.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return result & value;
+}
+
+/**
+ * \brief  Atomic OR with 64bit value
+ * \details Atomically OR 64bit value with value in memory using amoor.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be ORed
+ * \return  memory value | or value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int64_t __AMOOR_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amoor.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return result | value;
+}
+
+/**
+ * \brief  Atomic XOR with 64bit value
+ * \details Atomically XOR 64bit value with value in memory using amoxor.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be XORed
+ * \return  memory value ^ xor value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int64_t __AMOXOR_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amoxor.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return result ^ value;
+}
+
+/**
+ * \brief  Atomic unsigned MAX with 64bit value
+ * \details Atomically unsigned max compare 64bit value with value in memory using amomaxu.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be compared
+ * \return  the bigger value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE uint64_t __AMOMAXU_D(volatile uint64_t *addr, uint64_t value)
+{
+    register uint64_t result;
+    __ASM volatile ("amomaxu.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (result > value) ? result : value;
+}
+
+/**
+ * \brief  Atomic signed MAX with 64bit value
+ * \details Atomically signed max compare 64bit value with value in memory using amomax.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be compared
+ * \return the bigger value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int64_t __AMOMAX_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amomax.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (result > value) ? result : value;
+}
+
+/**
+ * \brief  Atomic unsigned MIN with 64bit value
+ * \details Atomically unsigned min compare 64bit value with value in memory using amominu.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be compared
+ * \return the smaller value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE uint64_t __AMOMINU_D(volatile uint64_t *addr, uint64_t value)
+{
+    register uint64_t result;
+    __ASM volatile ("amominu.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (result < value) ? result : value;
+}
+
+/**
+ * \brief  Atomic signed MIN with 64bit value
+ * \details Atomically signed min compare 64bit value with value in memory using amomin.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be compared
+ * \return  the smaller value, computed from the old value the AMO fetched
+ */
+__STATIC_FORCEINLINE int64_t __AMOMIN_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amomin.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Do not re-read *addr: another writer could change it after the AMO. */
+    return (result < value) ? result : value;
+}
+#endif /* __RISCV_XLEN == 64  */
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_CPU_Intrinsic */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __CORE_FEATURE_BASE__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_cache.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_cache.h
new file mode 100644
index 00000000..38b9eb97
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_cache.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_CACHE_H__
+#define __CORE_FEATURE_CACHE_H__
+/*!
+ * @file     core_feature_cache.h
+ * @brief    Cache feature API header file for Nuclei N/NX Core
+ */
+/*
+ * Cache Feature Configuration Macro:
+ * 1. __ICACHE_PRESENT:  Define whether I-Cache Unit is present or not.
+ *   * 0: Not present
+ *   * 1: Present
+ * 2. __DCACHE_PRESENT:  Define whether D-Cache Unit is present or not.
+ *   * 0: Not present
+ *   * 1: Present
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1)
+
+/* ##########################  Cache functions  #################################### */
+/**
+ * \defgroup NMSIS_Core_Cache       Cache Functions
+ * \brief    Functions that configure Instruction and Data Cache.
+ * @{
+ */
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_Cache */
+
+/**
+ * \defgroup NMSIS_Core_ICache      I-Cache Functions
+ * \ingroup  NMSIS_Core_Cache
+ * \brief    Functions that configure Instruction Cache.
+ * @{
+ */
+/**
+ * \brief  Enable ICache
+ * \details
+ * This function enables the I-Cache by passing CSR_MCACHE_CTL_IE to __RV_CSR_SET on CSR_MCACHE_CTL.
+ * \remarks
+ * - The \ref CSR_MCACHE_CTL register controls whether the I-Cache is enabled.
+ * \sa
+ * - \ref DisableICache
+ */
+__STATIC_FORCEINLINE void EnableICache (void)
+{
+    __RV_CSR_SET(CSR_MCACHE_CTL, CSR_MCACHE_CTL_IE);    /* only the IE mask is named in this call */
+}
+
+/**
+ * \brief  Disable ICache
+ * \details
+ * This function disables the I-Cache by passing CSR_MCACHE_CTL_IE to __RV_CSR_CLEAR on CSR_MCACHE_CTL.
+ * \remarks
+ * - The \ref CSR_MCACHE_CTL register controls whether the I-Cache is enabled.
+ * \sa
+ * - \ref EnableICache
+ */
+__STATIC_FORCEINLINE void DisableICache (void)
+{
+    __RV_CSR_CLEAR(CSR_MCACHE_CTL, CSR_MCACHE_CTL_IE);    /* only the IE mask is named in this call */
+}
+/** @} */ /* End of Doxygen Group NMSIS_Core_ICache */
+#endif /* defined(__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1) */
+
+#if defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1)
+/**
+ * \defgroup NMSIS_Core_DCache      D-Cache Functions
+ * \ingroup  NMSIS_Core_Cache
+ * \brief    Functions that configure Data Cache.
+ * @{
+ */
+/**
+ * \brief  Enable DCache
+ * \details
+ * This function enables the D-Cache by passing CSR_MCACHE_CTL_DE to __RV_CSR_SET on CSR_MCACHE_CTL.
+ * \remarks
+ * - The \ref CSR_MCACHE_CTL register controls whether the D-Cache is enabled.
+ * \sa
+ * - \ref DisableDCache
+ */
+__STATIC_FORCEINLINE void EnableDCache (void)
+{
+    __RV_CSR_SET(CSR_MCACHE_CTL, CSR_MCACHE_CTL_DE);    /* only the DE mask is named in this call */
+}
+
+/**
+ * \brief  Disable DCache
+ * \details
+ * This function disables the D-Cache by passing CSR_MCACHE_CTL_DE to __RV_CSR_CLEAR on CSR_MCACHE_CTL.
+ * \remarks
+ * - The \ref CSR_MCACHE_CTL register controls whether the D-Cache is enabled.
+ * \sa
+ * - \ref EnableDCache
+ */
+__STATIC_FORCEINLINE void DisableDCache (void)
+{
+    __RV_CSR_CLEAR(CSR_MCACHE_CTL, CSR_MCACHE_CTL_DE);    /* only the DE mask is named in this call */
+}
+/** @} */ /* End of Doxygen Group NMSIS_Core_DCache */
+#endif /* defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_CACHE_H__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_dsp.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_dsp.h
new file mode 100644
index 00000000..4d41e553
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_dsp.h
@@ -0,0 +1,18659 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_DSP__
+#define __CORE_FEATURE_DSP__
+
+/*!
+ * @file     core_feature_dsp.h
+ * @brief    DSP feature API header file for Nuclei N/NX Core
+ */
+/*
+ * DSP Feature Configuration Macro:
+ * 1. __DSP_PRESENT:  Define whether Digital Signal Processing Unit(DSP) is present or not
+ *   * 0: Not present
+ *   * 1: Present
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+
+/* ###########################  CPU SIMD DSP Intrinsic Functions ########################### */
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic   Intrinsic Functions for SIMD Instructions
+ * \ingroup  NMSIS_Core
+ * \brief    Functions that generate RISC-V DSP SIMD instructions.
+ * \details
+ *
+ * The following functions generate specified RISC-V SIMD instructions that cannot be directly accessed by compiler.
+ * * **DSP ISA Extension Instruction Summary**
+ *   + **Shorthand Definitions**
+ *     - r.H == r.H1: r[31:16], r.L == r.H0: r[15:0]
+ *     - r.B3: r[31:24], r.B2: r[23:16], r.B1: r[15:8], r.B0: r[7:0]
+ *     - r.B[x]: r[(x*8+7):(x*8+0)]
+ *     - r.H[x]: r[(x*16+15):(x*16+0)]
+ *     - r.W[x]: r[(x*32+31):(x*32+0)]
+ *     - r[xU]: the upper 32-bit of a 64-bit number; xU represents the GPR number that contains this upper part 32-bit value.
+ *     - r[xL]: the lower 32-bit of a 64-bit number; xL represents the GPR number that contains this lower part 32-bit value.
+ *     - r[xU].r[xL]: a 64-bit number that is formed from a pair of GPRs.
+ *     - s>>: signed arithmetic right shift
+ *     - u>>: unsigned logical right shift
+ *     - SAT.Qn(): Saturate to the range of [-2^n, 2^n-1], if saturation happens, set PSW.OV.
+ *     - SAT.Um(): Saturate to the range of [0, 2^m-1], if saturation happens, set PSW.OV.
+ *     - RUND(): Indicate `rounding`, i.e., add 1 to the most significant discarded bit for right shift or MSW-type multiplication instructions.
+ *     - Sign or Zero Extending functions:
+ *       - SEm(data): Sign-Extend data to m-bit.
+ *       - ZEm(data): Zero-Extend data to m-bit.
+ *     - ABS(x): Calculate the absolute value of `x`.
+ *     - CONCAT(x,y): Concatenate `x` and `y` to form a value.
+ *     - u<: Unsigned less than comparison.
+ *     - u<=: Unsigned less than or equal comparison.
+ *     - u>: Unsigned greater than comparison.
+ *     - s*: Signed multiplication.
+ *     - u*: Unsigned multiplication.
+ *
+ *   @{
+ */
+/** @} */ /* End of Doxygen Group NMSIS_Core_DSP_Intrinsic */
+
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS      SIMD Data Processing Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    SIMD Data Processing Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB      SIMD 16-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Add/Subtract Instructions
+ * \details
+ * Based on the combination of the types of the two 16-bit arithmetic operations, the SIMD 16-bit
+ * add/subtract instructions can be classified into 6 main categories: Addition (two 16-bit addition),
+ * Subtraction (two 16-bit subtraction), Crossed Add & Sub (one addition and one subtraction), and
+ * Crossed Sub & Add (one subtraction and one addition), Straight Add & Sub (one addition and one
+ * subtraction), and Straight Sub & Add (one subtraction and one addition).
+ * Based on the way of how an overflow condition is handled, the SIMD 16-bit add/subtract
+ * instructions can be classified into 5 groups: Wrap-around (dropping overflow), Signed Halving
+ * (keeping overflow by dropping 1 LSB bit), Unsigned Halving, Signed Saturation (clipping overflow),
+ * and Unsigned Saturation.
+ * Together, there are 30 SIMD 16-bit add/subtract instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB      SIMD 8-bit Addition & Subtraction Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Addition & Subtraction Instructions
+ * \details
+ * Based on the types of the four 8-bit arithmetic operations, the SIMD 8-bit add/subtract instructions
+ * can be classified into 2 main categories: Addition (four 8-bit addition), and Subtraction (four 8-bit
+ * subtraction).
+ * Based on the way of how an overflow condition is handled for signed or unsigned operation, the
+ * SIMD 8-bit add/subtract instructions can be classified into 5 groups: Wrap-around (dropping
+ * overflow), Signed Halving (keeping overflow by dropping 1 LSB bit), Unsigned Halving, Signed
+ * Saturation (clipping overflow), and Unsigned Saturation.
+ * Together, there are 10 SIMD 8-bit add/subtract instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT      SIMD 16-bit Shift Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Shift Instructions
+ * \details
+ * there are 14 SIMD 16-bit shift instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT      SIMD 8-bit Shift Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Shift Instructions
+ * \details
+ *  there are 14 SIMD 8-bit shift instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP      SIMD 16-bit Compare Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Compare Instructions
+ * \details
+ *  there are 5 SIMD 16-bit Compare instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP      SIMD 8-bit Compare Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Compare Instructions
+ * \details
+ *  there are 5  SIMD 8-bit Compare instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY      SIMD 16-bit Multiply Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Multiply Instructions
+ * \details
+ * there are 6 SIMD 16-bit Multiply instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY      SIMD 8-bit Multiply Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Multiply Instructions
+ * \details
+ *  there are 6 SIMD 8-bit Multiply instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC      SIMD 16-bit Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Miscellaneous Instructions
+ * \details
+ *  there are 10 SIMD 16-bit Misc instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC      SIMD 8-bit Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Miscellaneous Instructions
+ * \details
+ *  there are 10 SIMD 8-bit Miscellaneous instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK      SIMD 8-bit Unpacking Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Unpacking Instructions
+ * \details
+ *  there are 8 SIMD 8-bit Unpacking instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD      Non-SIMD Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    Non-SIMD Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU      Non-SIMD Q15 saturation ALU Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    Non-SIMD Q15 saturation ALU Instructions
+ * \details
+ * there are 7 Non-SIMD Q15 saturation ALU Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU      Non-SIMD Q31 saturation ALU Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    Non-SIMD Q31 saturation ALU Instructions
+ * \details
+ *  there are Non-SIMD Q31 saturation ALU Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION      32-bit Computation Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    32-bit Computation Instructions
+ * \details
+ * there are 8 32-bit Computation Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC      OV (Overflow) flag Set/Clear Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    OV (Overflow) flag Set/Clear Instructions
+ * \details
+ * The following table lists the user instructions related to Overflow (OV) flag manipulation. there are 2 OV (Overflow) flag Set/Clear Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC      Non-SIMD Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    Non-SIMD Miscellaneous Instructions
+ * \details
+ * There are 13 Miscellaneous Instructions here.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS      Partial-SIMD Data Processing Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    Partial-SIMD Data Processing Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK      SIMD 16-bit Packing Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Packing Instructions
+ * \details
+ * there are 4 SIMD 16-bit Packing Instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC      Signed MSW 32x32 Multiply and Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Signed MSW 32x32 Multiply and Add Instructions
+ * \details
+ *  there are 8 Signed MSW 32x32 Multiply and Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC      Signed MSW 32x16 Multiply and Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Signed MSW 32x16 Multiply and Add Instructions
+ * \details
+ * there are 15 Signed MSW 32x16 Multiply and Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB      Signed 16-bit Multiply 32-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Signed 16-bit Multiply 32-bit Add/Subtract Instructions
+ * \details
+ *  there are 18 Signed 16-bit Multiply 32-bit Add/Subtract Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB      Signed 16-bit Multiply 64-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Signed 16-bit Multiply 64-bit Add/Subtract Instructions
+ * \details
+ *  there is Signed 16-bit Multiply 64-bit Add/Subtract Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC      Partial-SIMD Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Partial-SIMD Miscellaneous Instructions
+ * \details
+ *  there are  7 Partial-SIMD Miscellaneous Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD      8-bit Multiply with 32-bit Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    8-bit Multiply with 32-bit Add Instructions
+ * \details
+ * there are  3 8-bit Multiply with 32-bit Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_64B_PROFILE      64-bit Profile Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    64-bit Profile Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB      64-bit Addition & Subtraction Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
+ * \brief    64-bit Addition & Subtraction Instructions
+ * \details
+ * there are 10 64-bit Addition & Subtraction Instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB      32-bit Multiply with 64-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
+ * \brief    32-bit Multiply with 64-bit Add/Subtract Instructions
+ * \details
+ *  there are 32-bit Multiply 64-bit Add/Subtract Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB      Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
+ * \brief    Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
+ * \details
+ * there are 10 Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY      RV64 Only Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    RV64 Only Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB      (RV64 Only) SIMD 32-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) SIMD 32-bit Add/Subtract Instructions
+ * \details
+ * The following tables list instructions that are only present in RV64.
+ * There are 30 SIMD 32-bit addition or subtraction instructions.there are 4 SIMD16-bit Packing Instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT      (RV64 Only) SIMD 32-bit Shift Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) SIMD 32-bit Shift Instructions
+ * \details
+ *  there are 14 (RV64 Only) SIMD 32-bit Shift Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC      (RV64 Only) SIMD 32-bit Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) SIMD 32-bit Miscellaneous Instructions
+ * \details
+ * there are 5  (RV64 Only) SIMD 32-bit Miscellaneous Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT      (RV64 Only) SIMD Q15 Saturating Multiply Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) SIMD Q15 Saturating Multiply Instructions
+ * \details
+ *  there are 9 (RV64 Only) SIMD Q15 saturating Multiply Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT      (RV64 Only) 32-bit Multiply Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) 32-bit Multiply Instructions
+ * \details
+ *  there are 3 (RV64 Only) 32-bit Multiply Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD      (RV64 Only) 32-bit Multiply & Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) 32-bit Multiply & Add Instructions
+ * \details
+ *  there are  3 (RV64 Only) 32-bit Multiply & Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC      (RV64 Only) 32-bit Parallel Multiply & Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) 32-bit Parallel Multiply & Add Instructions
+ * \details
+ * there are 12 (RV64 Only) 32-bit Parallel Multiply & Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_NON_SIMD_32B_SHIFT      (RV64 Only) Non-SIMD 32-bit Shift Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) Non-SIMD 32-bit Shift Instructions
+ * \details
+ *  there is 1 (RV64 Only) Non-SIMD 32-bit Shift Instruction
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK      32-bit Packing Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    32-bit Packing Instructions
+ * \details
+ *  There are four 32-bit packing instructions here
+ */
+
+/* ===== Inline Function Start for 3.1. ADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief ADD8 (SIMD 8-bit Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit integer element additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit integer elements in Rs1 with the 8-bit integer elements
+ * in Rs2, and then writes the 8-bit element results to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = Rs1.B[x] + Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ADD8(unsigned long a, unsigned long b)
+{
+    unsigned long sum8;    /* output operand %0: the packed 8-bit element sums */
+    __ASM volatile("add8 %0, %1, %2" : "=r"(sum8) : "r"(a), "r"(b));
+    return sum8;
+}
+/* ===== Inline Function End for 3.1. ADD8 ===== */
+
+/* ===== Inline Function Start for 3.2. ADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief ADD16 (SIMD 16-bit Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit integer elements in Rs1 with the 16-bit integer
+ * elements in Rs2, and then writes the 16-bit element results to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = Rs1.H[x] + Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ADD16(unsigned long a, unsigned long b)
+{
+    unsigned long sum16;    /* output operand %0: the packed 16-bit element sums */
+    __ASM volatile("add16 %0, %1, %2" : "=r"(sum16) : "r"(a), "r"(b));
+    return sum16;
+}
+/* ===== Inline Function End for 3.2. ADD16 ===== */
+
+/* ===== Inline Function Start for 3.3. ADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief ADD64 (64-bit Addition)
+ * \details
+ * **Type**: 64-bit Profile
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit signed or unsigned integers.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit integer of an even/odd pair of registers specified
+ * by Rs1(4,1) with the 64-bit integer of an even/odd pair of registers specified by Rs2(4,1), and then
+ * writes the 64-bit result to an even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction has the same behavior as the ADD instruction in RV64I.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ *  t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ *  a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ *  b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ *  R[t_H].R[t_L] = R[a_H].R[a_L] + R[b_H].R[b_L];
+ * RV64:
+ *  Rd = Rs1 + Rs2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_ADD64(unsigned long long a, unsigned long long b)
+{
+    unsigned long long sum64;    /* output operand %0: the 64-bit sum of a and b */
+    __ASM volatile("add64 %0, %1, %2" : "=r"(sum64) : "r"(a), "r"(b));
+    return sum64;
+}
+/* ===== Inline Function End for 3.3. ADD64 ===== */
+
+/* ===== Inline Function Start for 3.4. AVE ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief AVE (Average with Rounding)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * AVE Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Calculate the average of the contents of two general registers.
+ *
+ * **Description**:\n
+ * This instruction calculates the average value of two signed integers stored in Rs1 and
+ * Rs2, rounds up a half-integer result to the nearest integer, and writes the result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Sum = CONCAT(Rs1[MSB],Rs1[MSB:0]) + CONCAT(Rs2[MSB],Rs2[MSB:0]) + 1;
+ * Rd = Sum[(MSB+1):1];
+ * for RV32: MSB=31,
+ * for RV64: MSB=63
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_AVE(long a, long b)
+{
+    long avg;    /* output operand %0: the rounded average produced by AVE */
+    __ASM volatile("ave %0, %1, %2" : "=r"(avg) : "r"(a), "r"(b));
+    return avg;
+}
+/* ===== Inline Function End for 3.4. AVE ===== */
+
+/* ===== Inline Function Start for 3.5. BITREV ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief BITREV (Bit Reverse)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * BITREV Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Reverse the bit positions of the source operand within a specified width starting from bit
+ * 0. The reversed width is a variable from a GPR.
+ *
+ * **Description**:\n
+ * This instruction reverses the bit positions of the content of Rs1. The reversed bit width
+ * is calculated as Rs2[4:0]+1 (RV32) or Rs2[5:0]+1 (RV64). The upper bits beyond the reversed width
+ * are filled with zeros. After the bit reverse operation, the result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * msb = Rs2[4:0]; (for RV32)
+ * msb = Rs2[5:0]; (for RV64)
+ * rev[0:msb] = Rs1[msb:0];
+ * Rd = ZE(rev[msb:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_BITREV(unsigned long a, unsigned long b)
+{
+    unsigned long reversed;    /* output operand %0: a with its low bits reversed, width taken from b */
+    __ASM volatile("bitrev %0, %1, %2" : "=r"(reversed) : "r"(a), "r"(b));
+    return reversed;
+}
+/* ===== Inline Function End for 3.5. BITREV ===== */
+
+/* ===== Inline Function Start for 3.6. BITREVI ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief BITREVI (Bit Reverse Immediate)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * (RV32) BITREVI Rd, Rs1, imm[4:0]
+ * (RV64) BITREVI Rd, Rs1, imm[5:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Reverse the bit positions of the source operand within a specified width starting from bit
+ * 0. The reversed width is an immediate value.
+ *
+ * **Description**:\n
+ * This instruction reverses the bit positions of the content of Rs1. The reversed bit width
+ * is calculated as imm[4:0]+1 (RV32) or imm[5:0]+1 (RV64). The upper bits beyond the reversed width
+ * are filled with zeros. After the bit reverse operation, the result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * msb = imm[4:0]; (RV32)
+ * msb = imm[5:0]; (RV64)
+ * rev[0:msb] = Rs1[msb:0];
+ * Rd = ZE(rev[msb:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    immediate value giving the reversed width minus one (imm[4:0] on RV32, imm[5:0] on RV64)
+ * \return value stored in unsigned long type
+ */
+#define __RV_BITREVI(a, b)    \
+    ({    \
+        register unsigned long __a = (unsigned long)(a);    /* evaluate `a` before the output local exists, so an argument named like it cannot bind to it */    \
+        register unsigned long __result;    /* output operand %0 of bitrevi */    \
+        __ASM volatile("bitrevi %0, %1, %2" : "=r"(__result) : "r"(__a), "K"(b));    \
+        __result;    /* value of the statement expression */    \
+    })
+/* ===== Inline Function End for 3.6. BITREVI ===== */
+
+/* ===== Inline Function Start for 3.7. BPICK ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief BPICK (Bit-wise Pick)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * BPICK Rd, Rs1, Rs2, Rc
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Select from two source operands based on a bit mask in the third operand.
+ *
+ * **Description**:\n
+ * This instruction selects individual bits from Rs1 or Rs2, based on the bit mask value in
+ * Rc. If a bit in Rc is 1, the corresponding bit is from Rs1; otherwise, the corresponding bit is from Rs2.
+ * The selection results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd[x] = Rc[x]? Rs1[x] : Rs2[x];
+ * for RV32, x=31...0
+ * for RV64, x=63...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \param [in]  c    unsigned long type of value stored in c
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_BPICK(unsigned long a, unsigned long b, unsigned long c)
+{
+    unsigned long picked;    /* output operand %0: bits chosen from a or b according to mask c */
+    __ASM volatile("bpick %0, %1, %2, %3" : "=r"(picked) : "r"(a), "r"(b), "r"(c));
+    return picked;
+}
+/* ===== Inline Function End for 3.7. BPICK ===== */
+
+/* ===== Inline Function Start for 3.8. CLROV ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC
+ * \brief CLROV (Clear OV flag)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLROV # pseudo mnemonic
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Clear the OV flag. This pseudo instruction is an alias of the `CSRRCI x0, ucode, 1` instruction.
+ *
+ * This intrinsic takes no arguments and returns no value.
+ */
+__STATIC_FORCEINLINE void __RV_CLROV(void)
+{
+    __ASM volatile("clrov ");    /* operand-less asm statement: emits the pseudo mnemonic as-is */
+}
+/* ===== Inline Function End for 3.8. CLROV ===== */
+
+/* ===== Inline Function Start for 3.9. CLRS8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief CLRS8 (SIMD 8-bit Count Leading Redundant Sign)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLRS8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of redundant sign bits of the 8-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the bits next to the sign bits of the 8-bit elements of Rs1, this instruction
+ * counts the number of redundant sign bits and writes the result to the corresponding 8-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.B[x];
+ * cnt[x] = 0;
+ * for (i = 6 to 0) {
+ *   if (snum[x](i) == snum[x](7)) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.B[x] = cnt[x];
+ * for RV32: x=3...0
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLRS8(unsigned long a)
+{
+    unsigned long counts8;    /* output operand %0: per-8-bit-element redundant sign bit counts */
+    __ASM volatile("clrs8 %0, %1" : "=r"(counts8) : "r"(a));
+    return counts8;
+}
+/* ===== Inline Function End for 3.9. CLRS8 ===== */
+
+/* ===== Inline Function Start for 3.10. CLRS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief CLRS16 (SIMD 16-bit Count Leading Redundant Sign)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLRS16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of redundant sign bits of the 16-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the bits next to the sign bits of the 16-bit elements of Rs1, this
+ * instruction counts the number of redundant sign bits and writes the result to the corresponding 16-
+ * bit elements of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.H[x];
+ * cnt[x] = 0;
+ * for (i = 14 to 0) {
+ *   if (snum[x](i) == snum[x](15)) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.H[x] = cnt[x];
+ * for RV32: x=1...0
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLRS16(unsigned long a)
+{
+    unsigned long rd; /* redundant-sign-bit count of each 16-bit element of a */
+    __ASM volatile("clrs16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.10. CLRS16 ===== */
+
+/* ===== Inline Function Start for 3.11. CLRS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief CLRS32 (SIMD 32-bit Count Leading Redundant Sign)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLRS32 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of redundant sign bits of the 32-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the bits next to the sign bits of the 32-bit elements of Rs1, this
+ * instruction counts the number of redundant sign bits and writes the result to the corresponding 32-
+ * bit elements of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.W[x];
+ * cnt[x] = 0;
+ * for (i = 30 to 0) {
+ *   if (snum[x](i) == snum[x](31)) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.W[x] = cnt[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLRS32(unsigned long a)
+{
+    unsigned long rd; /* redundant-sign-bit count of each 32-bit element of a */
+    __ASM volatile("clrs32 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.11. CLRS32 ===== */
+
+/* ===== Inline Function Start for 3.12. CLO8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief CLO8 (SIMD 8-bit Count Leading One)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLO8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading one bits of the 8-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 8-bit elements of Rs1, this instruction
+ * counts the number of leading one bits and writes the results to the corresponding 8-bit elements of
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.B[x];
+ * cnt[x] = 0;
+ * for (i = 7 to 0) {
+ *   if (snum[x](i) == 1) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.B[x] = cnt[x];
+ * for RV32: x=3...0
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLO8(unsigned long a)
+{
+    unsigned long rd; /* leading-one count of each 8-bit element of a */
+    __ASM volatile("clo8 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.12. CLO8 ===== */
+
+/* ===== Inline Function Start for 3.13. CLO16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief CLO16 (SIMD 16-bit Count Leading One)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLO16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading one bits of the 16-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 16-bit elements of Rs1, this instruction
+ * counts the number of leading one bits and writes the results to the corresponding 16-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.H[x];
+ * cnt[x] = 0;
+ * for (i = 15 to 0) {
+ *   if (snum[x](i) == 1) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.H[x] = cnt[x];
+ * for RV32: x=1...0
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLO16(unsigned long a)
+{
+    unsigned long rd; /* leading-one count of each 16-bit element of a */
+    __ASM volatile("clo16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.13. CLO16 ===== */
+
+/* ===== Inline Function Start for 3.14. CLO32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief CLO32 (SIMD 32-bit Count Leading One)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLO32 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading one bits of the 32-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 32-bit elements of Rs1, this instruction
+ * counts the number of leading one bits and writes the results to the corresponding 32-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.W[x];
+ * cnt[x] = 0;
+ * for (i = 31 to 0) {
+ *   if (snum[x](i) == 1) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.W[x] = cnt[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLO32(unsigned long a)
+{
+    unsigned long rd; /* leading-one count of each 32-bit element of a */
+    __ASM volatile("clo32 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.14. CLO32 ===== */
+
+/* ===== Inline Function Start for 3.15. CLZ8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief CLZ8 (SIMD 8-bit Count Leading Zero)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLZ8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading zero bits of the 8-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 8-bit elements of Rs1, this instruction
+ * counts the number of leading zero bits and writes the results to the corresponding 8-bit elements of
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.B[x];
+ * cnt[x] = 0;
+ * for (i = 7 to 0) {
+ *   if (snum[x](i) == 0) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.B[x] = cnt[x];
+ * for RV32: x=3...0
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLZ8(unsigned long a)
+{
+    unsigned long rd; /* leading-zero count of each 8-bit element of a */
+    __ASM volatile("clz8 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.15. CLZ8 ===== */
+
+/* ===== Inline Function Start for 3.16. CLZ16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief CLZ16 (SIMD 16-bit Count Leading Zero)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLZ16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading zero bits of the 16-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 16-bit elements of Rs1, this instruction
+ * counts the number of leading zero bits and writes the results to the corresponding 16-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.H[x];
+ * cnt[x] = 0;
+ * for (i = 15 to 0) {
+ *   if (snum[x](i) == 0) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.H[x] = cnt[x];
+ * for RV32: x=1...0
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLZ16(unsigned long a)
+{
+    unsigned long rd; /* leading-zero count of each 16-bit element of a */
+    __ASM volatile("clz16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.16. CLZ16 ===== */
+
+/* ===== Inline Function Start for 3.17. CLZ32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief CLZ32 (SIMD 32-bit Count Leading Zero)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLZ32 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading zero bits of the 32-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 32-bit elements of Rs1, this instruction
+ * counts the number of leading zero bits and writes the results to the corresponding 32-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.W[x];
+ * cnt[x] = 0;
+ * for (i = 31 to 0) {
+ *   if (snum[x](i) == 0) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.W[x] = cnt[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLZ32(unsigned long a)
+{
+    unsigned long rd; /* leading-zero count of each 32-bit element of a */
+    __ASM volatile("clz32 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.17. CLZ32 ===== */
+
+/* ===== Inline Function Start for 3.18. CMPEQ8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief CMPEQ8 (SIMD 8-bit Integer Compare Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CMPEQ8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit integer elements equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit integer elements in Rs1 with the 8-bit integer
+ * elements in Rs2 to see if they are equal. If they are equal, the result is 0xFF; otherwise, the result is
+ * 0x0. The 8-bit element comparison results are written to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned numbers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] == Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CMPEQ8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* 0xff in every byte where a and b match, 0x0 elsewhere */
+    __ASM volatile("cmpeq8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.18. CMPEQ8 ===== */
+
+/* ===== Inline Function Start for 3.19. CMPEQ16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief CMPEQ16 (SIMD 16-bit Integer Compare Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CMPEQ16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer elements equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit integer elements in Rs1 with the 16-bit integer
+ * elements in Rs2 to see if they are equal. If they are equal, the result is 0xFFFF; otherwise, the result
+ * is 0x0. The 16-bit element comparison results are written to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned numbers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] == Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CMPEQ16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* 0xffff in every halfword where a and b match, 0x0 elsewhere */
+    __ASM volatile("cmpeq16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.19. CMPEQ16 ===== */
+
+/* ===== Inline Function Start for 3.20. CRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief CRAS16 (SIMD 16-bit Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element addition and 16-bit integer element subtraction in a 32-bit
+ * chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit integer element in [31:16] of 32-bit chunks in Rs1 with
+ * the 16-bit integer element in [15:0] of 32-bit chunks in Rs2, and writes the result to [31:16] of 32-bit
+ * chunks in Rd; at the same time, it subtracts the 16-bit integer element in [31:16] of 32-bit chunks in
+ * Rs2 from the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of 32-
+ * bit chunks in Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = Rs1.W[x][31:16] + Rs2.W[x][15:0];
+ * Rd.W[x][15:0] = Rs1.W[x][15:0] - Rs2.W[x][31:16];
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CRAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* per 32-bit chunk: [31:16] = cross sum, [15:0] = cross difference */
+    __ASM volatile("cras16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.20. CRAS16 ===== */
+
+/* ===== Inline Function Start for 3.21. CRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief CRSA16 (SIMD 16-bit Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element subtraction and 16-bit integer element addition in a 32-bit
+ * chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit integer element in [15:0] of 32-bit chunks in Rs2
+ * from the 16-bit integer element in [31:16] of 32-bit chunks in Rs1, and writes the result to [31:16] of
+ * 32-bit chunks in Rd; at the same time, it adds the 16-bit integer element in [31:16] of 32-bit chunks
+ * in Rs2 with the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to
+ * [15:0] of 32-bit chunks in Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = Rs1.W[x][31:16] - Rs2.W[x][15:0];
+ * Rd.W[x][15:0] = Rs1.W[x][15:0] + Rs2.W[x][31:16];
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CRSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* per 32-bit chunk: [31:16] = cross difference, [15:0] = cross sum */
+    __ASM volatile("crsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.21. CRSA16 ===== */
+
+/* ===== Inline Function Start for 3.22. INSB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief INSB (Insert Byte)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * (RV32) INSB Rd, Rs1, imm[1:0]
+ * (RV64) INSB Rd, Rs1, imm[2:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Insert byte 0 of a 32-bit or 64-bit register into one of the byte elements of another register.
+ *
+ * **Description**:\n
+ * This instruction inserts byte 0 of Rs1 into byte `imm[1:0]` (RV32) or `imm[2:0]` (RV64)
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * bpos = imm[1:0]; (RV32)
+ * bpos = imm[2:0]; (RV64)
+ * Rd.B[bpos] = Rs1.B[0]
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    constant byte position (imm[1:0] on RV32, imm[2:0] on RV64) passed as an immediate operand
+ * \return value stored in unsigned long type
+ */
+#define __RV_INSB(t, a, b)    \
+    ({    \
+        unsigned long __insb_rd = (unsigned long)(t);   /* destination, modified in place */    \
+        unsigned long __insb_rs1 = (unsigned long)(a);  /* byte 0 of this value is inserted */    \
+        __ASM volatile("insb %0, %1, %2" : "+r"(__insb_rd) : "r"(__insb_rs1), "K"(b));    \
+        __insb_rd;    \
+    })
+/* ===== Inline Function End for 3.22. INSB ===== */
+
+/* ===== Inline Function Start for 3.23. KABS8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief KABS8 (SIMD 8-bit Saturating Absolute)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KABS8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of 8-bit signed integer elements simultaneously.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of 8-bit signed integer elements stored
+ * in Rs1 and writes the element results to Rd. If the input number is 0x80, this instruction generates
+ * 0x7f as the output and sets the OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.B[x];
+ * if (src == 0x80) {
+ *   src = 0x7f;
+ *   OV = 1;
+ * } else if (src[7] == 1) {
+ *   src = -src;
+ * }
+ * Rd.B[x] = src;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KABS8(unsigned long a)
+{
+    unsigned long rd; /* saturating absolute value of each signed 8-bit element of a */
+    __ASM volatile("kabs8 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.23. KABS8 ===== */
+
+/* ===== Inline Function Start for 3.24. KABS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief KABS16 (SIMD 16-bit Saturating Absolute)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KABS16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of 16-bit signed integer elements simultaneously.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of 16-bit signed integer elements stored
+ * in Rs1 and writes the element results to Rd. If the input number is 0x8000, this instruction
+ * generates 0x7fff as the output and sets the OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.H[x];
+ * if (src == 0x8000) {
+ *   src = 0x7fff;
+ *   OV = 1;
+ * } else if (src[15] == 1) {
+ *   src = -src;
+ * }
+ * Rd.H[x] = src;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KABS16(unsigned long a)
+{
+    unsigned long rd; /* saturating absolute value of each signed 16-bit element of a */
+    __ASM volatile("kabs16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.24. KABS16 ===== */
+
+/* ===== Inline Function Start for 3.25. KABSW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KABSW (Scalar 32-bit Absolute Value with Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KABSW Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of a signed 32-bit integer in a general register.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of a signed 32-bit integer stored in Rs1.
+ * The result is sign-extended (for RV64) and written to Rd. This instruction with the minimum
+ * negative integer input of 0x80000000 will produce a saturated output of maximum positive integer
+ * of 0x7fffffff and the OV flag will be set to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[0] >= 0) {
+ *   res = Rs1.W[0];
+ * } else {
+ *   if (Rs1.W[0] == 0x80000000) {
+ *     res = 0x7fffffff;
+ *     OV = 1;
+ *   } else {
+ *     res = -Rs1.W[0];
+ *   }
+ * }
+ * Rd = SE32(res);
+ * ~~~
+ *
+ * \param [in]  a    signed long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KABSW(signed long a)
+{
+    unsigned long rd; /* saturating absolute value of the low 32-bit word of a */
+    __ASM volatile("kabsw %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.25. KABSW ===== */
+
+/* ===== Inline Function Start for 3.26. KADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief KADD8 (SIMD 8-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), they
+ * are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] + Rs2.B[x];
+ * if (res[x] > 127) {
+ *   res[x] = 127;
+ *   OV = 1;
+ * } else if (res[x] < -128) {
+ *   res[x] = -128;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KADD8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* element-wise signed 8-bit sums, each saturated to the Q7 range */
+    __ASM volatile("kadd8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.26. KADD8 ===== */
+
+/* ===== Inline Function Start for 3.27. KADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KADD16 (SIMD 16-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1),
+ * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] + Rs2.H[x];
+ * if (res[x] > 32767) {
+ *   res[x] = 32767;
+ *   OV = 1;
+ * } else if (res[x] < -32768) {
+ *   res[x] = -32768;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KADD16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* element-wise signed 16-bit sums, each saturated to the Q15 range */
+    __ASM volatile("kadd16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.27. KADD16 ===== */
+
+/* ===== Inline Function Start for 3.28. KADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief KADD64 (64-bit Signed Saturating Addition)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit signed integers. The result is saturated to the Q63 range.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit signed integer of an even/odd pair of registers
+ * specified by Rs1(4,1) with the 64-bit signed integer of an even/odd pair of registers specified by
+ * Rs2(4,1). If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the
+ * range and the OV bit is set to 1. The saturated result is written to an even/odd pair of registers
+ * specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction adds the 64-bit signed integer in Rs1 with the 64-bit signed
+ * integer in Rs2. If the result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the
+ * range and the OV bit is set to 1. The saturated result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ *  t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ *  a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ *  b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ *  result = R[a_H].R[a_L] + R[b_H].R[b_L];
+ *  if (result > (2^63)-1) {
+ *    result = (2^63)-1; OV = 1;
+ *  } else if (result < -2^63) {
+ *    result = -2^63; OV = 1;
+ *  }
+ *  R[t_H].R[t_L] = result;
+ * RV64:
+ *  result = Rs1 + Rs2;
+ *  if (result > (2^63)-1) {
+ *    result = (2^63)-1; OV = 1;
+ *  } else if (result < -2^63) {
+ *    result = -2^63; OV = 1;
+ *  }
+ *  Rd = result;
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    long long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_KADD64(long long a, long long b)
+{
+    long long rd; /* signed 64-bit sum of a and b, saturated to the Q63 range */
+    __ASM volatile("kadd64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.28. KADD64 ===== */
+
+/* ===== Inline Function Start for 3.29. KADDH ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KADDH (Signed Addition with Q15 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADDH Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add the signed lower 32-bit content of two registers with Q15 saturation.
+ *
+ * **Description**:\n
+ * The signed lower 32-bit content of Rs1 is added with the signed lower 32-bit content of
+ * Rs2. And the result is saturated to the 16-bit signed integer range of [-2^15, 2^15-1] and then sign-
+ * extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] + Rs2.W[0];
+ * if (tmp > 32767) {
+ *   res = 32767;
+ *   OV = 1;
+ * } else if (tmp < -32768) {
+ *   res = -32768;
+ *   OV = 1;
+ * } else {
+ *   res = tmp;
+ * }
+ * Rd = SE(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KADDH(int a, int b)
+{
+    long rd; /* a + b saturated to the signed 16-bit range, then sign-extended */
+    __ASM volatile("kaddh %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.29. KADDH ===== */
+
+/* ===== Inline Function Start for 3.30. KADDW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KADDW (Signed Addition with Q31 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADDW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add the lower 32-bit signed content of two registers with Q31 saturation.
+ *
+ * **Description**:\n
+ * The lower 32-bit signed content of Rs1 is added with the lower 32-bit signed content of
+ * Rs2. And the result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1] and then sign-
+ * extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] + Rs2.W[0];
+ * if (tmp > (2^31)-1) {
+ *   res = (2^31)-1;
+ *   OV = 1;
+ * } else if (tmp < -2^31) {
+ *   res = -2^31;
+ *   OV = 1;
+ * } else {
+ *   res = tmp;
+ * }
+ * Rd = res[31:0]; // RV32
+ * Rd = SE(res[31:0]) // RV64
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KADDW(int a, int b)
+{
+    long rd; /* a + b saturated to the signed 32-bit range, then sign-extended */
+    __ASM volatile("kaddw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.30. KADDW ===== */
+
+/* ===== Inline Function Start for 3.31. KCRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KCRAS16 (SIMD 16-bit Signed Saturating Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KCRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating addition and 16-bit signed integer element
+ * saturating subtraction in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-
+ * bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2; at the same time, it
+ * subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed
+ * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
+ * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for
+ * subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0];
+ * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16];
+ * for (res in [res1, res2]) {
+ *   if (res > (2^15)-1) {
+ *     res = (2^15)-1;
+ *     OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = -2^15;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KCRAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* per 32-bit chunk: [31:16] = saturated cross sum, [15:0] = saturated cross difference */
+    __ASM volatile("kcras16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.31. KCRAS16 ===== */
+
+/* ===== Inline Function Start for 3.32. KCRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KCRSA16 (SIMD 16-bit Signed Saturating Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KCRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element
+ * saturating addition in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit
+ * chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks
+ * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1; at the same time, it
+ * adds the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 with the 16-bit signed
+ * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
+ * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd
+ * for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0];
+ * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16];
+ * for (res in [res1, res2]) {
+ *   if (res > (2^15)-1) {
+ *     res = (2^15)-1;
+ *     OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = -2^15;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KCRSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* per 32-bit chunk: [31:16] = saturated cross difference, [15:0] = saturated cross sum */
+    __ASM volatile("kcrsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.32. KCRSA16 ===== */
+
+/* ===== Inline Function Start for 3.33.1. KDMBB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMBB (Signed Saturating Double Multiply B16 x B16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
+ * written into the destination register for RV32 or sign-extended to 64 bits and written into the
+ * destination register for RV64. If saturation happens, the overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
+ * RV64). Saturation happens when both Q15 inputs are 0x8000: the result is then
+ * saturated to 0x7FFFFFFF and the overflow flag OV is set.
+ * This intrinsic issues the BB form: bottom halfword of Rs1 times bottom halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplier
+ * \return Rd (long): the doubled, saturated Q31 product
+ */
+__STATIC_FORCEINLINE long __RV_KDMBB(unsigned int a, unsigned int b)
+{
+    register long result;
+    __ASM volatile("kdmbb %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.33.1. KDMBB ===== */
+
+/* ===== Inline Function Start for 3.33.2. KDMBT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMBT (Signed Saturating Double Multiply B16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
+ * written into the destination register for RV32 or sign-extended to 64 bits and written into the
+ * destination register for RV64. If saturation happens, the overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
+ * RV64). Saturation happens when both Q15 inputs are 0x8000: the result is then
+ * saturated to 0x7FFFFFFF and the overflow flag OV is set.
+ * This intrinsic issues the BT form: bottom halfword of Rs1 times top halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its top halfword, H[1], is the Q15 multiplier
+ * \return Rd (long): the doubled, saturated Q31 product
+ */
+__STATIC_FORCEINLINE long __RV_KDMBT(unsigned int a, unsigned int b)
+{
+    register long result;
+    __ASM volatile("kdmbt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.33.2. KDMBT ===== */
+
+/* ===== Inline Function Start for 3.33.3. KDMTT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMTT (Signed Saturating Double Multiply T16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
+ * written into the destination register for RV32 or sign-extended to 64 bits and written into the
+ * destination register for RV64. If saturation happens, the overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
+ * RV64). Saturation happens when both Q15 inputs are 0x8000: the result is then
+ * saturated to 0x7FFFFFFF and the overflow flag OV is set.
+ * This intrinsic issues the TT form: top halfword of Rs1 times top halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned int); its top halfword, H[1], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its top halfword, H[1], is the Q15 multiplier
+ * \return Rd (long): the doubled, saturated Q31 product
+ */
+__STATIC_FORCEINLINE long __RV_KDMTT(unsigned int a, unsigned int b)
+{
+    register long result;
+    __ASM volatile("kdmtt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.33.3. KDMTT ===== */
+
+/* ===== Inline Function Start for 3.34.1. KDMABB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMABB (Signed Saturating Double Multiply Addition B16 x B16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
+ * to the lower 32-bit chunk of the destination register and write the saturated, sign-extended
+ * addition result into the destination register. If saturation happens, the overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then added to the content of Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV flag is set to 1. The result after saturation is written to Rd.
+ * When both Q15 inputs are 0x8000, the doubled product saturates and the overflow flag OV is
+ * set.
+ * This intrinsic issues the BB form: bottom halfword of Rs1 times bottom halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd = Rd + resQ31; // RV32
+ * resadd = Rd.W[0] + resQ31; // RV64
+ * if (resadd > (2^31)-1) {
+ *   resadd = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd < -2^31) {
+ *   resadd = -2^31;
+ *   OV = 1;
+ * }
+ * Rd = resadd; // RV32
+ * Rd = SE(resadd); // RV64
+ * ~~~
+ *
+ * \note `t` is bound with a "+r" (read-write) constraint: it supplies the incoming value of Rd
+ *       and receives the result. Saturation sets the OV flag, a side effect that does not appear
+ *       in the asm operand list; the `volatile` qualifier keeps the compiler from discarding or
+ *       merging the instruction.
+ *
+ * \param [in]  t    accumulator held in Rd (long); on RV64 only its lower 32-bit chunk is added
+ * \param [in]  a    Rs1 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplier
+ * \return Rd (long): the saturated Q31 sum of the accumulator and the doubled product
+ */
+__STATIC_FORCEINLINE long __RV_KDMABB(long t, unsigned int a, unsigned int b)
+{
+    __ASM volatile("kdmabb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t;
+}
+/* ===== Inline Function End for 3.34.1. KDMABB ===== */
+
+/* ===== Inline Function Start for 3.34.2. KDMABT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMABT (Signed Saturating Double Multiply Addition B16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
+ * to the lower 32-bit chunk of the destination register and write the saturated, sign-extended
+ * addition result into the destination register. If saturation happens, the overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then added to the content of Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV flag is set to 1. The result after saturation is written to Rd.
+ * When both Q15 inputs are 0x8000, the doubled product saturates and the overflow flag OV is
+ * set.
+ * This intrinsic issues the BT form: bottom halfword of Rs1 times top halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd = Rd + resQ31; // RV32
+ * resadd = Rd.W[0] + resQ31; // RV64
+ * if (resadd > (2^31)-1) {
+ *   resadd = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd < -2^31) {
+ *   resadd = -2^31;
+ *   OV = 1;
+ * }
+ * Rd = resadd; // RV32
+ * Rd = SE(resadd); // RV64
+ * ~~~
+ *
+ * \note `t` is bound with a "+r" (read-write) constraint: it supplies the incoming value of Rd
+ *       and receives the result. Saturation sets the OV flag, a side effect that does not appear
+ *       in the asm operand list; the `volatile` qualifier keeps the compiler from discarding or
+ *       merging the instruction.
+ *
+ * \param [in]  t    accumulator held in Rd (long); on RV64 only its lower 32-bit chunk is added
+ * \param [in]  a    Rs1 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its top halfword, H[1], is the Q15 multiplier
+ * \return Rd (long): the saturated Q31 sum of the accumulator and the doubled product
+ */
+__STATIC_FORCEINLINE long __RV_KDMABT(long t, unsigned int a, unsigned int b)
+{
+    __ASM volatile("kdmabt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t;
+}
+/* ===== Inline Function End for 3.34.2. KDMABT ===== */
+
+/* ===== Inline Function Start for 3.34.3. KDMATT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMATT (Signed Saturating Double Multiply Addition T16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
+ * to the lower 32-bit chunk of the destination register and write the saturated, sign-extended
+ * addition result into the destination register. If saturation happens, the overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then added to the content of Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV flag is set to 1. The result after saturation is written to Rd.
+ * When both Q15 inputs are 0x8000, the doubled product saturates and the overflow flag OV is
+ * set.
+ * This intrinsic issues the TT form: top halfword of Rs1 times top halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd = Rd + resQ31; // RV32
+ * resadd = Rd.W[0] + resQ31; // RV64
+ * if (resadd > (2^31)-1) {
+ *   resadd = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd < -2^31) {
+ *   resadd = -2^31;
+ *   OV = 1;
+ * }
+ * Rd = resadd; // RV32
+ * Rd = SE(resadd); // RV64
+ * ~~~
+ *
+ * \note `t` is bound with a "+r" (read-write) constraint: it supplies the incoming value of Rd
+ *       and receives the result. Saturation sets the OV flag, a side effect that does not appear
+ *       in the asm operand list; the `volatile` qualifier keeps the compiler from discarding or
+ *       merging the instruction.
+ *
+ * \param [in]  t    accumulator held in Rd (long); on RV64 only its lower 32-bit chunk is added
+ * \param [in]  a    Rs1 operand (unsigned int); its top halfword, H[1], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its top halfword, H[1], is the Q15 multiplier
+ * \return Rd (long): the saturated Q31 sum of the accumulator and the doubled product
+ */
+__STATIC_FORCEINLINE long __RV_KDMATT(long t, unsigned int a, unsigned int b)
+{
+    __ASM volatile("kdmatt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t;
+}
+/* ===== Inline Function End for 3.34.3. KDMATT ===== */
+
+/* ===== Inline Function Start for 3.35.1. KHM8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief KHM8 (SIMD Signed Saturating Q7 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHM8 Rd, Rs1, Rs2
+ * KHMX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
+ * numbers again.
+ *
+ * **Description**:\n
+ * For the `KHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
+ * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
+ * For the `KHMX8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the
+ * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2.
+ * The Q14 results are then right-shifted 7 bits and saturated into Q7 values. The Q7 results are then
+ * written into Rd. When both Q7 inputs of a multiplication are 0x80, saturation happens:
+ * the result is saturated to 0x7F and the overflow flag OV is set.
+ * This intrinsic issues the `KHM8` (non-crossed) form.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (is `KHM8`) {
+ *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
+ *   op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
+ * } else if (is `KHMX8`) {
+ *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x]; // Rs1 top
+ *   op1b = Rs1.B[x]; op2b = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x80 != aop | 0x80 != bop) {
+ *     res = (aop s* bop) >> 7;
+ *   } else {
+ *     res = 0x7F;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.H[x/2] = concat(rest, resb);
+ * for RV32, x=0,2
+ * for RV64, x=0,2,4,6
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned long): packed 8-bit Q7 elements
+ * \param [in]  b    Rs2 operand (unsigned long): packed 8-bit Q7 elements
+ * \return Rd (unsigned long): packed 8-bit Q7 products, each in the byte position of its Rs1 input
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHM8(unsigned long a, unsigned long b)
+{
+    register unsigned long result;
+    __ASM volatile("khm8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.35.1. KHM8 ===== */
+
+/* ===== Inline Function Start for 3.35.2. KHMX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief KHMX8 (SIMD Signed Saturating Crossed Q7 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHM8 Rd, Rs1, Rs2
+ * KHMX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
+ * numbers again.
+ *
+ * **Description**:\n
+ * For the `KHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
+ * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
+ * For the `KHMX8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the
+ * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2.
+ * The Q14 results are then right-shifted 7 bits and saturated into Q7 values. The Q7 results are then
+ * written into Rd. When both Q7 inputs of a multiplication are 0x80, saturation happens:
+ * the result is saturated to 0x7F and the overflow flag OV is set.
+ * This intrinsic issues the `KHMX8` (crossed) form.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (is `KHM8`) {
+ *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
+ *   op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
+ * } else if (is `KHMX8`) {
+ *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x]; // Rs1 top
+ *   op1b = Rs1.B[x]; op2b = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x80 != aop | 0x80 != bop) {
+ *     res = (aop s* bop) >> 7;
+ *   } else {
+ *     res = 0x7F;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.H[x/2] = concat(rest, resb);
+ * for RV32, x=0,2
+ * for RV64, x=0,2,4,6
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned long): packed 8-bit Q7 elements
+ * \param [in]  b    Rs2 operand (unsigned long): packed 8-bit Q7 elements, paired crosswise
+ *                   with those of Rs1 inside each 16-bit chunk
+ * \return Rd (unsigned long): packed 8-bit Q7 products, each in the byte position of its Rs1 input
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMX8(unsigned long a, unsigned long b)
+{
+    register unsigned long result;
+    __ASM volatile("khmx8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.35.2. KHMX8 ===== */
+
+/* ===== Inline Function Start for 3.36.1. KHM16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief KHM16 (SIMD Signed Saturating Q15 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHM16 Rd, Rs1, Rs2
+ * KHMX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
+ * Q15 numbers again.
+ *
+ * **Description**:\n
+ * For the `KHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
+ * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
+ * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
+ * Rs2.
+ * For the `KHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the
+ * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15
+ * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2.
+ * The Q30 results are then right-shifted 15 bits and saturated into Q15 values. The Q15 results are
+ * then written into Rd. When both Q15 inputs of a multiplication are 0x8000, saturation
+ * happens: the result is saturated to 0x7FFF and the overflow flag OV is set.
+ * This intrinsic issues the `KHM16` (non-crossed) form.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (is `KHM16`) {
+ *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
+ *   op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
+ * } else if (is `KHMX16`) {
+ *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top
+ *   op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x8000 != aop | 0x8000 != bop) {
+ *     res = (aop s* bop) >> 15;
+ *   } else {
+ *     res = 0x7FFF;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x/2] = concat(rest, resb);
+ * for RV32: x=0
+ * for RV64: x=0,2
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned long): packed 16-bit Q15 elements
+ * \param [in]  b    Rs2 operand (unsigned long): packed 16-bit Q15 elements
+ * \return Rd (unsigned long): packed 16-bit Q15 products, each in the halfword position of its
+ *         Rs1 input
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHM16(unsigned long a, unsigned long b)
+{
+    register unsigned long result;
+    __ASM volatile("khm16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.36.1. KHM16 ===== */
+
+/* ===== Inline Function Start for 3.36.2. KHMX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief KHMX16 (SIMD Signed Saturating Crossed Q15 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHM16 Rd, Rs1, Rs2
+ * KHMX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
+ * Q15 numbers again.
+ *
+ * **Description**:\n
+ * For the `KHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
+ * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
+ * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
+ * Rs2.
+ * For the `KHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the
+ * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15
+ * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2.
+ * The Q30 results are then right-shifted 15 bits and saturated into Q15 values. The Q15 results are
+ * then written into Rd. When both Q15 inputs of a multiplication are 0x8000, saturation
+ * happens: the result is saturated to 0x7FFF and the overflow flag OV is set.
+ * This intrinsic issues the `KHMX16` (crossed) form.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (is `KHM16`) {
+ *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
+ *   op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
+ * } else if (is `KHMX16`) {
+ *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top
+ *   op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x8000 != aop | 0x8000 != bop) {
+ *     res = (aop s* bop) >> 15;
+ *   } else {
+ *     res = 0x7FFF;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x/2] = concat(rest, resb);
+ * for RV32: x=0
+ * for RV64: x=0,2
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned long): packed 16-bit Q15 elements
+ * \param [in]  b    Rs2 operand (unsigned long): packed 16-bit Q15 elements, paired crosswise
+ *                   with those of Rs1 inside each 32-bit chunk
+ * \return Rd (unsigned long): packed 16-bit Q15 products, each in the halfword position of its
+ *         Rs1 input
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMX16(unsigned long a, unsigned long b)
+{
+    register unsigned long result;
+    __ASM volatile("khmx16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.36.2. KHMX16 ===== */
+
+/* ===== Inline Function Start for 3.37.1. KHMBB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KHMBB (Signed Saturating Half Multiply B16 x B16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
+ * number again and saturate the Q15 result into the destination register. If saturation happens, the
+ * overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * right-shifted 15 bits and saturated into a Q15 value. The Q15 value is then sign-extended and
+ * written into Rd. When both Q15 inputs are 0x8000, saturation happens: the result is saturated
+ * to 0x7FFF and the overflow flag OV is set.
+ * This intrinsic issues the BB form: bottom halfword of Rs1 times bottom halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd = SE32(res[15:0]); // RV32
+ * Rd = SE64(res[15:0]); // RV64
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplier
+ * \return Rd (long): the saturated Q15 product, sign-extended to the register width
+ */
+__STATIC_FORCEINLINE long __RV_KHMBB(unsigned int a, unsigned int b)
+{
+    register long result;
+    __ASM volatile("khmbb %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.37.1. KHMBB ===== */
+
+/* ===== Inline Function Start for 3.37.2. KHMBT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KHMBT (Signed Saturating Half Multiply B16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
+ * number again and saturate the Q15 result into the destination register. If saturation happens, the
+ * overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * right-shifted 15 bits and saturated into a Q15 value. The Q15 value is then sign-extended and
+ * written into Rd. When both Q15 inputs are 0x8000, saturation happens: the result is saturated
+ * to 0x7FFF and the overflow flag OV is set.
+ * This intrinsic issues the BT form: bottom halfword of Rs1 times top halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd = SE32(res[15:0]); // RV32
+ * Rd = SE64(res[15:0]); // RV64
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned int); its bottom halfword, H[0], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its top halfword, H[1], is the Q15 multiplier
+ * \return Rd (long): the saturated Q15 product, sign-extended to the register width
+ */
+__STATIC_FORCEINLINE long __RV_KHMBT(unsigned int a, unsigned int b)
+{
+    register long result;
+    __ASM volatile("khmbt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.37.2. KHMBT ===== */
+
+/* ===== Inline Function Start for 3.37.3. KHMTT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KHMTT (Signed Saturating Half Multiply T16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
+ * number again and saturate the Q15 result into the destination register. If saturation happens, the
+ * overflow flag OV is set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * right-shifted 15 bits and saturated into a Q15 value. The Q15 value is then sign-extended and
+ * written into Rd. When both Q15 inputs are 0x8000, saturation happens: the result is saturated
+ * to 0x7FFF and the overflow flag OV is set.
+ * This intrinsic issues the TT form: top halfword of Rs1 times top halfword of Rs2.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
+ * if (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd = SE32(res[15:0]); // RV32
+ * Rd = SE64(res[15:0]); // RV64
+ * ~~~
+ *
+ * \note Saturation sets the OV flag, a side effect that does not appear in the asm operand list;
+ *       the `volatile` qualifier keeps the compiler from discarding or merging the instruction.
+ *
+ * \param [in]  a    Rs1 operand (unsigned int); its top halfword, H[1], is the Q15 multiplicand
+ * \param [in]  b    Rs2 operand (unsigned int); its top halfword, H[1], is the Q15 multiplier
+ * \return Rd (long): the saturated Q15 product, sign-extended to the register width
+ */
+__STATIC_FORCEINLINE long __RV_KHMTT(unsigned int a, unsigned int b)
+{
+    register long result;
+    __ASM volatile("khmtt %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.37.3. KHMTT ===== */
+
+/* ===== Inline Function Start for 3.38.1. KMABB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMABB (SIMD Saturating Signed Multiply Bottom Halfs & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB Rd, Rs1, Rs2
+ * KMABT Rd, Rs1, Rs2
+ * KMATT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
+ * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
+ * third register. The addition result may be saturated and is written to the third register.
+ * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
+ * * KMABT: rd.W[x] + bottom*top (per 32-bit element)
+ * * KMATT: rd.W[x] + top*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2.
+ * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2.
+ * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2.
+ * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
+ * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
+ * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ * This intrinsic issues the `KMABB` form.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \note `t` is bound with a "+r" (read-write) constraint: it supplies the incoming value of Rd
+ *       and receives the result. Saturation sets the OV flag, a side effect that does not appear
+ *       in the asm operand list; the `volatile` qualifier keeps the compiler from discarding or
+ *       merging the instruction.
+ *
+ * \param [in]  t    accumulator held in Rd (long): 32-bit elements that the products are added to
+ * \param [in]  a    Rs1 operand (unsigned long); the bottom halfword of each 32-bit element is used
+ * \param [in]  b    Rs2 operand (unsigned long); the bottom halfword of each 32-bit element is used
+ * \return Rd (long): the saturated 32-bit sums, one per 32-bit element
+ */
+__STATIC_FORCEINLINE long __RV_KMABB(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmabb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t;
+}
+/* ===== Inline Function End for 3.38.1. KMABB ===== */
+
+/* ===== Inline Function Start for 3.38.2. KMABT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMABT (SIMD Saturating Signed Multiply Bottom & Top Halfs & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB Rd, Rs1, Rs2
+ * KMABT Rd, Rs1, Rs2
+ * KMATT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
+ * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
+ * third register. The addition result may be saturated and is written to the third register.
+ * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
+ * * KMABT: rd.W[x] + bottom*top (per 32-bit element)
+ * * KMATT: rd.W[x] + top*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2.
+ * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2.
+ * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2.
+ * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
+ * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
+ * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMABT(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmabt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmabt */
+}
+/* ===== Inline Function End for 3.38.2. KMABT ===== */
+
+/* ===== Inline Function Start for 3.38.3. KMATT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMATT (SIMD Saturating Signed Multiply Top Halfs & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB Rd, Rs1, Rs2
+ * KMABT Rd, Rs1, Rs2
+ * KMATT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
+ * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
+ * third register. The addition result may be saturated and is written to the third register.
+ * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
+ * * KMABT: rd.W[x] + bottom*top (per 32-bit element)
+ * * KMATT: rd.W[x] + top*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2.
+ * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2.
+ * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2.
+ * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
+ * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
+ * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMATT(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmatt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmatt */
+}
+/* ===== Inline Function End for 3.38.3. KMATT ===== */
+
+/* ===== Inline Function Start for 3.39.1. KMADA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMADA (SIMD Saturating Signed Multiply Two Halfs and Two Adds)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADA Rd, Rs1, Rs2
+ * KMAXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then adds
+ * the two 32-bit results and 32-bit elements in a third register together. The addition result may be
+ * saturated.
+ * * KMADA: rd.W[x] + top*top + bottom*bottom (per 32-bit element)
+ * * KMAXDA: rd.W[x] + top*bottom + bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADA` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMAXDA` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of multiplying
+ * the bottom 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in
+ * Rs2.
+ * The result is added to the content of 32-bit elements in Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The 32-bit
+ * results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADA
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMAXDA
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmada %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmada */
+}
+/* ===== Inline Function End for 3.39.1. KMADA ===== */
+
+/* ===== Inline Function Start for 3.39.2. KMAXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMAXDA (SIMD Saturating Signed Crossed Multiply Two Halfs and Two Adds)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADA Rd, Rs1, Rs2
+ * KMAXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then adds
+ * the two 32-bit results and 32-bit elements in a third register together. The addition result may be
+ * saturated.
+ * * KMADA: rd.W[x] + top*top + bottom*bottom (per 32-bit element)
+ * * KMAXDA: rd.W[x] + top*bottom + bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADA` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMAXDA` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of multiplying
+ * the bottom 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in
+ * Rs2.
+ * The result is added to the content of 32-bit elements in Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The 32-bit
+ * results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADA
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMAXDA
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMAXDA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmaxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmaxda */
+}
+/* ===== Inline Function End for 3.39.2. KMAXDA ===== */
+
+/* ===== Inline Function Start for 3.40.1. KMADS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMADS (SIMD Saturating Signed Multiply Two Halfs & Subtract & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS Rd, Rs1, Rs2
+ * KMADRS Rd, Rs1, Rs2
+ * KMAXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the corresponding 32-bit elements in a third register. The addition result may be saturated.
+ * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
+ * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
+ * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-
+ * bit elements in Rs2.
+ * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
+ * elements in Rs2.
+ * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
+ * and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMADRS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * // KMAXDS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADS(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmads %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmads */
+}
+/* ===== Inline Function End for 3.40.1. KMADS ===== */
+
+/* ===== Inline Function Start for 3.40.2. KMADRS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMADRS (SIMD Saturating Signed Multiply Two Halfs & Reverse Subtract & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS Rd, Rs1, Rs2
+ * KMADRS Rd, Rs1, Rs2
+ * KMAXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the corresponding 32-bit elements in a third register. The addition result may be saturated.
+ * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
+ * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
+ * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-
+ * bit elements in Rs2.
+ * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
+ * elements in Rs2.
+ * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
+ * and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMADRS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * // KMAXDS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADRS(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmadrs %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmadrs */
+}
+/* ===== Inline Function End for 3.40.2. KMADRS ===== */
+
+/* ===== Inline Function Start for 3.40.3. KMAXDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMAXDS (SIMD Saturating Signed Crossed Multiply Two Halfs & Subtract & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS Rd, Rs1, Rs2
+ * KMADRS Rd, Rs1, Rs2
+ * KMAXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the corresponding 32-bit elements in a third register. The addition result may be saturated.
+ * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
+ * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
+ * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-
+ * bit elements in Rs2.
+ * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
+ * elements in Rs2.
+ * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
+ * and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMADRS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * // KMAXDS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMAXDS(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmaxds %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmaxds */
+}
+/* ===== Inline Function End for 3.40.3. KMAXDS ===== */
+
+/* ===== Inline Function Start for 3.41. KMAR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief KMAR64 (Signed Multiply and Saturating Add to 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMAR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed elements in two registers and add the 64-bit multiplication
+ * results to the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
+ * saturated to the Q63 range and written back to the pair of registers (RV32) or the register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It adds
+ * the 64-bit multiplication result to the 64-bit signed data of an even/odd pair of registers specified by
+ * Rd(4,1) with unlimited precision. If the 64-bit addition result is beyond the Q63 number range (-2^63 <=
+ * Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The saturated result is written back
+ * to the even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
+ * adds the 64-bit multiplication results to the 64-bit signed data of Rd with unlimited precision. If the
+ * 64-bit addition result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range
+ * and the OV bit is set to 1. The saturated result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * result = R[t_H].R[t_L] + (Rs1 * Rs2);
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * RV64:
+ * // `result` has unlimited precision
+ * result = Rd + (Rs1.W[0] * Rs2.W[0]) + (Rs1.W[1] * Rs2.W[1]);
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_KMAR64(long long t, long a, long b)
+{   /* NOTE(review): on RV32 the text above needs Rd to be an even/odd register pair; confirm "+r" on a long long guarantees that. */
+    __ASM volatile("kmar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is the 64-bit accumulator, read then overwritten; a=Rs1, b=Rs2 */
+    return t; /* 64-bit value left in Rd by kmar64 */
+}
+/* ===== Inline Function End for 3.41. KMAR64 ===== */
+
+/* ===== Inline Function Start for 3.42.1. KMDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMDA (SIMD Signed Multiply Two Halfs and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMDA Rd, Rs1, Rs2
+ * KMXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * adds the two 32-bit results together. The addition result may be saturated.
+ * * KMDA: top*top + bottom*bottom (per 32-bit element)
+ * * KMXDA: top*bottom + bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
+ * bit elements of Rs2.
+ * For the `KMXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the
+ * 32-bit elements of Rs2.
+ * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1.
+ * The final results are written to Rd. The 16-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000)) {
+ *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMDA
+ *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMXDA
+ * } else { Rd.W[x] = 0x7fffffff; OV = 1; } // for RV32: x=0, for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMDA(unsigned long a, unsigned long b)
+{
+    long result; /* written by the asm below ("=r"), hence no initializer */
+    __ASM volatile("kmda %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* "=r": Rd is output only (no accumulator); a=Rs1, b=Rs2 */
+    return result; /* value left in Rd by kmda */
+}
+/* ===== Inline Function End for 3.42.1. KMDA ===== */
+
+/* ===== Inline Function Start for 3.42.2. KMXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMXDA (SIMD Signed Crossed Multiply Two Halfs and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMDA Rd, Rs1, Rs2
+ * KMXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * adds the two 32-bit results together. The addition result may be saturated.
+ * * KMDA: top*top + bottom*bottom (per 32-bit element)
+ * * KMXDA: top*bottom + bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
+ * bit elements of Rs2.
+ * For the `KMXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the
+ * 32-bit elements of Rs2.
+ * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1.
+ * The final results are written to Rd. The 16-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000)) {
+ *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMDA
+ *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMXDA
+ * } else { Rd.W[x] = 0x7fffffff; OV = 1; } // for RV32: x=0, for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMXDA(unsigned long a, unsigned long b)
+{
+    long result; /* written by the asm below ("=r"), hence no initializer */
+    __ASM volatile("kmxda %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* "=r": Rd is output only (no accumulator); a=Rs1, b=Rs2 */
+    return result; /* value left in Rd by kmxda */
+}
+/* ===== Inline Function End for 3.42.2. KMXDA ===== */
+
+/* ===== Inline Function Start for 3.43.1. KMMAC ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KMMAC (SIMD Saturating MSW Signed Multiply Word and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAC Rd, Rs1, Rs2
+ * KMMAC.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers and add the most significant
+ * 32-bit results with the signed 32-bit integer elements of a third register. The addition results are
+ * saturated first and then written back to the third register. The `.u` form performs an additional
+ * rounding up operation on the multiplication results before adding the most significant 32-bit part
+ * of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
+ * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If
+ * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range
+ * and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
+ * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
+ * adding a 1 to bit 31 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][63:32];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAC(long t, long a, long b)
+{
+    __ASM volatile("kmmac %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmmac */
+}
+/* ===== Inline Function End for 3.43.1. KMMAC ===== */
+
+/* ===== Inline Function Start for 3.43.2. KMMAC.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KMMAC.u (SIMD Saturating MSW Signed Multiply Word and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAC Rd, Rs1, Rs2
+ * KMMAC.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers and add the most significant
+ * 32-bit results with the signed 32-bit integer elements of a third register. The addition results are
+ * saturated first and then written back to the third register. The `.u` form performs an additional
+ * rounding up operation on the multiplication results before adding the most significant 32-bit part
+ * of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
+ * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If
+ * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range
+ * and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
+ * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
+ * adding a 1 to bit 31 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][63:32];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAC_U(long t, long a, long b)
+{
+    __ASM volatile("kmmac.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, read as the addend and overwritten; a=Rs1, b=Rs2 */
+    return t; /* value left in Rd by kmmac.u (the rounding form) */
+}
+/* ===== Inline Function End for 3.43.2. KMMAC.u ===== */
+
+/* ===== Inline Function Start for 3.44.1. KMMAWB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWB (SIMD Saturating MSW Signed Multiply Word and Bottom Half and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWB Rd, Rs1, Rs2
+ * KMMAWB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register and add the most significant 32-bit results with
+ * the corresponding signed 32-bit elements of a third register. The addition result is written to the
+ * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
+ * results from the most significant discarded bit before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
+ * of the corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication
+ * results with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
+ * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
+ * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
+ * bit 15 of the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][47:16];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWB(long t, unsigned long a, unsigned long b)
+{
+    /*
+     * kmmawb: per 32-bit element, multiply the word of a by the signed bottom
+     * halfword of b, then add the upper 32 bits of the 48-bit product to the
+     * accumulator with Q31 saturation. The accumulator is a read-write ("+r")
+     * operand; volatile is kept because saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmawb %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.44.1. KMMAWB ===== */
+
+/* ===== Inline Function Start for 3.44.2. KMMAWB.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWB.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWB Rd, Rs1, Rs2
+ * KMMAWB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register and add the most significant 32-bit results with
+ * the corresponding signed 32-bit elements of a third register. The addition result is written to the
+ * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
+ * results from the most significant discarded bit before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
+ * of the corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication
+ * results with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
+ * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
+ * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
+ * bit 15 of the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][47:16];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWB_U(long t, unsigned long a, unsigned long b)
+{
+    /*
+     * kmmawb.u: same as kmmawb, except that a 1 is added to bit 15 of the
+     * 48-bit product (rounding) before its upper 32 bits are accumulated.
+     * The accumulator is a read-write ("+r") operand; volatile is kept because
+     * saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmawb.u %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.44.2. KMMAWB.u ===== */
+
+/* ===== Inline Function Start for 3.45.1. KMMAWB2 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWB2 (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWB2 Rd, Rs1, Rs2
+ * KMMAWB2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and add the
+ * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
+ * register. The saturated addition result is written to the corresponding 32-bit elements of the third
+ * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
+ * before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
+ * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
+ * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
+ * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
+ * the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
+ *   addop.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
+ *   if (`.u` form) {
+ *     Mres[x][47:14] = Mres[x][47:14] + 1;
+ *   }
+ *   addop.W[x] = Mres[x][46:15]; // doubling
+ * }
+ * res[x] = Rd.W[x] + addop.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWB2(long t, unsigned long a, unsigned long b)
+{
+    /*
+     * kmmawb2: per 32-bit element, multiply the Q31 word of a by the Q15 bottom
+     * halfword of b, double the product, and add its upper 32 bits to the
+     * accumulator with Q31 saturation (0x80000000 * 0x8000 contributes
+     * 0x7fffffff). The accumulator is a read-write ("+r") operand; volatile is
+     * kept because saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmawb2 %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.45.1. KMMAWB2 ===== */
+
+/* ===== Inline Function Start for 3.45.2. KMMAWB2.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWB2.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWB2 Rd, Rs1, Rs2
+ * KMMAWB2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and add the
+ * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
+ * register. The saturated addition result is written to the corresponding 32-bit elements of the third
+ * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
+ * before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
+ * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
+ * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
+ * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
+ * the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
+ *   addop.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
+ *   if (`.u` form) {
+ *     Mres[x][47:14] = Mres[x][47:14] + 1;
+ *   }
+ *   addop.W[x] = Mres[x][46:15]; // doubling
+ * }
+ * res[x] = Rd.W[x] + addop.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWB2_U(long t, unsigned long a, unsigned long b)
+{
+    /*
+     * kmmawb2.u: same as kmmawb2, except that the product is rounded by adding
+     * a 1 at bit 14 (before doubling) prior to taking the accumulated bits.
+     * The accumulator is a read-write ("+r") operand; volatile is kept because
+     * saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmawb2.u %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.45.2. KMMAWB2.u ===== */
+
+/* ===== Inline Function Start for 3.46.1. KMMAWT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWT (SIMD Saturating MSW Signed Multiply Word and Top Half and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWT Rd, Rs1, Rs2
+ * KMMAWT.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the signed top 16-bit of the
+ * corresponding 32-bit elements of another register and add the most significant 32-bit results with
+ * the corresponding signed 32-bit elements of a third register. The addition results are written to the
+ * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
+ * results from the most significant discarded bit before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed top 16-bit of the
+ * corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication results
+ * with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
+ * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
+ * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
+ * bit 15 of the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][47:16];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWT(long t, unsigned long a, unsigned long b)
+{
+    /*
+     * kmmawt: per 32-bit element, multiply the word of a by the signed top
+     * halfword of b, then add the upper 32 bits of the 48-bit product to the
+     * accumulator with Q31 saturation. The accumulator is a read-write ("+r")
+     * operand; volatile is kept because saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmawt %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.46.1. KMMAWT ===== */
+
+/* ===== Inline Function Start for 3.46.2. KMMAWT.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWT.u (SIMD Saturating MSW Signed Multiply Word and Top Half and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWT Rd, Rs1, Rs2
+ * KMMAWT.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the signed top 16-bit of the
+ * corresponding 32-bit elements of another register and add the most significant 32-bit results with
+ * the corresponding signed 32-bit elements of a third register. The addition results are written to the
+ * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
+ * results from the most significant discarded bit before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed top 16-bit of the
+ * corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication results
+ * with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
+ * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
+ * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
+ * bit 15 of the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][47:16];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWT_U(long t, unsigned long a, unsigned long b)
+{
+    /*
+     * kmmawt.u: same as kmmawt, except that a 1 is added to bit 15 of the
+     * 48-bit product (rounding) before its upper 32 bits are accumulated.
+     * The accumulator is a read-write ("+r") operand; volatile is kept because
+     * saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmawt.u %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.46.2. KMMAWT.u ===== */
+
+/* ===== Inline Function Start for 3.47.1. KMMAWT2 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWT2 (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWT2 Rd, Rs1, Rs2
+ * KMMAWT2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and add the
+ * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
+ * register. The saturated addition result is written to the corresponding 32-bit elements of the third
+ * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
+ * before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
+ * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
+ * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
+ * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
+ * the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
+ *   addop.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
+ *   if (`.u` form) {
+ *     Mres[x][47:14] = Mres[x][47:14] + 1;
+ *   }
+ *   addop.W[x] = Mres[x][46:15]; // doubling
+ * }
+ * res[x] = Rd.W[x] + addop.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWT2(long t, unsigned long a, unsigned long b)
+{
+    /*
+     * kmmawt2: per 32-bit element, multiply the Q31 word of a by the Q15 top
+     * halfword of b, double the product, and add its upper 32 bits to the
+     * accumulator with Q31 saturation (0x80000000 * 0x8000 contributes
+     * 0x7fffffff). The accumulator is a read-write ("+r") operand; volatile is
+     * kept because saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmawt2 %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.47.1. KMMAWT2 ===== */
+
+/* ===== Inline Function Start for 3.47.2. KMMAWT2.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWT2.u (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWT2 Rd, Rs1, Rs2
+ * KMMAWT2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and add the
+ * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
+ * register. The saturated addition result is written to the corresponding 32-bit elements of the third
+ * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
+ * before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
+ * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
+ * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
+ * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
+ * the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
+ *   addop.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
+ *   if (`.u` form) {
+ *     Mres[x][47:14] = Mres[x][47:14] + 1;
+ *   }
+ *   addop.W[x] = Mres[x][46:15]; // doubling
+ * }
+ * res[x] = Rd.W[x] + addop.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWT2_U(long t, unsigned long a, unsigned long b)
+{
+    /*
+     * kmmawt2.u: same as kmmawt2, except that the product is rounded by adding
+     * a 1 at bit 14 (before doubling) prior to taking the accumulated bits.
+     * The accumulator is a read-write ("+r") operand; volatile is kept because
+     * saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmawt2.u %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.47.2. KMMAWT2.u ===== */
+
+/* ===== Inline Function Start for 3.48.1. KMMSB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KMMSB (SIMD Saturating MSW Signed Multiply Word and Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMSB Rd, Rs1, Rs2
+ * KMMSB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers and subtract the most
+ * significant 32-bit results from the signed 32-bit elements of a third register. The subtraction results
+ * are written to the third register. The `.u` form performs an additional rounding up operation on
+ * the multiplication results before subtracting the most significant 32-bit part of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
+ * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of
+ * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the
+ * range and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
+ * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
+ * adding a 1 to bit 31 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   res[x] = Rd.W[x] - Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] - Mres[x][63:32];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMSB(long t, long a, long b)
+{
+    /*
+     * kmmsb: per 32-bit element, subtract the upper 32 bits of the signed
+     * 64-bit product a * b from the accumulator and saturate the difference to
+     * the Q31 range. The accumulator is a read-write ("+r") operand; volatile
+     * is kept because saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmsb %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.48.1. KMMSB ===== */
+
+/* ===== Inline Function Start for 3.48.2. KMMSB.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KMMSB.u (SIMD Saturating MSW Signed Multiply Word and Subtract with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMSB Rd, Rs1, Rs2
+ * KMMSB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers and subtract the most
+ * significant 32-bit results from the signed 32-bit elements of a third register. The subtraction results
+ * are written to the third register. The `.u` form performs an additional rounding up operation on
+ * the multiplication results before subtracting the most significant 32-bit part of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
+ * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of
+ * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the
+ * range and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
+ * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
+ * adding a 1 to bit 31 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   res[x] = Rd.W[x] - Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] - Mres[x][63:32];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMSB_U(long t, long a, long b)
+{
+    /*
+     * kmmsb.u: same as kmmsb, except that a 1 is added to bit 31 of the 64-bit
+     * product (rounding) before its upper 32 bits are subtracted.
+     * The accumulator is a read-write ("+r") operand; volatile is kept because
+     * saturation also sets the OV flag.
+     */
+    long acc = t;
+    __ASM volatile(
+        "kmmsb.u %0, %1, %2"
+        : "+r"(acc)
+        : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.48.2. KMMSB.u ===== */
+
+/* ===== Inline Function Start for 3.49.1. KMMWB2 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMWB2 (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMWB2 Rd, Rs1, Rs2
+ * KMMWB2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and write the
+ * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
+ * form rounds up the results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
+ * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
+ * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
+ *   if (`.u` form) {
+ *     Round[x][32:0] = Mres[x][46:14] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][46:15];
+ *   }
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMWB2(long a, unsigned long b)
+{
+    /*
+     * kmmwb2: per 32-bit element, multiply the Q31 word of a by the Q15 bottom
+     * halfword of b, double the product and return its upper 32 bits
+     * (0x80000000 * 0x8000 saturates to 0x7fffffff). The destination is a
+     * write-only ("=r") operand; volatile is kept because the saturating case
+     * also sets the OV flag.
+     */
+    register long rd;
+    __ASM volatile(
+        "kmmwb2 %0, %1, %2"
+        : "=r"(rd)
+        : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.49.1. KMMWB2 ===== */
+
+/* ===== Inline Function Start for 3.49.2. KMMWB2.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMWB2.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMWB2 Rd, Rs1, Rs2
+ * KMMWB2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and write the
+ * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
+ * form rounds up the results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
+ * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
+ * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
+ *   if (`.u` form) {
+ *     Round[x][32:0] = Mres[x][46:14] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][46:15];
+ *   }
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMWB2_U(long a, unsigned long b)
+{
+    /*
+     * kmmwb2.u: same as kmmwb2, except that the product is rounded by adding a
+     * 1 at bit 14 (before doubling) prior to taking the returned bits.
+     * The destination is a write-only ("=r") operand; volatile is kept because
+     * the saturating case also sets the OV flag.
+     */
+    register long rd;
+    __ASM volatile(
+        "kmmwb2.u %0, %1, %2"
+        : "=r"(rd)
+        : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.49.2. KMMWB2.u ===== */
+
+/* ===== Inline Function Start for 3.50.1. KMMWT2 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMWT2 (SIMD Saturating MSW Signed Multiply Word and Top Half & 2)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMWT2 Rd, Rs1, Rs2
+ * KMMWT2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and write the
+ * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
+ * form rounds up the results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
+ * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
+ * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
+ *   if (`.u` form) {
+ *     Round[x][32:0] = Mres[x][46:14] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][46:15];
+ *   }
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMWT2(long a, unsigned long b)
+{
+    /*
+     * kmmwt2: per 32-bit element, multiply the Q31 word of a by the Q15 top
+     * halfword of b, double the product and return its upper 32 bits
+     * (0x80000000 * 0x8000 saturates to 0x7fffffff). The destination is a
+     * write-only ("=r") operand; volatile is kept because the saturating case
+     * also sets the OV flag.
+     */
+    register long rd;
+    __ASM volatile(
+        "kmmwt2 %0, %1, %2"
+        : "=r"(rd)
+        : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.50.1. KMMWT2 ===== */
+
+/* ===== Inline Function Start for 3.50.2. KMMWT2.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMWT2.u (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMWT2 Rd, Rs1, Rs2
+ * KMMWT2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and write the
+ * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
+ * form rounds up the results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
+ * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
+ * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
+ *   if (`.u` form) {
+ *     Round[x][32:0] = Mres[x][46:14] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][46:15];
+ *   }
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMWT2_U(long a, unsigned long b)
+{
+    register long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("kmmwt2.u %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.50.2. KMMWT2.u ===== */
+
+/* ===== Inline Function Start for 3.51.1. KMSDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMSDA (SIMD Saturating Signed Multiply Two Halfs & Add & Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSDA Rd, Rs1, Rs2
+ * KMSXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * subtracts the two 32-bit results from the corresponding 32-bit elements of a third register. The
+ * subtraction result may be saturated.
+ * * KMSDA: rd.W[x] - top*top - bottom*bottom (per 32-bit element)
+ * * KMSXDA: rd.W[x] - top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMSDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `KMSXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the
+ * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * The two 32-bit multiplication results are then subtracted from the content of the corresponding 32-
+ * bit elements of Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to Rd. The
+ * 16-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMSDA
+ * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMSXDA
+ * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMSDA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmsda %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: passed in and overwritten in place */
+}
+/* ===== Inline Function End for 3.51.1. KMSDA ===== */
+
+/* ===== Inline Function Start for 3.51.2. KMSXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMSXDA (SIMD Saturating Signed Crossed Multiply Two Halfs & Add & Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSDA Rd, Rs1, Rs2
+ * KMSXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * subtracts the two 32-bit results from the corresponding 32-bit elements of a third register. The
+ * subtraction result may be saturated.
+ * * KMSDA: rd.W[x] - top*top - bottom*bottom (per 32-bit element)
+ * * KMSXDA: rd.W[x] - top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMSDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `KMSXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the
+ * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * The two 32-bit multiplication results are then subtracted from the content of the corresponding 32-
+ * bit elements of Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to Rd. The
+ * 16-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMSDA
+ * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMSXDA
+ * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMSXDA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmsxda %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: passed in and overwritten in place */
+}
+/* ===== Inline Function End for 3.51.2. KMSXDA ===== */
+
+/* ===== Inline Function Start for 3.52. KMSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief KMSR64 (Signed Multiply and Saturating Subtract from 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed elements in two registers and subtract the 64-bit multiplication
+ * results from the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
+ * saturated to the Q63 range and written back to the pair of registers (RV32) or the register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication result from the 64-bit signed data of an even/odd pair of registers
+ * specified by Rd(4,1) with unlimited precision. If the 64-bit subtraction result is beyond the Q63
+ * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The saturated
+ * result is written back to the even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication results from the 64-bit signed data in Rd with unlimited
+ * precision. If the 64-bit subtraction result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is
+ * saturated to the range and the OV bit is set to 1. The saturated result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * result = R[t_H].R[t_L] - (Rs1 * Rs2);
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * RV64:
+ * // `result` has unlimited precision
+ * result = Rd - (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]);
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_KMSR64(long long t, long a, long b)
+{
+    __ASM volatile("kmsr64 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: passed in and overwritten in place */
+}
+/* ===== Inline Function End for 3.52. KMSR64 ===== */
+
+/* ===== Inline Function Start for 3.53. KSLLW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSLLW (Saturating Shift Left Logical for Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do logical left shift operation with saturation on a 32-bit word. The shift amount is a
+ * variable from a GPR.
+ *
+ * **Description**:\n
+ * The first word data in Rs1 is left-shifted logically. The shifted out bits are filled with
+ * zero and the shift amount is specified by the low-order 5-bits of the value in the Rs2 register. Any
+ * shifted value greater than 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated
+ * to -2^31. And the saturated result is sign-extended and written to Rd. If any saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * res[(31+sa):0] = Rs1.W[0] << sa;
+ * if (res > (2^31)-1) {
+ *   res = 0x7fffffff; OV = 1;
+ * } else if (res < -2^31) {
+ *   res = 0x80000000; OV = 1;
+ * }
+ * Rd[31:0] = res[31:0]; // RV32
+ * Rd[63:0] = SE(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSLLW(long a, unsigned int b)
+{
+    register long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("ksllw %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.53. KSLLW ===== */
+
+/* ===== Inline Function Start for 3.54. KSLLIW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSLLIW (Saturating Shift Left Logical Immediate for Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLIW Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do logical left shift operation with saturation on a 32-bit word. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * The first word data in Rs1 is left-shifted logically. The shifted out bits are filled with
+ * zero and the shift amount is specified by the imm5u constant. Any shifted value greater than 2^31-1 is
+ * saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated to -2^31. And the saturated result is
+ * sign-extended and written to Rd. If any saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u;
+ * res[(31+sa):0] = Rs1.W[0] << sa;
+ * if (res > (2^31)-1) {
+ *   res = 0x7fffffff; OV = 1;
+ * } else if (res < -2^31) {
+ *   res = 0x80000000; OV = 1;
+ * }
+ * Rd[31:0] = res[31:0]; // RV32
+ * Rd[63:0] = SE(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+#define __RV_KSLLIW(a, b)    \
+    ({    \
+        register long __a = (long)(a); /* evaluate `a` first so no macro-local can shadow it */    \
+        register long __rd;            /* not `result`: callers often pass a variable of that name */    \
+        __ASM volatile("kslliw %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
+        __rd;    \
+    })
+/* ===== Inline Function End for 3.54. KSLLIW ===== */
+
+/* ===== Inline Function Start for 3.55. KSLL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief KSLL8 (SIMD 8-bit Saturating Shift Left Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLL8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is a variable from a GPR.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
+ * Any shifted value greater than 2^7-1 is saturated to 2^7-1. Any shifted value smaller than -2^7 is
+ * saturated to -2^7. And the saturated results are written to Rd. If any saturation is performed, set OV
+ * bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa != 0) {
+ *   res[(7+sa):0] = Rs1.B[x] << sa;
+ *   if (res > (2^7)-1) {
+ *     res = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLL8(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("ksll8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.55. KSLL8 ===== */
+
+/* ===== Inline Function Start for 3.56. KSLLI8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief KSLLI8 (SIMD 8-bit Saturating Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLI8 Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is an immediate value.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the imm3u constant. Any shifted value greater than
+ * 2^7-1 is saturated to 2^7-1. Any shifted value smaller than -2^7 is saturated to -2^7. And the saturated
+ * results are written to Rd. If any saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa != 0) {
+ *   res[(7+sa):0] = Rs1.B[x] << sa;
+ *   if (res > (2^7)-1) {
+ *     res = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_KSLLI8(a, b)    \
+    ({    \
+        register unsigned long __a = (unsigned long)(a); /* evaluate `a` first so no macro-local can shadow it */    \
+        register unsigned long __rd;                     /* not `result`: callers often pass a variable of that name */    \
+        __ASM volatile("kslli8 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
+        __rd;    \
+    })
+/* ===== Inline Function End for 3.56. KSLLI8 ===== */
+
+/* ===== Inline Function Start for 3.57. KSLL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief KSLL16 (SIMD 16-bit Saturating Shift Left Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLL16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is a variable from a GPR.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the low-order 4-bits of the value in the Rs2 register.
+ * Any shifted value greater than 2^15-1 is saturated to 2^15-1. Any shifted value smaller than -2^15 is
+ * saturated to -2^15. And the saturated results are written to Rd. If any saturation is performed, set OV
+ * bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa != 0) {
+ *   res[(15+sa):0] = Rs1.H[x] << sa;
+ *   if (res > (2^15)-1) {
+ *     res = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLL16(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("ksll16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.57. KSLL16 ===== */
+
+/* ===== Inline Function Start for 3.58. KSLLI16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief KSLLI16 (SIMD 16-bit Saturating Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLI16 Rd, Rs1, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is an immediate value.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the imm4u constant. Any shifted value greater than
+ * 2^15-1 is saturated to 2^15-1. Any shifted value smaller than -2^15 is saturated to -2^15. And the saturated
+ * results are written to Rd. If any saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u[3:0];
+ * if (sa != 0) {
+ *   res[(15+sa):0] = Rs1.H[x] << sa;
+ *   if (res > (2^15)-1) {
+ *     res = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_KSLLI16(a, b)    \
+    ({    \
+        register unsigned long __a = (unsigned long)(a); /* evaluate `a` first so no macro-local can shadow it */    \
+        register unsigned long __rd;                     /* not `result`: callers often pass a variable of that name */    \
+        __ASM volatile("kslli16 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
+        __rd;    \
+    })
+/* ===== Inline Function End for 3.58. KSLLI16 ===== */
+
+/* ===== Inline Function Start for 3.59.1. KSLRA8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief KSLRA8 (SIMD 8-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA8 Rd, Rs1, Rs2
+ * KSLRA8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q7 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
+ * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
+ * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
+ * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. For the `.u` form
+ * of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[3:0] < 0) {
+ *   sa = -Rs2[3:0];
+ *   sa = (sa == 8)? 7 : sa;
+ *   if (`.u` form) {
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else {
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[2:0];
+ *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
+ *   if (res > (2^7)-1) {
+ *     res[7:0] = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res[7:0] = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA8(unsigned long a, int b)
+{
+    register unsigned long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("kslra8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.59.1. KSLRA8 ===== */
+
+/* ===== Inline Function Start for 3.59.2. KSLRA8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief KSLRA8.u (SIMD 8-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA8 Rd, Rs1, Rs2
+ * KSLRA8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q7 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
+ * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
+ * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
+ * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. For the `.u` form
+ * of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[3:0] < 0) {
+ *   sa = -Rs2[3:0];
+ *   sa = (sa == 8)? 7 : sa;
+ *   if (`.u` form) {
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else {
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[2:0];
+ *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
+ *   if (res > (2^7)-1) {
+ *     res[7:0] = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res[7:0] = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA8_U(unsigned long a, int b)
+{
+    register unsigned long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("kslra8.u %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.59.2. KSLRA8.u ===== */
+
+/* ===== Inline Function Start for 3.60.1. KSLRA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief KSLRA16 (SIMD 16-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA16 Rd, Rs1, Rs2
+ * KSLRA16.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q15 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
+ * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
+ * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
+ * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. For the `.u`
+ * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[4:0] < 0) {
+ *   sa = -Rs2[4:0];
+ *   sa = (sa == 16)? 15 : sa;
+ *   if (`.u` form) {
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else {
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[3:0];
+ *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
+ *   if (res > (2^15)-1) {
+ *     res[15:0] = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res[15:0] = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA16(unsigned long a, int b)
+{
+    register unsigned long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("kslra16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.60.1. KSLRA16 ===== */
+
+/* ===== Inline Function Start for 3.60.2. KSLRA16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief KSLRA16.u (SIMD 16-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA16 Rd, Rs1, Rs2
+ * KSLRA16.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q15 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
+ * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
+ * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
+ * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. For the `.u`
+ * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[4:0] < 0) {
+ *   sa = -Rs2[4:0];
+ *   sa = (sa == 16)? 15 : sa;
+ *   if (`.u` form) {
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else {
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[3:0];
+ *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
+ *   if (res > (2^15)-1) {
+ *     res[15:0] = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res[15:0] = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA16_U(unsigned long a, int b)
+{
+    register unsigned long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("kslra16.u %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.60.2. KSLRA16.u ===== */
+
+/* ===== Inline Function Start for 3.61. KSLRAW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSLRAW (Shift Left Logical with Q31 Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRAW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a logical left (positive) or arithmetic right (negative) shift operation with Q31
+ * saturation for the left shift on a 32-bit data.
+ *
+ * **Description**:\n
+ * The lower 32-bit content of Rs1 is left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
+ * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[5:0] clamped to the actual shift range of [0, 31].
+ * The left-shifted result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. After the shift
+ * operation, the final result is bit-31 sign-extended and written to Rd. If any saturation happens, this
+ * instruction sets the OV flag. The value of Rs2[31:6] will not affect the operation of this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[5:0] < 0) {
+ *   sa = -Rs2[5:0];
+ *   sa = (sa == 32)? 31 : sa;
+ *   res[31:0] = Rs1.W[0] >>(arith) sa;
+ * } else {
+ *   sa = Rs2[5:0];
+ *   tmp = Rs1.W[0] <<(logic) sa;
+ *   if (tmp > (2^31)-1) {
+ *     res[31:0] = (2^31)-1;
+ *     OV = 1;
+ *   } else if (tmp < -2^31) {
+ *     res[31:0] = -2^31;
+ *     OV = 1;
+ *   } else {
+ *     res[31:0] = tmp[31:0];
+ *   }
+ * }
+ * Rd = res[31:0]; // RV32
+ * Rd = SE64(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSLRAW(int a, int b)
+{
+    register long rd; /* output operand: written by the instruction, then returned */
+    __ASM volatile("kslraw %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.61. KSLRAW ===== */
+
+/* ===== Inline Function Start for 3.62. KSLRAW.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSLRAW.u (Shift Left Logical with Q31 Saturation or Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRAW.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a logical left (positive) or arithmetic right (negative) shift operation with Q31
+ * saturation for the left shift and a rounding up operation for the right shift on a 32-bit data.
+ *
+ * **Description**:\n
+ * The lower 32-bit content of Rs1 is left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
+ * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[5:0] clamped to the actual shift range of [0, 31].
+ * The left-shifted result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. The right-shifted
+ * result has a 1 added at the most significant discarded bit position for rounding effect. After the shift,
+ * saturation, or rounding, the final result is bit-31 sign-extended and written to Rd. If any saturation
+ * happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect the operation of this
+ * instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[5:0] < 0) {
+ *   sa = -Rs2[5:0];
+ *   sa = (sa == 32)? 31 : sa;
+ *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
+ *   rst[31:0] = res[31:0];
+ * } else {
+ *   sa = Rs2[5:0];
+ *   tmp = Rs1.W[0] <<(logic) sa;
+ *   if (tmp > (2^31)-1) {
+ *     rst[31:0] = (2^31)-1;
+ *     OV = 1;
+ *   } else if (tmp < -2^31) {
+ *     rst[31:0] = -2^31;
+ *     OV = 1;
+ *   } else {
+ *     rst[31:0] = tmp[31:0];
+ *   }
+ * }
+ * Rd = rst[31:0]; // RV32
+ * Rd = SE64(rst[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSLRAW_U(int a, int b)
+{
+    long rd; /* receives the value KSLRAW.u writes to its destination operand */
+    __ASM volatile("kslraw.u %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.62. KSLRAW.u ===== */
+
+/* ===== Inline Function Start for 3.63. KSTAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KSTAS16 (SIMD 16-bit Signed Saturating Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSTAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating addition and 16-bit signed integer element
+ * saturating subtraction in a 32-bit chunk simultaneously. Operands are from corresponding
+ * positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2; at the same time, it
+ * subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed
+ * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
+ * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for
+ * subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16];
+ * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0];
+ * for (res in [res1, res2]) {
+ *   if (res > (2^15)-1) {
+ *     res = (2^15)-1;
+ *     OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = -2^15;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSTAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* output operand of the KSTAS16 instruction */
+    __ASM volatile("kstas16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.63. KSTAS16 ===== */
+
+/* ===== Inline Function Start for 3.64. KSTSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KSTSA16 (SIMD 16-bit Signed Saturating Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSTSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element
+ * saturating addition in a 32-bit chunk simultaneously. Operands are from corresponding positions in
+ * 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks
+ * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1; at the same time, it
+ * adds the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 with the 16-bit signed integer
+ * element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number range (-2^15
+ * <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
+ * written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd for
+ * addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16];
+ * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0];
+ * for (res in [res1, res2]) {
+ *   if (res > (2^15)-1) {
+ *     res = (2^15)-1;
+ *     OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = -2^15;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSTSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* output operand of the KSTSA16 instruction */
+    __ASM volatile("kstsa16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.64. KSTSA16 ===== */
+
+/* ===== Inline Function Start for 3.65. KSUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief KSUB8 (SIMD 8-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <=
+ * 2^7-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] - Rs2.B[x];
+ * if (res[x] > (2^7)-1) {
+ *   res[x] = (2^7)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^7) {
+ *   res[x] = -2^7;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSUB8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* output operand of the KSUB8 instruction */
+    __ASM volatile("ksub8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.65. KSUB8 ===== */
+
+/* ===== Inline Function Start for 3.66. KSUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KSUB16 (SIMD 16-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <=
+ * 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] - Rs2.H[x];
+ * if (res[x] > (2^15)-1) {
+ *   res[x] = (2^15)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^15) {
+ *   res[x] = -2^15;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSUB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* output operand of the KSUB16 instruction */
+    __ASM volatile("ksub16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.66. KSUB16 ===== */
+
+/* ===== Inline Function Start for 3.67. KSUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief KSUB64 (64-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit signed integer subtraction. The result is saturated to the Q63 range.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit signed integer of an even/odd pair of
+ * registers specified by Rs2(4,1) from the 64-bit signed integer of an even/odd pair of registers
+ * specified by Rs1(4,1). If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is
+ * saturated to the range and the OV bit is set to 1. The saturated result is then written to an even/odd
+ * pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit signed integer of Rs2 from the 64-bit signed
+ * integer of Rs1. If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated
+ * to the range and the OV bit is set to 1. The saturated result is then written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * result = R[a_H].R[a_L] - R[b_H].R[b_L];
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * RV64:
+ * result = Rs1 - Rs2;
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    long long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_KSUB64(long long a, long long b)
+{
+    long long rd; /* long long output operand of the KSUB64 instruction */
+    __ASM volatile("ksub64 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.67. KSUB64 ===== */
+
+/* ===== Inline Function Start for 3.68. KSUBH ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KSUBH (Signed Subtraction with Q15 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUBH Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract the signed lower 32-bit content of two registers with Q15 saturation.
+ *
+ * **Description**:\n
+ * The signed lower 32-bit content of Rs2 is subtracted from the signed lower 32-bit
+ * content of Rs1. And the result is saturated to the 16-bit signed integer range of [-2^15, 2^15-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] - Rs2.W[0];
+ * if (tmp > (2^15)-1) {
+ *   res = (2^15)-1;
+ *   OV = 1;
+ * } else if (tmp < -2^15) {
+ *   res = -2^15;
+ *   OV = 1;
+ * } else {
+ *   res = tmp;
+ * }
+ * Rd = SE(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSUBH(int a, int b)
+{
+    long rd; /* receives the value KSUBH writes to its destination operand */
+    __ASM volatile("ksubh %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.68. KSUBH ===== */
+
+/* ===== Inline Function Start for 3.69. KSUBW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSUBW (Signed Subtraction with Q31 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUBW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract the signed lower 32-bit content of two registers with Q31 saturation.
+ *
+ * **Description**:\n
+ * The signed lower 32-bit content of Rs2 is subtracted from the signed lower 32-bit
+ * content of Rs1. And the result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] - Rs2.W[0];
+ * if (tmp > (2^31)-1) {
+ *   res = (2^31)-1;
+ *   OV = 1;
+ * } else if (tmp < -2^31) {
+ *   res = -2^31;
+ *   OV = 1;
+ * } else {
+ *   res = tmp;
+ * }
+ * Rd = res[31:0]; // RV32
+ * Rd = SE(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSUBW(int a, int b)
+{
+    long rd; /* receives the value KSUBW writes to its destination operand */
+    __ASM volatile("ksubw %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.69. KSUBW ===== */
+
+/* ===== Inline Function Start for 3.70.1. KWMMUL ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KWMMUL (SIMD Saturating MSW Signed Multiply Word & Double)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KWMMUL Rd, Rs1, Rs2
+ * KWMMUL.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers, shift the results left 1-bit,
+ * saturate, and write the most significant 32-bit results to a register. The `.u` form additionally
+ * rounds up the multiplication results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts
+ * the multiplication results one bit to the left and takes the most significant 32-bit results. If the
+ * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element
+ * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The `.u`
+ * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit
+ * 30 before the shift and saturation operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((0x80000000 != Rs1.W[x]) | (0x80000000 != Rs2.W[x])) {
+ *   Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ *   if (`.u` form) {
+ *     Round[x][33:0] = Mres[x][63:30] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][62:31];
+ *   }
+ * } else {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KWMMUL(long a, long b)
+{
+    long rd; /* receives the value KWMMUL writes to its destination operand */
+    __ASM volatile("kwmmul %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.70.1. KWMMUL ===== */
+
+/* ===== Inline Function Start for 3.70.2. KWMMUL.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KWMMUL.u (SIMD Saturating MSW Signed Multiply Word & Double with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KWMMUL Rd, Rs1, Rs2
+ * KWMMUL.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers, shift the results left 1-bit,
+ * saturate, and write the most significant 32-bit results to a register. The `.u` form additionally
+ * rounds up the multiplication results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts
+ * the multiplication results one bit to the left and takes the most significant 32-bit results. If the
+ * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element
+ * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The `.u`
+ * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit
+ * 30 before the shift and saturation operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((0x80000000 != Rs1.W[x]) | (0x80000000 != Rs2.W[x])) {
+ *   Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ *   if (`.u` form) {
+ *     Round[x][33:0] = Mres[x][63:30] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][62:31];
+ *   }
+ * } else {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KWMMUL_U(long a, long b)
+{
+    long rd; /* receives the value KWMMUL.u writes to its destination operand */
+    __ASM volatile("kwmmul.u %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.70.2. KWMMUL.u ===== */
+
+/* ===== Inline Function Start for 3.71. MADDR32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief MADDR32 (Multiply and Add to 32-Bit Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MADDR32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit contents of two registers and add the lower 32-bit multiplication result
+ * to the 32-bit content of a destination register. Write the final result back to the destination register.
+ *
+ * **Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2. It adds the
+ * lower 32-bit multiplication result to the lower 32-bit content of Rd and writes the final result (RV32)
+ * or sign-extended result (RV64) back to Rd. The contents of Rs1 and Rs2 can be either signed or
+ * unsigned integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mresult = Rs1 * Rs2;
+ * Rd = Rd + Mresult.W[0];
+ * RV64:
+ * Mresult = Rs1.W[0] * Rs2.W[0];
+ * tres[31:0] = Rd.W[0] + Mresult.W[0];
+ * Rd = SE64(tres[31:0]);
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_MADDR32(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("maddr32 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is both read and overwritten by the asm through its "+r" operand */
+}
+/* ===== Inline Function End for 3.71. MADDR32 ===== */
+
+/* ===== Inline Function Start for 3.72. MAXW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief MAXW (32-bit Signed Word Maximum)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MAXW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the larger value from the 32-bit contents of two general registers.
+ *
+ * **Description**:\n
+ * This instruction compares two signed 32-bit integers stored in Rs1 and Rs2, picks the
+ * larger value as the result, and writes the result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[0] >= Rs2.W[0]) {
+ *   Rd = SE(Rs1.W[0]);
+ * } else {
+ *   Rd = SE(Rs2.W[0]);
+ * }
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_MAXW(int a, int b)
+{
+    long rd; /* receives the value MAXW writes to its destination operand */
+    __ASM volatile("maxw %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.72. MAXW ===== */
+
+/* ===== Inline Function Start for 3.73. MINW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief MINW (32-bit Signed Word Minimum)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MINW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the smaller value from the 32-bit contents of two general registers.
+ *
+ * **Description**:\n
+ * This instruction compares two signed 32-bit integers stored in Rs1 and Rs2, picks the
+ * smaller value as the result, and writes the result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[0] >= Rs2.W[0]) { Rd = SE(Rs2.W[0]); } else { Rd = SE(Rs1.W[0]); }
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_MINW(int a, int b)
+{
+    long rd; /* receives the value MINW writes to its destination operand */
+    __ASM volatile("minw %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.73. MINW ===== */
+
+/* ===== Inline Function Start for 3.74. MSUBR32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief MSUBR32 (Multiply and Subtract from 32-Bit Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MSUBR32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit contents of two registers and subtract the lower 32-bit multiplication
+ * result from the 32-bit content of a destination register. Write the final result back to the destination
+ * register.
+ *
+ * **Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2, subtracts
+ * the lower 32-bit multiplication result from the lower 32-bit content of Rd, then writes the final
+ * result (RV32) or sign-extended result (RV64) back to Rd. The contents of Rs1 and Rs2 can be either
+ * signed or unsigned integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mresult = Rs1 * Rs2;
+ * Rd = Rd - Mresult.W[0];
+ * RV64:
+ * Mresult = Rs1.W[0] * Rs2.W[0];
+ * tres[31:0] = Rd.W[0] - Mresult.W[0];
+ * Rd = SE64(tres[31:0]);
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_MSUBR32(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("msubr32 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is both read and overwritten by the asm through its "+r" operand */
+}
+/* ===== Inline Function End for 3.74. MSUBR32 ===== */
+
+/* ===== Inline Function Start for 3.75. MULR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief MULR64 (Multiply Word Unsigned to 64-bit Data)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MULR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned integer contents of two registers and write the 64-bit result.
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit content of Rs1 with that of Rs2 and writes the 64-bit
+ * multiplication result to an even/odd pair of registers containing Rd. Rd(4,1) index d determines the
+ * even/odd pair group of the two registers. Specifically, the register pair includes register 2d and
+ * 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ * The lower 32-bit contents of Rs1 and Rs2 are treated as unsigned integers.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2 and writes the 64-bit
+ * multiplication result to Rd.
+ * The lower 32-bit contents of Rs1 and Rs2 are treated as unsigned integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mresult = CONCAT(1`b0,Rs1) u* CONCAT(1`b0,Rs2);
+ * R[Rd(4,1).1(0)][31:0] = Mresult[63:32];
+ * R[Rd(4,1).0(0)][31:0] = Mresult[31:0];
+ * RV64:
+ * Mresult = CONCAT(1`b0,Rs1.W[0]) u* CONCAT(1`b0,Rs2.W[0]);
+ * Rd = Mresult[63:0];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_MULR64(unsigned long a, unsigned long b)
+{
+    unsigned long long rd; /* unsigned long long output operand of the MULR64 instruction */
+    __ASM volatile("mulr64 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.75. MULR64 ===== */
+
+/* ===== Inline Function Start for 3.76. MULSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief MULSR64 (Multiply Word Signed to 64-bit Data)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MULSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed integer contents of two registers and write the 64-bit result.
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with the lower 32-bit content of Rs2 and
+ * writes the 64-bit multiplication result to an even/odd pair of registers containing Rd. Rd(4,1) index d
+ * determines the even/odd pair group of the two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ * The lower 32-bit contents of Rs1 and Rs2 are treated as signed integers.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with the lower 32-bit content of Rs2 and
+ * writes the 64-bit multiplication result to Rd.
+ * The lower 32-bit contents of Rs1 and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mresult = Rs1 s* Rs2;
+ * R[Rd(4,1).1(0)][31:0] = Mresult[63:32];
+ * R[Rd(4,1).0(0)][31:0] = Mresult[31:0];
+ * RV64:
+ * Mresult = Rs1.W[0] s* Rs2.W[0];
+ * Rd = Mresult[63:0];
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_MULSR64(long a, long b)
+{
+    long long rd; /* long long output operand of the MULSR64 instruction */
+    __ASM volatile("mulsr64 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.76. MULSR64 ===== */
+
+/* ===== Inline Function Start for 3.77. PBSAD ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief PBSAD (Parallel Byte Sum of Absolute Difference)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PBSAD Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Calculate the sum of absolute difference of unsigned 8-bit data elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the unsigned 8-bit elements of Rs2 from those of Rs1. Then
+ * it adds the absolute value of each difference together and writes the result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * absdiff[x] = ABS(Rs1.B[x] - Rs2.B[x]);
+ * Rd = SUM(absdiff[x]);
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PBSAD(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* output operand of the PBSAD instruction */
+    __ASM volatile("pbsad %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.77. PBSAD ===== */
+
+/* ===== Inline Function Start for 3.78. PBSADA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief PBSADA (Parallel Byte Sum of Absolute Difference Accum)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PBSADA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Calculate the sum of absolute difference of four unsigned 8-bit data elements and
+ * accumulate it into a register.
+ *
+ * **Description**:\n
+ * This instruction subtracts the unsigned 8-bit elements of Rs2 from those of Rs1. It
+ * then adds the absolute value of each difference together along with the content of Rd and writes the
+ * accumulated result back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * absdiff[x] = ABS(Rs1.B[x] - Rs2.B[x]);
+ * Rd = Rd + SUM(absdiff[x]);
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PBSADA(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("pbsada %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is both read and overwritten by the asm through its "+r" operand */
+}
+/* ===== Inline Function End for 3.78. PBSADA ===== */
+
+/* ===== Inline Function Start for 3.79.1. PKBB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
+ * \brief PKBB16 (Pack Two 16-bit Data from Both Bottom Half)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB16 Rd, Rs1, Rs2
+ * PKBT16 Rd, Rs1, Rs2
+ * PKTT16 Rd, Rs1, Rs2
+ * PKTB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 16-bit data from 32-bit chunks in two registers.
+ * * PKBB16: bottom.bottom
+ * * PKBT16: bottom.top
+ * * PKTT16: top.top
+ * * PKTB16: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
+ * Rd.W[x] [15:0].
+ * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKBB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* output operand of the PKBB16 instruction */
+    __ASM volatile("pkbb16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.79.1. PKBB16 ===== */
+
+/* ===== Inline Function Start for 3.79.2. PKBT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
+ * \brief PKBT16 (Pack Two 16-bit Data from Bottom and Top Half)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB16 Rd, Rs1, Rs2
+ * PKBT16 Rd, Rs1, Rs2
+ * PKTT16 Rd, Rs1, Rs2
+ * PKTB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 16-bit data from 32-bit chunks in two registers.
+ * * PKBB16: bottom.bottom
+ * * PKBT16: bottom.top
+ * * PKTT16: top.top
+ * * PKTB16: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
+ * Rd.W[x] [15:0].
+ * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKBT16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* per 32-bit chunk: CONCAT(a[15:0], b[31:16]) */
+    __ASM volatile("pkbt16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 3.79.2. PKBT16 ===== */
+
+/* ===== Inline Function Start for 3.79.3. PKTT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
+ * \brief PKTT16 (Pack Two 16-bit Data from Both Top Half)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB16 Rd, Rs1, Rs2
+ * PKBT16 Rd, Rs1, Rs2
+ * PKTT16 Rd, Rs1, Rs2
+ * PKTB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 16-bit data from 32-bit chunks in two registers.
+ * * PKBB16: bottom.bottom
+ * * PKBT16: bottom.top
+ * * PKTT16: top.top
+ * * PKTB16: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
+ * Rd.W[x] [15:0].
+ * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKTT16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* per 32-bit chunk: CONCAT(a[31:16], b[31:16]) */
+    __ASM volatile("pktt16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 3.79.3. PKTT16 ===== */
+
+/* ===== Inline Function Start for 3.79.4. PKTB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
+ * \brief PKTB16 (Pack Two 16-bit Data from Top and Bottom Half)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB16 Rd, Rs1, Rs2
+ * PKBT16 Rd, Rs1, Rs2
+ * PKTT16 Rd, Rs1, Rs2
+ * PKTB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 16-bit data from 32-bit chunks in two registers.
+ * * PKBB16: bottom.bottom
+ * * PKBT16: bottom.top
+ * * PKTT16: top.top
+ * * PKTB16: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
+ * Rd.W[x] [15:0].
+ * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKTB16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* per 32-bit chunk: CONCAT(a[31:16], b[15:0]) */
+    __ASM volatile("pktb16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 3.79.4. PKTB16 ===== */
+
+/* ===== Inline Function Start for 3.80. RADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief RADD8 (SIMD 8-bit Signed Halving Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer element additions simultaneously. The element results are halved
+ * to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
+ * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
+ * Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7F, Rs2 = 0x7F, Rd = 0x7F
+ * * Rs1 = 0x80, Rs2 = 0x80, Rd = 0x80
+ * * Rs1 = 0x40, Rs2 = 0x80, Rd = 0xE0
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] + Rs2.B[x]) s>> 1; for RV32: x=3...0, for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RADD8(unsigned long a, unsigned long b)
+{
+    register unsigned long halved; /* per byte lane: (a.B[x] + b.B[x]) s>> 1 */
+    __ASM volatile("radd8 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.80. RADD8 ===== */
+
+/* ===== Inline Function Start for 3.81. RADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RADD16 (SIMD 16-bit Signed Halving Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element additions simultaneously. The results are halved to avoid
+ * overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
+ * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
+ * Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFF, Rs2 = 0x7FFF, Rd = 0x7FFF
+ * * Rs1 = 0x8000, Rs2 = 0x8000, Rd = 0x8000
+ * * Rs1 = 0x4000, Rs2 = 0x8000, Rd = 0xE000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] + Rs2.H[x]) s>> 1; for RV32: x=1...0, for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RADD16(unsigned long a, unsigned long b)
+{
+    register unsigned long halved; /* per halfword lane: (a.H[x] + b.H[x]) s>> 1 */
+    __ASM volatile("radd16 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.81. RADD16 ===== */
+
+/* ===== Inline Function Start for 3.82. RADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief RADD64 (64-bit Signed Halving Addition)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit signed integers. The result is halved to avoid overflow or saturation.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit signed integer of an even/odd pair of registers
+ * specified by Rs1(4,1) with the 64-bit signed integer of an even/odd pair of registers specified by
+ * Rs2(4,1). The 64-bit addition result is first arithmetically right-shifted by 1 bit and then written to an
+ * even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction adds the 64-bit signed integer in Rs1 with the 64-bit signed
+ * integer in Rs2. The 64-bit addition result is first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * R[t_H].R[t_L] = (R[a_H].R[a_L] + R[b_H].R[b_L]) s>> 1;
+ * RV64:
+ * Rd = (Rs1 + Rs2) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    long long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_RADD64(long long a, long long b)
+{
+    register long long halved; /* 64-bit signed (a + b) s>> 1 */
+    __ASM volatile("radd64 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.82. RADD64 ===== */
+
+/* ===== Inline Function Start for 3.83. RADDW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief RADDW (32-bit Signed Halving Addition)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADDW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add 32-bit signed integers and the results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the first 32-bit signed integer in Rs1 with the first 32-bit signed
+ * integer in Rs2. The result is first arithmetically right-shifted by 1 bit and then sign-extended and
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF, Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x80000000, Rd = 0x80000000
+ * * Rs1 = 0x40000000, Rs2 = 0x80000000, Rd = 0xE0000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Rd[31:0] = (Rs1[31:0] + Rs2[31:0]) s>> 1;
+ * RV64:
+ * resw[31:0] = (Rs1[31:0] + Rs2[31:0]) s>> 1;
+ * Rd[63:0] = SE(resw[31:0]);
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_RADDW(int a, int b)
+{
+    register long halved; /* 32-bit signed (a + b) s>> 1, sign-extended on RV64 */
+    __ASM volatile("raddw %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.83. RADDW ===== */
+
+/* ===== Inline Function Start for 3.84. RCRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RCRAS16 (SIMD 16-bit Signed Halving Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RCRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in
+ * a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. The results
+ * are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit
+ * signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed integer element in
+ * [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and
+ * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD16` and `RSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) s>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) s>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RCRAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long halved; /* hi: (a.hi + b.lo) s>> 1, lo: (a.lo - b.hi) s>> 1 */
+    __ASM volatile("rcras16 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.84. RCRAS16 ===== */
+
+/* ===== Inline Function Start for 3.85. RCRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RCRSA16 (SIMD 16-bit Signed Halving Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RCRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in
+ * a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. The results
+ * are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks
+ * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit
+ * signed integer element in [15:0] of 32-bit chunks in Rs1 with the 16-bit signed integer element in
+ * [31:16] of 32-bit chunks in Rs2. The two results are first arithmetically right-shifted by 1 bit and
+ * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD16` and `RSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) s>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) s>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RCRSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long halved; /* hi: (a.hi - b.lo) s>> 1, lo: (a.lo + b.hi) s>> 1 */
+    __ASM volatile("rcrsa16 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.85. RCRSA16 ===== */
+
+/* ===== Inline Function Start for 3.86. RDOV ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC
+ * \brief RDOV (Read OV flag)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RDOV Rd  # pseudo mnemonic
+ * ~~~
+ *
+ * **Purpose**:\n
+ * This pseudo instruction is an alias to `CSRR Rd, ucode` instruction which maps to the real
+ * instruction of `CSRRS Rd, ucode, x0`.
+ *
+ *
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RDOV(void)
+{
+    register unsigned long ov; /* value read by the `rdov` pseudo instruction (CSRR Rd, ucode) */
+    __ASM volatile("rdov %0" : "=r"(ov));
+    return ov;
+}
+/* ===== Inline Function End for 3.86. RDOV ===== */
+
+/* ===== Inline Function Start for 3.87. RSTAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RSTAS16 (SIMD 16-bit Signed Halving Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSTAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in
+ * a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. The
+ * results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2, and subtracts the 16-bit
+ * signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed integer element in
+ * [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and
+ * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD16` and `RSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][31:16]) s>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][15:0]) s>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSTAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long halved; /* hi: (a.hi + b.hi) s>> 1, lo: (a.lo - b.lo) s>> 1 */
+    __ASM volatile("rstas16 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.87. RSTAS16 ===== */
+
+/* ===== Inline Function Start for 3.88. RSTSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RSTSA16 (SIMD 16-bit Signed Halving Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSTSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in
+ * a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. The
+ * results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks
+ * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit
+ * signed integer element in [15:0] of 32-bit chunks in Rs1 with the 16-bit signed integer element in
+ * [15:0] of 32-bit chunks in Rs2. The two results are first arithmetically right-shifted by 1 bit and then
+ * written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD16` and `RSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][31:16]) s>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][15:0]) s>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSTSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long halved; /* hi: (a.hi - b.hi) s>> 1, lo: (a.lo + b.lo) s>> 1 */
+    __ASM volatile("rstsa16 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.88. RSTSA16 ===== */
+
+/* ===== Inline Function Start for 3.89. RSUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief RSUB8 (SIMD 8-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
+ * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7F, Rs2 = 0x80, Rd = 0x7F
+ * * Rs1 = 0x80, Rs2 = 0x7F, Rd = 0x80
+ * * Rs1 = 0x80, Rs2 = 0x40, Rd = 0xA0
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] - Rs2.B[x]) s>> 1;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSUB8(unsigned long a, unsigned long b)
+{
+    register unsigned long halved; /* per byte lane: (a.B[x] - b.B[x]) s>> 1 */
+    __ASM volatile("rsub8 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.89. RSUB8 ===== */
+
+/* ===== Inline Function Start for 3.90. RSUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RSUB16 (SIMD 16-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
+ * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFF, Rs2 = 0x8000, Rd = 0x7FFF
+ * * Rs1 = 0x8000, Rs2 = 0x7FFF, Rd = 0x8000
+ * * Rs1 = 0x8000, Rs2 = 0x4000, Rd = 0xA000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) s>> 1;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSUB16(unsigned long a, unsigned long b)
+{
+    register unsigned long halved; /* per halfword lane: (a.H[x] - b.H[x]) s>> 1 */
+    __ASM volatile("rsub16 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.90. RSUB16 ===== */
+
+/* ===== Inline Function Start for 3.91. RSUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief RSUB64 (64-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit signed integer subtraction. The result is halved to avoid overflow or
+ * saturation.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit signed integer of an even/odd pair of
+ * registers specified by Rs2(4,1) from the 64-bit signed integer of an even/odd pair of registers
+ * specified by Rs1(4,1). The subtraction result is first arithmetically right-shifted by 1 bit and then
+ * written to an even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit signed integer in Rs2 from the 64-bit signed
+ * integer in Rs1. The 64-bit subtraction result is first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * R[t_H].R[t_L] = (R[a_H].R[a_L] - R[b_H].R[b_L]) s>> 1;
+ * RV64:
+ * Rd = (Rs1 - Rs2) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    long long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_RSUB64(long long a, long long b)
+{
+    register long long halved; /* 64-bit signed (a - b) s>> 1 */
+    __ASM volatile("rsub64 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.91. RSUB64 ===== */
+
+/* ===== Inline Function Start for 3.92. RSUBW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief RSUBW (32-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUBW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract 32-bit signed integers and the result is halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the first 32-bit signed integer in Rs2 from the first 32-bit
+ * signed integer in Rs1. The result is first arithmetically right-shifted by 1 bit and then sign-extended
+ * and written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x80000000, Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x7FFFFFFF, Rd = 0x80000000
+ * * Rs1 = 0x80000000, Rs2 = 0x40000000, Rd = 0xA0000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Rd[31:0] = (Rs1[31:0] - Rs2[31:0]) s>> 1;
+ * RV64:
+ * resw[31:0] = (Rs1[31:0] - Rs2[31:0]) s>> 1;
+ * Rd[63:0] = SE(resw[31:0]);
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_RSUBW(int a, int b)
+{
+    register long halved; /* 32-bit signed (a - b) s>> 1, sign-extended on RV64 */
+    __ASM volatile("rsubw %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.92. RSUBW ===== */
+
+/* ===== Inline Function Start for 3.93. SCLIP8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief SCLIP8 (SIMD 8-bit Signed Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCLIP8 Rd, Rs1, imm3u[2:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 8-bit signed integer elements of a register into a signed range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 8-bit signed integer elements stored in Rs1 into a signed
+ * integer range between 2^imm3u-1 and -2^imm3u, and writes the limited results to Rd. For example, if
+ * imm3u is 3, the 8-bit input values should be saturated between 7 and -8. If saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.B[x];
+ * if (src > (2^imm3u)-1) {
+ *   src = (2^imm3u)-1;
+ *   OV = 1;
+ * } else if (src < -2^imm3u) {
+ *   src = -2^imm3u;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = src
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int constant, encoded as the immediate imm3u[2:0] that selects the clip range
+ * \return value stored in unsigned long type
+ */
+#define __RV_SCLIP8(a, b)    \
+    ({ /* statement expression: evaluates to the clipped value (Rd) */    \
+        register unsigned long __a = (unsigned long)(a); /* read (a) before `result` is declared, so a caller variable named `result` is not shadowed */    \
+        register unsigned long result;    \
+        __ASM volatile("sclip8 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); /* K constraint: b is emitted as an immediate */    \
+        result;    \
+    })
+/* ===== Inline Function End for 3.93. SCLIP8 ===== */
+
+/* ===== Inline Function Start for 3.94. SCLIP16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief SCLIP16 (SIMD 16-bit Signed Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCLIP16 Rd, Rs1, imm4u[3:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 16-bit signed integer elements of a register into a signed range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 16-bit signed integer elements stored in Rs1 into a signed
+ * integer range between 2^imm4u-1 and -2^imm4u, and writes the limited results to Rd. For example, if
+ * imm4u is 3, the 16-bit input values should be saturated between 7 and -8. If saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.H[x];
+ * if (src > (2^imm4u)-1) {
+ *   src = (2^imm4u)-1;
+ *   OV = 1;
+ * } else if (src < -2^imm4u) {
+ *   src = -2^imm4u;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = src
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int constant, encoded as the immediate imm4u[3:0] that selects the clip range
+ * \return value stored in unsigned long type
+ */
+#define __RV_SCLIP16(a, b)    \
+    ({ /* statement expression: evaluates to the clipped value (Rd) */    \
+        register unsigned long __a = (unsigned long)(a); /* read (a) before `result` is declared, so a caller variable named `result` is not shadowed */    \
+        register unsigned long result;    \
+        __ASM volatile("sclip16 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); /* K constraint: b is emitted as an immediate */    \
+        result;    \
+    })
+/* ===== Inline Function End for 3.94. SCLIP16 ===== */
+
+/* ===== Inline Function Start for 3.95. SCLIP32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief SCLIP32 (SIMD 32-bit Signed Clip Value)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCLIP32 Rd, Rs1, imm5u[4:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 32-bit signed integer elements of a register into a signed range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 32-bit signed integer elements stored in Rs1 into a signed
+ * integer range between 2^imm5u-1 and -2^imm5u, and writes the limited results to Rd. For example, if
+ * imm5u is 3, the 32-bit input values should be saturated between 7 and -8. If saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.W[x];
+ * if (src > (2^imm5u)-1) {
+ *   src = (2^imm5u)-1;
+ *   OV = 1;
+ * } else if (src < -2^imm5u) {
+ *   src = -2^imm5u;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = src
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int constant, encoded as the immediate imm5u[4:0] that selects the clip range
+ * \return value stored in long type
+ */
+#define __RV_SCLIP32(a, b)    \
+    ({ /* statement expression: evaluates to the clipped value (Rd) */    \
+        register long __a = (long)(a); /* read (a) before `result` is declared, so a caller variable named `result` is not shadowed */    \
+        register long result;    \
+        __ASM volatile("sclip32 %0, %1, %2" : "=r"(result) : "r"(__a), "K"(b)); /* K constraint: b is emitted as an immediate */    \
+        result;    \
+    })
+/* ===== Inline Function End for 3.95. SCLIP32 ===== */
+
+/* ===== Inline Function Start for 3.96. SCMPLE8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief SCMPLE8 (SIMD 8-bit Signed Compare Less Than & Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCMPLE8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer elements less than & equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
+ * signed integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it is
+ * true, the result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] {le} Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SCMPLE8(unsigned long a, unsigned long b)
+{
+    register unsigned long mask; /* per byte lane: 0xff if signed a.B[x] <= b.B[x], else 0x0 */
+    __ASM volatile("scmple8 %0, %1, %2" : "=r"(mask) : "r"(a), "r"(b));
+    return mask;
+}
+/* ===== Inline Function End for 3.96. SCMPLE8 ===== */
+
+/* ===== Inline Function Start for 3.97. SCMPLE16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief SCMPLE16 (SIMD 16-bit Signed Compare Less Than & Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCMPLE16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements less than & equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
+ * signed integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it is
+ * true, the result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] {le} Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SCMPLE16(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* written by the instruction: one compare result per 16-bit element */
+    __ASM volatile("scmple16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.97. SCMPLE16 ===== */
+
+/* ===== Inline Function Start for 3.98. SCMPLT8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief SCMPLT8 (SIMD 8-bit Signed Compare Less Than)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCMPLT8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer elements less than comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
+ * signed integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
+ * result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] < Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SCMPLT8(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* written by the instruction: one compare result per 8-bit element */
+    __ASM volatile("scmplt8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.98. SCMPLT8 ===== */
+
+/* ===== Inline Function Start for 3.99. SCMPLT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief SCMPLT16 (SIMD 16-bit Signed Compare Less Than)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCMPLT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements less than comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
+ * signed integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
+ * result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] < Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SCMPLT16(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* written by the instruction: one compare result per 16-bit element */
+    __ASM volatile("scmplt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.99. SCMPLT16 ===== */
+
+/* ===== Inline Function Start for 3.100. SLL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SLL8 (SIMD 8-bit Shift Left Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLL8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left shift operations simultaneously. The shift amount is a
+ * variable from a GPR.
+ *
+ * **Description**:\n
+ * The 8-bit elements in Rs1 are left-shifted logically. And the results are written to Rd.
+ * The vacated low-order bits are filled with zero and the shift amount is specified by the low-order 3 bits of
+ * the value in the Rs2 register.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * Rd.B[x] = Rs1.B[x] << sa;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SLL8(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* written by the instruction: the left-shifted 8-bit elements */
+    __ASM volatile("sll8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.100. SLL8 ===== */
+
+/* ===== Inline Function Start for 3.101. SLLI8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SLLI8 (SIMD 8-bit Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLLI8 Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left shift operations simultaneously. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * The 8-bit elements in Rs1 are left-shifted logically. And the results are written to Rd.
+ * The vacated low-order bits are filled with zero and the shift amount is specified by the imm3u constant.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * Rd.B[x] = Rs1.B[x] << sa;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b; must be a compile-time constant (the imm3u operand)
+ * \return value stored in unsigned long type
+ */
+#define __RV_SLLI8(a, b)    \
+    ({    \
+        register unsigned long __a = (unsigned long)(a); /* read (a) first so __rd cannot shadow it */    \
+        register unsigned long __rd; /* output temp; not named 'result' to avoid capturing caller names */    \
+        __ASM volatile("slli8 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b)); /* "K": b is an immediate */    \
+        __rd;    \
+    })
+/* ===== Inline Function End for 3.101. SLLI8 ===== */
+
+/* ===== Inline Function Start for 3.102. SLL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SLL16 (SIMD 16-bit Shift Left Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLL16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left shift operations simultaneously. The shift amount is a
+ * variable from a GPR.
+ *
+ * **Description**:\n
+ * The 16-bit elements in Rs1 are left-shifted logically. And the results are written to Rd.
+ * The vacated low-order bits are filled with zero and the shift amount is specified by the low-order 4 bits of
+ * the value in the Rs2 register.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * Rd.H[x] = Rs1.H[x] << sa;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SLL16(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* written by the instruction: the left-shifted 16-bit elements */
+    __ASM volatile("sll16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.102. SLL16 ===== */
+
+/* ===== Inline Function Start for 3.103. SLLI16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SLLI16 (SIMD 16-bit Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLLI16 Rd, Rs1, imm4[3:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit element logical left shift operations simultaneously. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * The 16-bit elements in Rs1 are left-shifted logically. The vacated low-order bits are filled with
+ * zero and the shift amount is specified by the imm4[3:0] constant. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4[3:0];
+ * Rd.H[x] = Rs1.H[x] << sa;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b; must be a compile-time constant (the imm4 operand)
+ * \return value stored in unsigned long type
+ */
+#define __RV_SLLI16(a, b)    \
+    ({    \
+        register unsigned long __a = (unsigned long)(a); /* read (a) first so __rd cannot shadow it */    \
+        register unsigned long __rd; /* output temp; not named 'result' to avoid capturing caller names */    \
+        __ASM volatile("slli16 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b)); /* "K": b is an immediate */    \
+        __rd;    \
+    })
+/* ===== Inline Function End for 3.103. SLLI16 ===== */
+
+/* ===== Inline Function Start for 3.104. SMAL ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMAL (Signed Multiply Halfs & Add 64-bit)
+ * \details
+ * **Type**: Partial-SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAL Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed bottom 16-bit content of the 32-bit elements of a register with the top
+ * 16-bit content of the same 32-bit elements of the same register, and add the results with a 64-bit
+ * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
+ * to another even/odd pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the bottom 16-bit content of the lower 32-bit of Rs2 with the top 16-bit
+ * content of the lower 32-bit of Rs2 and adds the result with the 64-bit value of an even/odd pair of
+ * registers specified by Rs1(4,1). The 64-bit addition result is written back to an even/odd pair of
+ * registers specified by Rd(4,1). The 16-bit values of Rs2, and the 64-bit value of the Rs1(4,1) register-
+ * pair are treated as signed integers.
+ * Rx(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the bottom 16-bit content of the 32-bit elements of Rs2 with the top 16-bit
+ * content of the same 32-bit elements of Rs2 and adds the results with the 64-bit value of Rs1. The 64-
+ * bit addition result is written back to Rd. The 16-bit values of Rs2, and the 64-bit value of Rs1 are
+ * treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = Rs2.H[1] * Rs2.H[0];
+ * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
+ * Idx2 = CONCAT(Rd(4,1),1'b0); Idx3 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx3].R[Idx2] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * Mres[0][31:0] = Rs2.W[0].H[1] * Rs2.W[0].H[0];
+ * Mres[1][31:0] = Rs2.W[1].H[1] * Rs2.W[1].H[0];
+ * Rd = Rs1 + SE64(Mres[1][31:0]) + SE64(Mres[0][31:0]);
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMAL(long long a, unsigned long b)
+{
+    register long long rd; /* written by the instruction: the 64-bit addition result */
+    __ASM volatile("smal %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.104. SMAL ===== */
+
+/* ===== Inline Function Start for 3.105.1. SMALBB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALBB (Signed Multiply Bottom Halfs & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALBB Rd, Rs1, Rs2
+ * SMALBT Rd, Rs1, Rs2
+ * SMALTT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
+ * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
+ * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
+ * to the register-pair (RV32) or the register (RV64).
+ * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
+ * * SMALBT: rt pair + bottom*top (all 32-bit elements)
+ * * SMALTT: rt pair + top*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2.
+ * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
+ * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * // SMALBB
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
+ * // SMALBT
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
+ * // SMALTT
+ * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALBB(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalbb %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r": t is read as the accumulator and overwritten with the result */
+}
+/* ===== Inline Function End for 3.105.1. SMALBB ===== */
+
+/* ===== Inline Function Start for 3.105.2. SMALBT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALBT (Signed Multiply Bottom Half & Top Half & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALBB Rd, Rs1, Rs2
+ * SMALBT Rd, Rs1, Rs2
+ * SMALTT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
+ * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
+ * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
+ * to the register-pair (RV32) or the register (RV64).
+ * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
+ * * SMALBT: rt pair + bottom*top (all 32-bit elements)
+ * * SMALTT: rt pair + top*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2.
+ * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
+ * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * // SMALBB
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
+ * // SMALBT
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
+ * // SMALTT
+ * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALBT(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalbt %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r": t is read as the accumulator and overwritten with the result */
+}
+/* ===== Inline Function End for 3.105.2. SMALBT ===== */
+
+/* ===== Inline Function Start for 3.105.3. SMALTT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALTT (Signed Multiply Top Halfs & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALBB Rd, Rs1, Rs2
+ * SMALBT Rd, Rs1, Rs2
+ * SMALTT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
+ * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
+ * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
+ * to the register-pair (RV32) or the register (RV64).
+ * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
+ * * SMALBT: rt pair + bottom*top (all 32-bit elements)
+ * * SMALTT: rt pair + top*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2.
+ * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
+ * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * // SMALBB
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
+ * // SMALBT
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
+ * // SMALTT
+ * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALTT(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smaltt %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r": t is read as the accumulator and overwritten with the result */
+}
+/* ===== Inline Function End for 3.105.3. SMALTT ===== */
+
+/* ===== Inline Function Start for 3.106.1. SMALDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALDA (Signed Multiply Two Halfs and Two Adds 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDA Rd, Rs1, Rs2
+ * SMALXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * adds the two 32-bit results and the 64-bit value of an even/odd pair of registers together.
+ * * SMALDA: rt pair + top*top + bottom*bottom (all 32-bit elements)
+ * * SMALXDA: rt pair + top*bottom + bottom*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with
+ * the top 16-bit content of Rs2 with unlimited precision.
+ * For the `SMALXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1
+ * with the top 16-bit content of Rs2 with unlimited precision.
+ * The result is added to the 64-bit value of an even/odd pair of registers specified by Rd(4,1). The 64-
+ * bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 64-
+ * bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
+ * bit elements of Rs2 with unlimited precision.
+ * For the `SMALXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2 with unlimited precision.
+ * The results are added to the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The
+ * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * // SMALDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
+ * // SMALXDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres0[31:0]) + SE64(Mres1[31:0]);
+ * RV64:
+ * // SMALDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
+ * Rd = Rd + SE64(Mres0[0][31:0]) + SE64(Mres1[0][31:0]) + SE64(Mres0[1][31:0]) +
+ * SE64(Mres1[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALDA(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalda %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r": t is read as the accumulator and overwritten with the result */
+}
+/* ===== Inline Function End for 3.106.1. SMALDA ===== */
+
+/* ===== Inline Function Start for 3.106.2. SMALXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALXDA (Signed Crossed Multiply Two Halfs and Two Adds 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDA Rd, Rs1, Rs2
+ * SMALXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * adds the two 32-bit results and the 64-bit value of an even/odd pair of registers together.
+ * * SMALDA: rt pair + top*top + bottom*bottom (all 32-bit elements)
+ * * SMALXDA: rt pair + top*bottom + bottom*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with
+ * the top 16-bit content of Rs2 with unlimited precision.
+ * For the `SMALXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1
+ * with the top 16-bit content of Rs2 with unlimited precision.
+ * The result is added to the 64-bit value of an even/odd pair of registers specified by Rd(4,1). The 64-
+ * bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 64-
+ * bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
+ * bit elements of Rs2 with unlimited precision.
+ * For the `SMALXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2 with unlimited precision.
+ * The results are added to the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The
+ * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * // SMALDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
+ * // SMALXDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres0[31:0]) + SE64(Mres1[31:0]);
+ * RV64:
+ * // SMALDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
+ * Rd = Rd + SE64(Mres0[0][31:0]) + SE64(Mres1[0][31:0]) + SE64(Mres0[1][31:0]) +
+ * SE64(Mres1[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALXDA(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalxda %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r": t is read as the accumulator and overwritten with the result */
+}
+/* ===== Inline Function End for 3.106.2. SMALXDA ===== */
+
+/* ===== Inline Function Start for 3.107.1. SMALDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALDS (Signed Multiply Two Halfs & Subtract & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDS Rd, Rs1, Rs2
+ * SMALDRS Rd, Rs1, Rs2
+ * SMALXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
+ * written back to the register-pair.
+ * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
+ * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
+ * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the top 16-bit content of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
+ * with the bottom 16-bit content of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the bottom 16-bit content of Rs2.
+ * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
+ * of the 32-bit elements of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
+ * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * * RV64:
+ * // SMALDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * // SMALDRS
+ * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALDS(long long t, unsigned long a, unsigned long b)
+{
+    /* acc starts as t and is updated in place by the instruction ("+r" = read-write operand). */
+    long long acc = t;
+
+    __ASM volatile(
+        "smalds %0, %1, %2"
+        : "+r"(acc)         /* %0: accumulator, read and written */
+        : "r"(a), "r"(b)    /* %1, %2: source operands */
+    );
+    return acc;
+}
+/* ===== Inline Function End for 3.107.1. SMALDS ===== */
+
+/* ===== Inline Function Start for 3.107.2. SMALDRS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALDRS (Signed Multiply Two Halfs & Reverse Subtract & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDS Rd, Rs1, Rs2
+ * SMALDRS Rd, Rs1, Rs2
+ * SMALXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
+ * written back to the register-pair.
+ * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
+ * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
+ * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the top 16-bit content of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
+ * with the bottom 16-bit content of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the bottom 16-bit content of Rs2.
+ * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
+ * of the 32-bit elements of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
+ * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * * RV64:
+ * // SMALDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * // SMALDRS
+ * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALDRS(long long t, unsigned long a, unsigned long b)
+{
+    /* acc starts as t and is updated in place by the instruction ("+r" = read-write operand). */
+    long long acc = t;
+
+    __ASM volatile(
+        "smaldrs %0, %1, %2"
+        : "+r"(acc)         /* %0: accumulator, read and written */
+        : "r"(a), "r"(b)    /* %1, %2: source operands */
+    );
+    return acc;
+}
+/* ===== Inline Function End for 3.107.2. SMALDRS ===== */
+
+/* ===== Inline Function Start for 3.107.3. SMALXDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALXDS (Signed Crossed Multiply Two Halfs & Subtract & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDS Rd, Rs1, Rs2
+ * SMALDRS Rd, Rs1, Rs2
+ * SMALXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
+ * written back to the register-pair.
+ * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
+ * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
+ * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the top 16-bit content of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
+ * with the bottom 16-bit content of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the bottom 16-bit content of Rs2.
+ * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
+ * of the 32-bit elements of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
+ * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * * RV64:
+ * // SMALDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * // SMALDRS
+ * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALXDS(long long t, unsigned long a, unsigned long b)
+{
+    /* acc starts as t and is updated in place by the instruction ("+r" = read-write operand). */
+    long long acc = t;
+
+    __ASM volatile(
+        "smalxds %0, %1, %2"
+        : "+r"(acc)         /* %0: accumulator, read and written */
+        : "r"(a), "r"(b)    /* %1, %2: source operands */
+    );
+    return acc;
+}
+/* ===== Inline Function End for 3.107.3. SMALXDS ===== */
+
+/* ===== Inline Function Start for 3.108. SMAR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief SMAR64 (Signed Multiply and Add to 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed elements in two registers and add the 64-bit multiplication
+ * result to the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is written
+ * back to the pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It adds
+ * the 64-bit multiplication result to the 64-bit signed data of an even/odd pair of registers specified by
+ * Rd(4,1). The addition result is written back to the even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
+ * adds the 64-bit multiplication results to the 64-bit signed data of Rd. The addition result is written
+ * back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].R[t_L] = R[t_H].R[t_L] + (Rs1 * Rs2);
+ * * RV64:
+ * Rd = Rd + (Rs1.W[0] * Rs2.W[0]) + (Rs1.W[1] * Rs2.W[1]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMAR64(long long t, long a, long b)
+{
+    /* acc starts as t and is updated in place by the instruction ("+r" = read-write operand). */
+    long long acc = t;
+
+    __ASM volatile(
+        "smar64 %0, %1, %2"
+        : "+r"(acc)         /* %0: accumulator, read and written */
+        : "r"(a), "r"(b)    /* %1, %2: signed source operands */
+    );
+    return acc;
+}
+/* ===== Inline Function End for 3.108. SMAR64 ===== */
+
+/* ===== Inline Function Start for 3.109. SMAQA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
+ * \brief SMAQA (Signed Multiply Four Bytes with 32-bit Adds)
+ * \details
+ * **Type**: Partial-SIMD (Reduction)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAQA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do four signed 8-bit multiplications from 32-bit chunks of two registers; and then add
+ * the four 16-bit results and the content of the corresponding 32-bit chunks of a third register together.
+ *
+ * **Description**:\n
+ * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four
+ * signed 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the signed
+ * content of the corresponding 32-bit chunks of Rd. The final results are written back to the
+ * corresponding 32-bit chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] +
+ *    (Rs1.W[x].B[3] s* Rs2.W[x].B[3]) + (Rs1.W[x].B[2] s* Rs2.W[x].B[2]) +
+ *    (Rs1.W[x].B[1] s* Rs2.W[x].B[1]) + (Rs1.W[x].B[0] s* Rs2.W[x].B[0]);
+ * Rd.W[x] = res[x];
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMAQA(long t, unsigned long a, unsigned long b)
+{
+    /* acc starts as t and is updated in place by the instruction ("+r" = read-write operand). */
+    long acc = t;
+
+    __ASM volatile(
+        "smaqa %0, %1, %2"
+        : "+r"(acc)         /* %0: accumulator, read and written */
+        : "r"(a), "r"(b)    /* %1, %2: source operands */
+    );
+    return acc;
+}
+/* ===== Inline Function End for 3.109. SMAQA ===== */
+
+/* ===== Inline Function Start for 3.110. SMAQA.SU ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
+ * \brief SMAQA.SU (Signed and Unsigned Multiply Four Bytes with 32-bit Adds)
+ * \details
+ * **Type**: Partial-SIMD (Reduction)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAQA.SU Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do four `signed x unsigned` 8-bit multiplications from 32-bit chunks of two registers; and
+ * then add the four 16-bit results and the content of the corresponding 32-bit chunks of a third
+ * register together.
+ *
+ * **Description**:\n
+ * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four
+ * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the
+ * signed content of the corresponding 32-bit chunks of Rd. The final results are written back to the
+ * corresponding 32-bit chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] +
+ *    (Rs1.W[x].B[3] su* Rs2.W[x].B[3]) + (Rs1.W[x].B[2] su* Rs2.W[x].B[2]) +
+ *    (Rs1.W[x].B[1] su* Rs2.W[x].B[1]) + (Rs1.W[x].B[0] su* Rs2.W[x].B[0]);
+ * Rd.W[x] = res[x];
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMAQA_SU(long t, unsigned long a, unsigned long b)
+{
+    /* acc starts as t and is updated in place by the instruction ("+r" = read-write operand). */
+    long acc = t;
+
+    __ASM volatile(
+        "smaqa.su %0, %1, %2"
+        : "+r"(acc)         /* %0: accumulator, read and written */
+        : "r"(a), "r"(b)    /* %1, %2: source operands */
+    );
+    return acc;
+}
+/* ===== Inline Function End for 3.110. SMAQA.SU ===== */
+
+/* ===== Inline Function Start for 3.111. SMAX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief SMAX8 (SIMD 8-bit Signed Maximum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
+ * signed integer elements in Rs2 and selects the greater one of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] > Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMAX8(unsigned long a, unsigned long b)
+{
+    /* Written by the instruction through the "=r" output operand (%0); a and b are the inputs (%1, %2). */
+    unsigned long result;
+    __ASM volatile("smax8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.111. SMAX8 ===== */
+
+/* ===== Inline Function Start for 3.112. SMAX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief SMAX16 (SIMD 16-bit Signed Maximum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
+ * signed integer elements in Rs2 and selects the greater one of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] > Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMAX16(unsigned long a, unsigned long b)
+{
+    /* Written by the instruction through the "=r" output operand (%0); a and b are the inputs (%1, %2). */
+    unsigned long result;
+    __ASM volatile("smax16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.112. SMAX16 ===== */
+
+/* ===== Inline Function Start for 3.113.1. SMBB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMBB16 (SIMD Signed Multiply Bottom Half & Bottom Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB16 Rd, Rs1, Rs2
+ * SMBT16 Rd, Rs1, Rs2
+ * SMTT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
+ * 16-bit content of the 32-bit elements of another register and write the result to a third register.
+ * * SMBB16: W[x].bottom * W[x].bottom
+ * * SMBT16: W[x].bottom * W[x].top
+ * * SMTT16: W[x].top * W[x].top
+ *
+ * **Description**:\n
+ * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
+ * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMBB16(unsigned long a, unsigned long b)
+{
+    /* Written by the instruction through the "=r" output operand (%0); a and b are the inputs (%1, %2). */
+    long result;
+    __ASM volatile("smbb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.113.1. SMBB16 ===== */
+
+/* ===== Inline Function Start for 3.113.2. SMBT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMBT16 (SIMD Signed Multiply Bottom Half & Top Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB16 Rd, Rs1, Rs2
+ * SMBT16 Rd, Rs1, Rs2
+ * SMTT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
+ * 16-bit content of the 32-bit elements of another register and write the result to a third register.
+ * * SMBB16: W[x].bottom * W[x].bottom
+ * * SMBT16: W[x].bottom * W[x].top
+ * * SMTT16: W[x].top * W[x].top
+ *
+ * **Description**:\n
+ * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
+ * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMBT16(unsigned long a, unsigned long b)
+{
+    /* Written by the instruction through the "=r" output operand (%0); a and b are the inputs (%1, %2). */
+    long result;
+    __ASM volatile("smbt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.113.2. SMBT16 ===== */
+
+/* ===== Inline Function Start for 3.113.3. SMTT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMTT16 (SIMD Signed Multiply Top Half & Top Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB16 Rd, Rs1, Rs2
+ * SMBT16 Rd, Rs1, Rs2
+ * SMTT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
+ * 16-bit content of the 32-bit elements of another register and write the result to a third register.
+ * * SMBB16: W[x].bottom * W[x].bottom
+ * * SMBT16: W[x].bottom * W[x].top
+ * * SMTT16: W[x].top * W[x].top
+ *
+ * **Description**:\n
+ * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
+ * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMTT16(unsigned long a, unsigned long b)
+{
+    /* Written by the instruction through the "=r" output operand (%0); a and b are the inputs (%1, %2). */
+    long result;
+    __ASM volatile("smtt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.113.3. SMTT16 ===== */
+
+/* ===== Inline Function Start for 3.114.1. SMDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMDS (SIMD Signed Multiply Two Halfs and Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS Rd, Rs1, Rs2
+ * SMDRS Rd, Rs1, Rs2
+ * SMXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results.
+ * * SMDS: top*top - bottom*bottom (per 32-bit element)
+ * * SMDRS: bottom*bottom - top*top (per 32-bit element)
+ * * SMXDS: top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
+ * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2.
+ * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
+ * multiplication are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * SMDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * * SMDRS:
+ * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * * SMXDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMDS(unsigned long a, unsigned long b)
+{
+    /* Written by the instruction through the "=r" output operand (%0); a and b are the inputs (%1, %2). */
+    long result;
+    __ASM volatile("smds %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.114.1. SMDS ===== */
+
+/* ===== Inline Function Start for 3.114.2. SMDRS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMDRS (SIMD Signed Multiply Two Halfs and Reverse Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS Rd, Rs1, Rs2
+ * SMDRS Rd, Rs1, Rs2
+ * SMXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results.
+ * * SMDS: top*top - bottom*bottom (per 32-bit element)
+ * * SMDRS: bottom*bottom - top*top (per 32-bit element)
+ * * SMXDS: top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
+ * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2.
+ * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
+ * multiplication are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * SMDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * * SMDRS:
+ * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * * SMXDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMDRS(unsigned long a, unsigned long b)
+{
+    long rd; /* per 32-bit word: bottom*bottom - top*top, produced by smdrs */
+    __ASM volatile("smdrs %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.114.2. SMDRS ===== */
+
+/* ===== Inline Function Start for 3.114.3. SMXDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMXDS (SIMD Signed Crossed Multiply Two Halfs and Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS Rd, Rs1, Rs2
+ * SMDRS Rd, Rs1, Rs2
+ * SMXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results.
+ * * SMDS: top*top - bottom*bottom (per 32-bit element)
+ * * SMDRS: bottom*bottom - top*top (per 32-bit element)
+ * * SMXDS: top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
+ * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2.
+ * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
+ * multiplication are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * SMDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * * SMDRS:
+ * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * * SMXDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMXDS(unsigned long a, unsigned long b)
+{
+    long rd; /* per 32-bit word: a.top*b.bottom - a.bottom*b.top, produced by smxds */
+    __ASM volatile("smxds %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.114.3. SMXDS ===== */
+
+/* ===== Inline Function Start for 3.115. SMIN8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief SMIN8 (SIMD 8-bit Signed Minimum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMIN8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
+ * signed integer elements in Rs2 and selects the number that is less than the other one. The selected
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] < Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMIN8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* byte-wise signed minimum of a and b, produced by smin8 */
+    __ASM volatile("smin8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.115. SMIN8 ===== */
+
+/* ===== Inline Function Start for 3.116. SMIN16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief SMIN16 (SIMD 16-bit Signed Minimum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMIN16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
+ * signed integer elements in Rs2 and selects the number that is less than the other one. The selected
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] < Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMIN16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* halfword-wise signed minimum of a and b, produced by smin16 */
+    __ASM volatile("smin16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.116. SMIN16 ===== */
+
+/* ===== Inline Function Start for 3.117.1. SMMUL ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief SMMUL (SIMD MSW Signed Multiply Word)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMUL Rd, Rs1, Rs2
+ * SMMUL.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed integer elements of two registers and write the most significant
+ * 32-bit results to the corresponding 32-bit elements of a register. The `.u` form performs an
+ * additional rounding up operation on the multiplication results before taking the most significant
+ * 32-bit part of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the
+ * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit
+ * elements of Rs1 and Rs2 are treated as signed integers. The `.u` form of the instruction rounds up
+ * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results.
+ * * For `smmul/RV32` instruction, it is an alias to `mulh/RV32` instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][63:32];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMUL(long a, long b)
+{
+    long rd; /* most significant 32 bits of each signed 32x32 product, from smmul */
+    __ASM volatile("smmul %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.117.1. SMMUL ===== */
+
+/* ===== Inline Function Start for 3.117.2. SMMUL.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief SMMUL.u (SIMD MSW Signed Multiply Word with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMUL Rd, Rs1, Rs2
+ * SMMUL.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed integer elements of two registers and write the most significant
+ * 32-bit results to the corresponding 32-bit elements of a register. The `.u` form performs an
+ * additional rounding up operation on the multiplication results before taking the most significant
+ * 32-bit part of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the
+ * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit
+ * elements of Rs1 and Rs2 are treated as signed integers. The `.u` form of the instruction rounds up
+ * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results.
+ * * For `smmul/RV32` instruction, it is an alias to `mulh/RV32` instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][63:32];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMUL_U(long a, long b)
+{
+    long rd; /* rounded most significant 32 bits of each signed 32x32 product, from smmul.u */
+    __ASM volatile("smmul.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.117.2. SMMUL.u ===== */
+
+/* ===== Inline Function Start for 3.118.1. SMMWB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief SMMWB (SIMD MSW Signed Multiply Word and Bottom Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMWB Rd, Rs1, Rs2
+ * SMMWB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
+ * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
+ * significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
+ * of the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
+ * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
+ * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][47:16];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMWB(long a, unsigned long b)
+{
+    long rd; /* upper 32 bits of each 48-bit product: word of a * bottom half of b (smmwb) */
+    __ASM volatile("smmwb %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.118.1. SMMWB ===== */
+
+/* ===== Inline Function Start for 3.118.2. SMMWB.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief SMMWB.u (SIMD MSW Signed Multiply Word and Bottom Half with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMWB Rd, Rs1, Rs2
+ * SMMWB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
+ * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
+ * significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
+ * of the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
+ * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
+ * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][47:16];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMWB_U(long a, unsigned long b)
+{
+    long rd; /* rounded upper 32 bits of each 48-bit product: word of a * bottom half of b */
+    __ASM volatile("smmwb.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.118.2. SMMWB.u ===== */
+
+/* ===== Inline Function Start for 3.119.1. SMMWT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief SMMWT (SIMD MSW Signed Multiply Word and Top Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMWT Rd, Rs1, Rs2
+ * SMMWT.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
+ * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
+ * significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the top signed 16-bit content of
+ * the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
+ * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
+ * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][47:16];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMWT(long a, unsigned long b)
+{
+    long rd; /* upper 32 bits of each 48-bit product: word of a * top half of b (smmwt) */
+    __ASM volatile("smmwt %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.119.1. SMMWT ===== */
+
+/* ===== Inline Function Start for 3.119.2. SMMWT.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief SMMWT.u (SIMD MSW Signed Multiply Word and Top Half with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMWT Rd, Rs1, Rs2
+ * SMMWT.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
+ * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
+ * significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the top signed 16-bit content of
+ * the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
+ * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
+ * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][47:16];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMWT_U(long a, unsigned long b)
+{
+    long rd; /* rounded upper 32 bits of each 48-bit product: word of a * top half of b */
+    __ASM volatile("smmwt.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.119.2. SMMWT.u ===== */
+
+/* ===== Inline Function Start for 3.120.1. SMSLDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMSLDA (Signed Multiply Two Halfs & Add & Subtract 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMSLDA Rd, Rs1, Rs2
+ * SMSLXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers, and then
+ * subtract the two 32-bit results from the 64-bit value of an even/odd pair of registers (RV32) or a
+ * register (RV64). The subtraction result is written back to the register-pair.
+ * * SMSLDA: rd pair - top*top - bottom*bottom (all 32-bit elements)
+ * * SMSLXDA: rd pair - top*bottom - bottom*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2.
+ * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2.
+ * The two multiplication results are subtracted from the 64-bit value of an even/odd pair of registers
+ * specified by Rd(4,1). The 64-bit subtraction result is written back to the register-pair. The 16-bit
+ * values of Rs1 and Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the bottom 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * The four multiplication results are subtracted from the 64-bit value of Rd. The 64-bit subtraction
+ * result is written back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * // SMSLDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
+ * // SMSLXDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] - SE64(Mres0[31:0]) - SE64(Mres1[31:0]);
+ * * RV64:
+ * // SMSLDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMSLXDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
+ * Rd = Rd - SE64(Mres0[0][31:0]) - SE64(Mres1[0][31:0]) - SE64(Mres0[1][31:0]) -
+ * SE64(Mres1[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMSLDA(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smslda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* t is a "+r" (read-write) operand: smslda consumes it and updates it in place */
+}
+/* ===== Inline Function End for 3.120.1. SMSLDA ===== */
+
+/* ===== Inline Function Start for 3.120.2. SMSLXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMSLXDA (Signed Crossed Multiply Two Halfs & Add & Subtract 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMSLDA Rd, Rs1, Rs2
+ * SMSLXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers, and then
+ * subtract the two 32-bit results from the 64-bit value of an even/odd pair of registers (RV32) or a
+ * register (RV64). The subtraction result is written back to the register-pair.
+ * * SMSLDA: rd pair - top*top - bottom*bottom (all 32-bit elements)
+ * * SMSLXDA: rd pair - top*bottom - bottom*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2.
+ * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2.
+ * The two multiplication results are subtracted from the 64-bit value of an even/odd pair of registers
+ * specified by Rd(4,1). The 64-bit subtraction result is written back to the register-pair. The 16-bit
+ * values of Rs1 and Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the bottom 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * The four multiplication results are subtracted from the 64-bit value of Rd. The 64-bit subtraction
+ * result is written back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * // SMSLDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
+ * // SMSLXDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] - SE64(Mres0[31:0]) - SE64(Mres1[31:0]);
+ * * RV64:
+ * // SMSLDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMSLXDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
+ * Rd = Rd - SE64(Mres0[0][31:0]) - SE64(Mres1[0][31:0]) - SE64(Mres0[1][31:0]) -
+ * SE64(Mres1[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMSLXDA(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smslxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* t is a "+r" (read-write) operand: smslxda consumes it and updates it in place */
+}
+/* ===== Inline Function End for 3.120.2. SMSLXDA ===== */
+
+/* ===== Inline Function Start for 3.121. SMSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief SMSR64 (Signed Multiply and Subtract from 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed elements in two registers and subtract the 64-bit multiplication
+ * results from the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
+ * written back to the pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication result from the 64-bit signed data of an even/odd pair of registers
+ * specified by Rd(4,1). The subtraction result is written back to the even/odd pair of registers
+ * specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication results from the 64-bit signed data of Rd. The subtraction result is
+ * written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].R[t_L] = R[t_H].R[t_L] - (Rs1 * Rs2);
+ * * RV64:
+ * Rd = Rd - (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMSR64(long long t, long a, long b)
+{
+    __ASM volatile("smsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* t is a "+r" (read-write) operand: smsr64 consumes it and updates it in place */
+}
+/* ===== Inline Function End for 3.121. SMSR64 ===== */
+
+/* ===== Inline Function Start for 3.122.1. SMUL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief SMUL8 (SIMD Signed 8-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMUL8 Rd, Rs1, Rs2
+ * SMULX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do signed 8-bit multiplications and generate four 16-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
+ * corresponding 8-bit data elements of Rs2.
+ * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
+ * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
+ * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
+ * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
+ * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
+ * part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
+ * corresponding 8-bit data elements of Rs2.
+ * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
+ * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
+ * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
+ * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
+ * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
+ * the bottom part of Rs1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `SMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `SMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] s* op2t[x/2];
+ * resb[x/2] = op1b[x/2] s* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
+ * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
+ * x = 0 and 2
+ * * RV64:
+ * if (is `SMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `SMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] s* op2t[x/2];
+ * resb[x/2] = op1b[x/2] s* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
+ * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0];
+ * x = 0 and 2
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SMUL8(unsigned int a, unsigned int b)
+{
+    unsigned long long rd; /* four 16-bit signed products of the matching bytes of a and b */
+    __ASM volatile("smul8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.122.1. SMUL8 ===== */
+
+/* ===== Inline Function Start for 3.122.2. SMULX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief SMULX8 (SIMD Signed Crossed 8-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMUL8 Rd, Rs1, Rs2
+ * SMULX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do signed 8-bit multiplications and generate four 16-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
+ * corresponding 8-bit data elements of Rs2.
+ * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
+ * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
+ * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
+ * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
+ * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
+ * part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
+ * corresponding 8-bit data elements of Rs2.
+ * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
+ * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
+ * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
+ * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
+ * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
+ * the bottom part of Rs1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `SMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `SMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] s* op2t[x/2];
+ * resb[x/2] = op1b[x/2] s* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
+ * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
+ * x = 0 and 2
+ * * RV64:
+ * if (is `SMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `SMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] s* op2t[x/2];
+ * resb[x/2] = op1b[x/2] s* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
+ * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0];
+ * x = 0 and 2
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SMULX8(unsigned int a, unsigned int b)
+{
+    register unsigned long long products; /* 64-bit value holding the four crossed 16-bit results */
+    __ASM volatile("smulx8 %0, %1, %2" : "=r"(products) : "r"(a), "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.122.2. SMULX8 ===== */
+
+/* ===== Inline Function Start for 3.123.1. SMUL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief SMUL16 (SIMD Signed 16-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMUL16 Rd, Rs1, Rs2
+ * SMULX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do signed 16-bit multiplications and generate two 32-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of Rs1 with
+ * the top 16-bit Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1
+ * with the bottom 16-bit Q15 content of Rs2.
+ * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of Rs1 with the bottom 16-bit
+ * Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 with the top 16-
+ * bit Q15 content of Rs2.
+ * The two Q30 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
+ * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
+ * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of the lower
+ * 32-bit word in Rs1 with the top 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time,
+ * multiply the bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the bottom 16-bit Q15
+ * content of the lower 32-bit word in Rs2.
+ * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of the lower 32-bit word in Rs1
+ * with the bottom 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, multiply the
+ * bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the top 16-bit Q15 content of the
+ * lower 32-bit word in Rs2.
+ * The two 32-bit Q30 results are then written into Rd. The result calculated from the top 16-bit of the
+ * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
+ * the lower 32-bit word in Rs1 is written to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `SMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `SMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop s* bop;
+ * }
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H] = rest;
+ * R[t_L] = resb;
+ * * RV64:
+ * if (is `SMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `SMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop s* bop;
+ * }
+ * Rd.W[1] = rest;
+ * Rd.W[0] = resb;
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SMUL16(unsigned int a, unsigned int b)
+{
+    register unsigned long long products; /* 64-bit value holding the two 32-bit results */
+    __ASM volatile("smul16 %0, %1, %2" : "=r"(products) : "r"(a), "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.123.1. SMUL16 ===== */
+
+/* ===== Inline Function Start for 3.123.2. SMULX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief SMULX16 (SIMD Signed Crossed 16-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMUL16 Rd, Rs1, Rs2
+ * SMULX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do signed 16-bit multiplications and generate two 32-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of Rs1 with
+ * the top 16-bit Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1
+ * with the bottom 16-bit Q15 content of Rs2.
+ * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of Rs1 with the bottom 16-bit
+ * Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 with the top 16-
+ * bit Q15 content of Rs2.
+ * The two Q30 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
+ * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
+ * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of the lower
+ * 32-bit word in Rs1 with the top 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time,
+ * multiply the bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the bottom 16-bit Q15
+ * content of the lower 32-bit word in Rs2.
+ * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of the lower 32-bit word in Rs1
+ * with the bottom 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, multiply the
+ * bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the top 16-bit Q15 content of the
+ * lower 32-bit word in Rs2.
+ * The two 32-bit Q30 results are then written into Rd. The result calculated from the top 16-bit of the
+ * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
+ * the lower 32-bit word in Rs1 is written to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `SMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `SMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop s* bop;
+ * }
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H] = rest;
+ * R[t_L] = resb;
+ * * RV64:
+ * if (is `SMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `SMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop s* bop;
+ * }
+ * Rd.W[1] = rest;
+ * Rd.W[0] = resb;
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SMULX16(unsigned int a, unsigned int b)
+{
+    register unsigned long long products; /* 64-bit value holding the two crossed 32-bit results */
+    __ASM volatile("smulx16 %0, %1, %2" : "=r"(products) : "r"(a), "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.123.2. SMULX16 ===== */
+
+/* ===== Inline Function Start for 3.124. SRA.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief SRA.u (Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform an arithmetic right shift operation with rounding. The shift amount is a variable
+ * from a GPR.
+ *
+ * **Description**:\n
+ * This instruction right-shifts the content of Rs1 arithmetically. The shifted out bits are
+ * filled with the sign-bit and the shift amount is specified by the low-order 5-bits (RV32) or 6-bits
+ * (RV64) of the Rs2 register. For the rounding operation, a value of 1 is added to the most significant
+ * discarded bit of the data to calculate the final result. And the result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
+ *   Rd = res[31:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * * RV64:
+ * sa = Rs2[5:0];
+ * if (sa > 0) {
+ *   res[63:-1] = SE65(Rs1[63:(sa-1)]) + 1;
+ *   Rd = res[63:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SRA_U(long a, unsigned int b)
+{
+    register long rounded; /* a shifted right arithmetically by b, with rounding */
+    __ASM volatile("sra.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 3.124. SRA.u ===== */
+
+/* ===== Inline Function Start for 3.125. SRAI.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief SRAI.u (Rounding Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI.u Rd, Rs1, imm6u[4:0] (RV32)
+ * SRAI.u Rd, Rs1, imm6u[5:0] (RV64)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform an arithmetic right shift operation with rounding. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * This instruction right-shifts the content of Rs1 arithmetically. The shifted out bits are
+ * filled with the sign-bit and the shift amount is specified by the imm6u[4:0] (RV32) or imm6u[5:0]
+ * (RV64) constant. For the rounding operation, a value of 1 is added to the most significant discarded
+ * bit of the data to calculate the final result. And the result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * sa = imm6u[4:0];
+ * if (sa > 0) {
+ *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
+ *   Rd = res[31:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * * RV64:
+ * sa = imm6u[5:0];
+ * if (sa > 0) {
+ *   res[63:-1] = SE65(Rs1[63:(sa-1)]) + 1;
+ *   Rd = res[63:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+#define __RV_SRAI_U(a, b)    \
+    ({    \
+        register long __rv_srai_u_in = (long)(a); /* bind (a) before the output local is declared */    \
+        register long __rv_srai_u_out;    \
+        __ASM volatile("srai.u %0, %1, %2" : "=r"(__rv_srai_u_out) : "r"(__rv_srai_u_in), "K"(b));    \
+        __rv_srai_u_out;    \
+    })
+/* ===== Inline Function End for 3.125. SRAI.u ===== */
+
+/* ===== Inline Function Start for 3.126.1. SRA8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRA8 (SIMD 8-bit Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA8 Rd, Rs1, Rs2
+ * SRA8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 3-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 8-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRA8.u
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else { // SRA8
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA8(unsigned long a, unsigned int b)
+{
+    register unsigned long shifted; /* 8-bit elements of a, each shifted right arithmetically */
+    __ASM volatile("sra8 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 3.126.1. SRA8 ===== */
+
+/* ===== Inline Function Start for 3.126.2. SRA8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRA8.u (SIMD 8-bit Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA8 Rd, Rs1, Rs2
+ * SRA8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 3-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 8-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRA8.u
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else { // SRA8
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA8_U(unsigned long a, unsigned int b)
+{
+    register unsigned long rounded; /* 8-bit elements of a, shifted right arithmetically with rounding */
+    __ASM volatile("sra8.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 3.126.2. SRA8.u ===== */
+
+/* ===== Inline Function Start for 3.127.1. SRAI8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRAI8 (SIMD 8-bit Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI8 Rd, Rs1, imm3u
+ * SRAI8.u Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the imm3u
+ * constant. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 8-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI8.u
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else { // SRAI8
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRAI8(a, b)    \
+    ({    \
+        register unsigned long __rv_srai8_in = (unsigned long)(a); /* bind (a) before the output local */    \
+        register unsigned long __rv_srai8_out;    \
+        __ASM volatile("srai8 %0, %1, %2" : "=r"(__rv_srai8_out) : "r"(__rv_srai8_in), "K"(b));    \
+        __rv_srai8_out;    \
+    })
+/* ===== Inline Function End for 3.127.1. SRAI8 ===== */
+
+/* ===== Inline Function Start for 3.127.2. SRAI8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRAI8.u (SIMD 8-bit Rounding Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI8 Rd, Rs1, imm3u
+ * SRAI8.u Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the imm3u
+ * constant. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 8-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI8.u
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else { // SRAI8
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRAI8_U(a, b)    \
+    ({    \
+        register unsigned long __rv_srai8_u_in = (unsigned long)(a); /* bind (a) before the output local */    \
+        register unsigned long __rv_srai8_u_out;    \
+        __ASM volatile("srai8.u %0, %1, %2" : "=r"(__rv_srai8_u_out) : "r"(__rv_srai8_u_in), "K"(b));    \
+        __rv_srai8_u_out;    \
+    })
+/* ===== Inline Function End for 3.127.2. SRAI8.u ===== */
+
+/* ===== Inline Function Start for 3.128.1. SRA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRA16 (SIMD 16-bit Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA16 Rd, Rs1, Rs2
+ * SRA16.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 4-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 16-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa != 0) {
+ *   if (`.u` form) { // SRA16.u
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else { // SRA16
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA16(unsigned long a, unsigned long b)
+{
+    register unsigned long shifted; /* 16-bit elements of a, each shifted right arithmetically */
+    __ASM volatile("sra16 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 3.128.1. SRA16 ===== */
+
+/* ===== Inline Function Start for 3.128.2. SRA16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRA16.u (SIMD 16-bit Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA16 Rd, Rs1, Rs2
+ * SRA16.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 4-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 16-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa != 0) {
+ *   if (`.u` form) { // SRA16.u
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else { // SRA16
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA16_U(unsigned long a, unsigned long b)
+{
+    register unsigned long rounded; /* 16-bit elements of a, shifted right arithmetically with rounding */
+    __ASM volatile("sra16.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 3.128.2. SRA16.u ===== */
+
+/* ===== Inline Function Start for 3.129.1. SRAI16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRAI16 (SIMD 16-bit Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI16 Rd, Rs1, imm4u
+ * SRAI16.u Rd, Rs1, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements arithmetic right shift operations simultaneously. The shift amount is
+ * an immediate value. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the 16-bit data elements. The shift amount is specified by the
+ * imm4u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
+ * significant discarded bit of each 16-bit data to calculate the final results. And the results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u[3:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI16.u
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else { // SRAI16
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRAI16(a, b)    \
+    ({    \
+        register unsigned long __rv_srai16_in = (unsigned long)(a); /* bind (a) before the output local */    \
+        register unsigned long __rv_srai16_out;    \
+        __ASM volatile("srai16 %0, %1, %2" : "=r"(__rv_srai16_out) : "r"(__rv_srai16_in), "K"(b));    \
+        __rv_srai16_out;    \
+    })
+/* ===== Inline Function End for 3.129.1. SRAI16 ===== */
+
+/* ===== Inline Function Start for 3.129.2. SRAI16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRAI16.u (SIMD 16-bit Rounding Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI16 Rd, Rs1, imm4u
+ * SRAI16.u Rd, Rs1, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements arithmetic right shift operations simultaneously. The shift amount is
+ * an immediate value. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the 16-bit data elements. The shift amount is specified by the
+ * imm4u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
+ * significant discarded bit of each 16-bit data to calculate the final results. And the results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u[3:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI16.u
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else { // SRAI16
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRAI16_U(a, b)    \
+    ({    \
+        register unsigned long __rv_srai16_u_in = (unsigned long)(a); /* bind (a) before the output local */    \
+        register unsigned long __rv_srai16_u_out;    \
+        __ASM volatile("srai16.u %0, %1, %2" : "=r"(__rv_srai16_u_out) : "r"(__rv_srai16_u_in), "K"(b));    \
+        __rv_srai16_u_out;    \
+    })
+/* ===== Inline Function End for 3.129.2. SRAI16.u ===== */
+
+/* ===== Inline Function Start for 3.130.1. SRL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRL8 (SIMD 8-bit Shift Right Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL8 Rd, Rs1, Rs2
+ * SRL8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
+ * filled with zero. The shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
+ * For the rounding operation of the `.u` form, a value of 1 is added to the most significant discarded
+ * bit of each 8-bit data element to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL8.u
+ *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[8:1];
+ *   } else { // SRL8
+ *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL8(unsigned long a, unsigned int b)
+{
+    register unsigned long shifted; /* 8-bit elements of a, each shifted right logically */
+    __ASM volatile("srl8 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 3.130.1. SRL8 ===== */
+
+/* ===== Inline Function Start for 3.130.2. SRL8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRL8.u (SIMD 8-bit Rounding Shift Right Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL8 Rd, Rs1, Rs2
+ * SRL8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
+ * filled with zero. The shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
+ * For the rounding operation of the `.u` form, a value of 1 is added to the most significant discarded
+ * bit of each 8-bit data element to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL8.u
+ *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[8:1];
+ *   } else { // SRL8
+ *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL8_U(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* destination of `srl8.u`: the rounding form of the 8-bit logical right shift */
+    __ASM volatile("srl8.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.130.2. SRL8.u ===== */
+
+/* ===== Inline Function Start for 3.131.1. SRLI8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRLI8 (SIMD 8-bit Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI8 Rt, Ra, imm3u
+ * SRLI8.u Rt, Ra, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
+ * filled with zero. The shift amount is specified by the imm3u constant. For the rounding operation of
+ * the `.u` form, a value of 1 is added to the most significant discarded bit of each 8-bit data element to
+ * calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI8.u
+ *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[8:1];
+ *   } else { // SRLI8
+ *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRLI8(a, b)    \
+    ({    \
+        unsigned long __rv_a = (unsigned long)(a); /* read (a) before __rv_res exists: no shadowing */    \
+        unsigned long __rv_res;    \
+        __ASM volatile("srli8 %0, %1, %2" : "=r"(__rv_res) : "r"(__rv_a), "K"(b));    \
+        __rv_res;    \
+    })
+/* ===== Inline Function End for 3.131.1. SRLI8 ===== */
+
+/* ===== Inline Function Start for 3.131.2. SRLI8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRLI8.u (SIMD 8-bit Rounding Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI8 Rt, Ra, imm3u
+ * SRLI8.u Rt, Ra, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
+ * filled with zero. The shift amount is specified by the imm3u constant. For the rounding operation of
+ * the `.u` form, a value of 1 is added to the most significant discarded bit of each 8-bit data element to
+ * calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI8.u
+ *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[8:1];
+ *   } else { // SRLI8
+ *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRLI8_U(a, b)    \
+    ({    \
+        unsigned long __rv_a = (unsigned long)(a); /* read (a) before __rv_res exists: no shadowing */    \
+        unsigned long __rv_res;    \
+        __ASM volatile("srli8.u %0, %1, %2" : "=r"(__rv_res) : "r"(__rv_a), "K"(b));    \
+        __rv_res;    \
+    })
+/* ===== Inline Function End for 3.131.2. SRLI8.u ===== */
+
+/* ===== Inline Function Start for 3.132.1. SRL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRL16 (SIMD 16-bit Shift Right Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL16 Rt, Ra, Rb
+ *  SRL16.u Rt, Ra, Rb
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical right shift operations simultaneously. The shift amount is a variable from a GPR. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the low-order 4-bits of the value in the Rs2
+ * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 16-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL16.u
+ *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[16:1];
+ *   } else { // SRL16
+ *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL16(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* destination of `srl16`; sources are `a` (data) and `b` (shift amount) */
+    __ASM volatile("srl16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.132.1. SRL16 ===== */
+
+/* ===== Inline Function Start for 3.132.2. SRL16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRL16.u (SIMD 16-bit Rounding Shift Right Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL16 Rt, Ra, Rb
+ *  SRL16.u Rt, Ra, Rb
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical right shift operations simultaneously. The shift amount is a variable from a GPR. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the low-order 4-bits of the value in the Rs2
+ * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 16-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL16.u
+ *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[16:1];
+ *   } else { // SRL16
+ *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL16_U(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* destination of `srl16.u`: the rounding form of the 16-bit logical right shift */
+    __ASM volatile("srl16.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.132.2. SRL16.u ===== */
+
+/* ===== Inline Function Start for 3.133.1. SRLI16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRLI16 (SIMD 16-bit Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI16 Rt, Ra, imm4u
+ * SRLI16.u Rt, Ra, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the imm4u constant. For the rounding
+ * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 16-bit
+ * data element to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u;
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI16.u
+ *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[16:1];
+ *   } else { // SRLI16
+ *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRLI16(a, b)    \
+    ({    \
+        unsigned long __rv_a = (unsigned long)(a); /* read (a) before __rv_res exists: no shadowing */    \
+        unsigned long __rv_res;    \
+        __ASM volatile("srli16 %0, %1, %2" : "=r"(__rv_res) : "r"(__rv_a), "K"(b));    \
+        __rv_res;    \
+    })
+/* ===== Inline Function End for 3.133.1. SRLI16 ===== */
+
+/* ===== Inline Function Start for 3.133.2. SRLI16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRLI16.u (SIMD 16-bit Rounding Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI16 Rt, Ra, imm4u
+ * SRLI16.u Rt, Ra, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the imm4u constant. For the rounding
+ * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 16-bit
+ * data element to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u;
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI16.u
+ *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[16:1];
+ *   } else { // SRLI16
+ *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRLI16_U(a, b)    \
+    ({    \
+        unsigned long __rv_a = (unsigned long)(a); /* read (a) before __rv_res exists: no shadowing */    \
+        unsigned long __rv_res;    \
+        __ASM volatile("srli16.u %0, %1, %2" : "=r"(__rv_res) : "r"(__rv_a), "K"(b));    \
+        __rv_res;    \
+    })
+/* ===== Inline Function End for 3.133.2. SRLI16.u ===== */
+
+/* ===== Inline Function Start for 3.134. STAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief STAS16 (SIMD 16-bit Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * STAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element addition and 16-bit integer element subtraction in a 32-bit
+ * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit integer element in [31:16] of 32-bit chunks in Rs1 with
+ * the 16-bit integer element in [31:16] of 32-bit chunks in Rs2, and writes the result to [31:16] of 32-bit
+ * chunks in Rd; at the same time, it subtracts the 16-bit integer element in [15:0] of 32-bit chunks in
+ * Rs2 from the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of 32-
+ * bit chunks in Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = Rs1.W[x][31:16] + Rs2.W[x][31:16];
+ * Rd.W[x][15:0] = Rs1.W[x][15:0] - Rs2.W[x][15:0];
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_STAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination of `stas16`: upper halfwords added, lower halfwords subtracted */
+    __ASM volatile("stas16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.134. STAS16 ===== */
+
+/* ===== Inline Function Start for 3.135. STSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief STSA16 (SIMD 16-bit Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * STSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element subtraction and 16-bit integer element addition in a 32-bit
+ * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit integer element in [31:16] of 32-bit chunks in Rs2
+ * from the 16-bit integer element in [31:16] of 32-bit chunks in Rs1, and writes the result to [31:16] of
+ * 32-bit chunks in Rd; at the same time, it adds the 16-bit integer element in [15:0] of 32-bit chunks in
+ * Rs2 with the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of
+ * 32-bit chunks in Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = Rs1.W[x][31:16] - Rs2.W[x][31:16];
+ * Rd.W[x][15:0] = Rs1.W[x][15:0] + Rs2.W[x][15:0];
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_STSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination of `stsa16`: upper halfwords subtracted, lower halfwords added */
+    __ASM volatile("stsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.135. STSA16 ===== */
+
+/* ===== Inline Function Start for 3.136. SUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief SUB8 (SIMD 8-bit Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit integer element subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit integer elements in Rs2 from the 8-bit integer
+ * elements in Rs1, and then writes the result to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = Rs1.B[x] - Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUB8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination of `sub8`: element-wise a - b on 8-bit elements */
+    __ASM volatile("sub8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.136. SUB8 ===== */
+
+/* ===== Inline Function Start for 3.137. SUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief SUB16 (SIMD 16-bit Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit integer elements in Rs2 from the 16-bit integer
+ * elements in Rs1, and then writes the result to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = Rs1.H[x] - Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination of `sub16`: element-wise a - b on 16-bit elements */
+    __ASM volatile("sub16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.137. SUB16 ===== */
+
+/* ===== Inline Function Start for 3.138. SUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief SUB64 (64-bit Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit signed or unsigned integer subtraction.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit integer of an even/odd pair of registers
+ * specified by Rs2(4,1) from the 64-bit integer of an even/odd pair of registers specified by Rs1(4,1),
+ * and then writes the 64-bit result to an even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit integer of Rs2 from the 64-bit integer of Rs1,
+ * and then writes the 64-bit result to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * R[t_H].R[t_L] = R[a_H].R[a_L] - R[b_H].R[b_L];
+ * * RV64:
+ * Rd = Rs1 - Rs2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SUB64(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* destination of `sub64`: the 64-bit difference a - b */
+    __ASM volatile("sub64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.138. SUB64 ===== */
+
+/* ===== Inline Function Start for 3.139.1. SUNPKD810 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD810 (Signed Unpacking Bytes 1 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD810(unsigned long a)
+{
+    unsigned long rd; /* destination of `sunpkd810`: bytes 1 and 0 sign-extended to halfwords */
+    __ASM volatile("sunpkd810 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.1. SUNPKD810 ===== */
+
+/* ===== Inline Function Start for 3.139.2. SUNPKD820 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD820 (Signed Unpacking Bytes 2 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD820(unsigned long a)
+{
+    unsigned long rd; /* destination of `sunpkd820`: bytes 2 and 0 sign-extended to halfwords */
+    __ASM volatile("sunpkd820 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.2. SUNPKD820 ===== */
+
+/* ===== Inline Function Start for 3.139.3. SUNPKD830 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD830 (Signed Unpacking Bytes 3 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD830(unsigned long a)
+{
+    unsigned long rd; /* destination of `sunpkd830`: bytes 3 and 0 sign-extended to halfwords */
+    __ASM volatile("sunpkd830 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.3. SUNPKD830 ===== */
+
+/* ===== Inline Function Start for 3.139.4. SUNPKD831 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD831 (Signed Unpacking Bytes 3 & 1)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD831(unsigned long a)
+{
+    unsigned long rd; /* destination of `sunpkd831`: bytes 3 and 1 sign-extended to halfwords */
+    __ASM volatile("sunpkd831 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.4. SUNPKD831 ===== */
+
+/* ===== Inline Function Start for 3.139.5. SUNPKD832 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD832 (Signed Unpacking Bytes 3 & 2)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x and byte y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD832(unsigned long a)
+{
+    unsigned long rd; /* destination of `sunpkd832`: bytes 3 and 2 sign-extended to halfwords */
+    __ASM volatile("sunpkd832 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.5. SUNPKD832 ===== */
+
+/* ===== Inline Function Start for 3.140. SWAP8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief SWAP8 (Swap Byte within Halfword)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SWAP8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Swap the bytes within each halfword of a register.
+ *
+ * **Description**:\n
+ * This instruction swaps the bytes within each halfword of Rs1 and writes the result to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = CONCAT(Rs1.H[x][7:0],Rs1.H[x][15:8]);
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SWAP8(unsigned long a)
+{
+    unsigned long rd; /* destination of `swap8`: the two bytes of every halfword of `a` exchanged */
+    __ASM volatile("swap8 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.140. SWAP8 ===== */
+
+/* ===== Inline Function Start for 3.141. SWAP16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief SWAP16 (Swap Halfword within Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SWAP16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Swap the 16-bit halfwords within each word of a register.
+ *
+ * **Description**:\n
+ * This instruction swaps the 16-bit halfwords within each word of Rs1 and writes the
+ * result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = CONCAT(Rs1.W[x][15:0],Rs1.W[x][31:16]);
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SWAP16(unsigned long a)
+{
+    unsigned long rd; /* destination of `swap16`: the two halfwords of every word of `a` exchanged */
+    __ASM volatile("swap16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.141. SWAP16 ===== */
+
+/* ===== Inline Function Start for 3.142. UCLIP8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief UCLIP8 (SIMD 8-bit Unsigned Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCLIP8 Rt, Ra, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 8-bit signed elements of a register into an unsigned range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 8-bit signed elements stored in Rs1 into an unsigned integer
+ * range between 2^imm3u-1 and 0, and writes the limited results to Rd. For example, if imm3u is 3, the 8-
+ * bit input values should be saturated between 7 and 0. If saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.B[x];
+ * if (src > (2^imm3u)-1) {
+ *   src = (2^imm3u)-1;
+ *   OV = 1;
+ * } else if (src < 0) {
+ *   src = 0;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = src;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_UCLIP8(a, b)    \
+    ({    \
+        unsigned long __rv_a = (unsigned long)(a); /* read (a) before __rv_res exists: no shadowing */    \
+        unsigned long __rv_res;    \
+        __ASM volatile("uclip8 %0, %1, %2" : "=r"(__rv_res) : "r"(__rv_a), "K"(b));    \
+        __rv_res;    \
+    })
+/* ===== Inline Function End for 3.142. UCLIP8 ===== */
+
+/* ===== Inline Function Start for 3.143. UCLIP16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief UCLIP16 (SIMD 16-bit Unsigned Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCLIP16 Rt, Ra, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 16-bit signed elements of a register into an unsigned range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 16-bit signed elements stored in Rs1 into an unsigned
+ * integer range between 2^imm4u-1 and 0, and writes the limited results to Rd. For example, if imm4u is
+ * 3, the 16-bit input values should be saturated between 7 and 0. If saturation is performed, set OV bit
+ * to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.H[x];
+ * if (src > (2^imm4u)-1) {
+ *   src = (2^imm4u)-1;
+ *   OV = 1;
+ * } else if (src < 0) {
+ *   src = 0;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = src;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_UCLIP16(a, b)    \
+    ({    \
+        unsigned long __rv_a = (unsigned long)(a); /* read (a) before __rv_res exists: no shadowing */    \
+        unsigned long __rv_res;    \
+        __ASM volatile("uclip16 %0, %1, %2" : "=r"(__rv_res) : "r"(__rv_a), "K"(b));    \
+        __rv_res;    \
+    })
+/* ===== Inline Function End for 3.143. UCLIP16 ===== */
+
+/* ===== Inline Function Start for 3.144. UCLIP32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief UCLIP32 (SIMD 32-bit Unsigned Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCLIP32 Rd, Rs1, imm5u[4:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 32-bit signed integer elements of a register into an unsigned range
+ * simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 32-bit signed integer elements stored in Rs1 into an
+ * unsigned integer range between 2^imm5u-1 and 0, and writes the limited results to Rd. For example, if
+ * imm5u is 3, the 32-bit input values should be saturated between 7 and 0. If saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.W[x];
+ * if (src > (2^imm5u)-1) {
+ *   src = (2^imm5u)-1;
+ *   OV = 1;
+ * } else if (src < 0) {
+ *   src = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = src
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_UCLIP32(a, b)    \
+    ({    \
+        unsigned long __rv_a = (unsigned long)(a); /* read (a) before __rv_res exists: no shadowing */    \
+        unsigned long __rv_res;    \
+        __ASM volatile("uclip32 %0, %1, %2" : "=r"(__rv_res) : "r"(__rv_a), "K"(b));    \
+        __rv_res;    \
+    })
+/* ===== Inline Function End for 3.144. UCLIP32 ===== */
+
+/* ===== Inline Function Start for 3.145. UCMPLE8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief UCMPLE8 (SIMD 8-bit Unsigned Compare Less Than & Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCMPLE8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements less than & equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it
+ * is true, the result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] <=u Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UCMPLE8(unsigned long a, unsigned long b)
+{
+    register unsigned long mask; /* per byte: 0xff where a <= b (unsigned), else 0x0 */
+    __ASM volatile("ucmple8 %0, %1, %2" : "=r"(mask) : "r"(a), "r"(b));
+    return mask;
+}
+/* ===== Inline Function End for 3.145. UCMPLE8 ===== */
+
+/* ===== Inline Function Start for 3.146. UCMPLE16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief UCMPLE16 (SIMD 16-bit Unsigned Compare Less Than & Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCMPLE16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements less than & equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it
+ * is true, the result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] <=u Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UCMPLE16(unsigned long a, unsigned long b)
+{
+    register unsigned long mask; /* per halfword: 0xffff where a <= b (unsigned), else 0x0 */
+    __ASM volatile("ucmple16 %0, %1, %2" : "=r"(mask) : "r"(a), "r"(b));
+    return mask;
+}
+/* ===== Inline Function End for 3.146. UCMPLE16 ===== */
+
+/* ===== Inline Function Start for 3.147. UCMPLT8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief UCMPLT8 (SIMD 8-bit Unsigned Compare Less Than)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCMPLT8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements less than comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
+ * result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] <u Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UCMPLT8(unsigned long a, unsigned long b)
+{
+    register unsigned long mask; /* per byte: 0xff where a < b (unsigned), else 0x0 */
+    __ASM volatile("ucmplt8 %0, %1, %2" : "=r"(mask) : "r"(a), "r"(b));
+    return mask;
+}
+/* ===== Inline Function End for 3.147. UCMPLT8 ===== */
+
+/* ===== Inline Function Start for 3.148. UCMPLT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief UCMPLT16 (SIMD 16-bit Unsigned Compare Less Than)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCMPLT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements less than comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
+ * result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] <u Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UCMPLT16(unsigned long a, unsigned long b)
+{
+    register unsigned long mask; /* per halfword: 0xffff where a < b (unsigned), else 0x0 */
+    __ASM volatile("ucmplt16 %0, %1, %2" : "=r"(mask) : "r"(a), "r"(b));
+    return mask;
+}
+/* ===== Inline Function End for 3.148. UCMPLT16 ===== */
+
+/* ===== Inline Function Start for 3.149. UKADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief UKADD8 (SIMD 8-bit Unsigned Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2. If any of the results are beyond the 8-bit unsigned number range
+ * (0 <= RES <= 2^8-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] + Rs2.B[x];
+ * if (res[x] > (2^8)-1) {
+ *   res[x] = (2^8)-1;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADD8(unsigned long a, unsigned long b)
+{
+    register unsigned long sums; /* per-byte a + b, each saturated at (2^8)-1 */
+    __ASM volatile("ukadd8 %0, %1, %2" : "=r"(sums) : "r"(a), "r"(b));
+    return sums;
+}
+/* ===== Inline Function End for 3.149. UKADD8 ===== */
+
+/* ===== Inline Function Start for 3.150. UKADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKADD16 (SIMD 16-bit Unsigned Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2. If any of the results are beyond the 16-bit unsigned number
+ * range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] + Rs2.H[x];
+ * if (res[x] > (2^16)-1) {
+ *   res[x] = (2^16)-1;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADD16(unsigned long a, unsigned long b)
+{
+    register unsigned long sums; /* per-halfword a + b, each saturated at (2^16)-1 */
+    __ASM volatile("ukadd16 %0, %1, %2" : "=r"(sums) : "r"(a), "r"(b));
+    return sums;
+}
+/* ===== Inline Function End for 3.150. UKADD16 ===== */
+
+/* ===== Inline Function Start for 3.151. UKADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief UKADD64 (64-bit Unsigned Saturating Addition)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit unsigned integers. The result is saturated to the U64 range.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit unsigned integer of an even/odd pair of registers
+ * specified by Rs1(4,1) with the 64-bit unsigned integer of an even/odd pair of registers specified by
+ * Rs2(4,1). If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to the
+ * range and the OV bit is set to 1. The saturated result is written to an even/odd pair of registers
+ * specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction adds the 64-bit unsigned integer in Rs1 with the 64-bit unsigned
+ * integer in Rs2. If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to
+ * the range and the OV bit is set to 1. The saturated result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * result = R[a_H].R[a_L] + R[b_H].R[b_L];
+ * if (result > (2^64)-1) {
+ *   result = (2^64)-1; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * * RV64:
+ * result = Rs1 + Rs2;
+ * if (result > (2^64)-1) {
+ *   result = (2^64)-1; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UKADD64(unsigned long long a, unsigned long long b)
+{
+    register unsigned long long sum; /* a + b, saturated to the U64 range */
+    __ASM volatile("ukadd64 %0, %1, %2" : "=r"(sum) : "r"(a), "r"(b));
+    return sum;
+}
+/* ===== Inline Function End for 3.151. UKADD64 ===== */
+
+/* ===== Inline Function Start for 3.152. UKADDH ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief UKADDH (Unsigned Addition with U16 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADDH Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add the unsigned lower 32-bit content of two registers with U16 saturation.
+ *
+ * **Description**:\n
+ * The unsigned lower 32-bit content of Rs1 is added with the unsigned lower 32-bit
+ * content of Rs2. And the result is saturated to the 16-bit unsigned integer range of [0, 2^16-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] + Rs2.W[0];
+ * if (tmp > (2^16)-1) {
+ *   tmp = (2^16)-1;
+ *   OV = 1;
+ * }
+ * Rd = SE(tmp[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADDH(unsigned int a, unsigned int b)
+{
+    register unsigned long sat; /* a + b after U16 saturation, as written to Rd */
+    __ASM volatile("ukaddh %0, %1, %2" : "=r"(sat) : "r"(a), "r"(b));
+    return sat;
+}
+/* ===== Inline Function End for 3.152. UKADDH ===== */
+
+/* ===== Inline Function Start for 3.153. UKADDW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief UKADDW (Unsigned Addition with U32 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADDW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add the unsigned lower 32-bit content of two registers with U32 saturation.
+ *
+ * **Description**:\n
+ * The unsigned lower 32-bit content of Rs1 is added with the unsigned lower 32-bit
+ * content of Rs2. And the result is saturated to the 32-bit unsigned integer range of [0, 2^32-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] + Rs2.W[0];
+ * if (tmp > (2^32)-1) {
+ *   tmp[31:0] = (2^32)-1;
+ *   OV = 1;
+ * }
+ * Rd = tmp[31:0]; // RV32
+ * Rd = SE(tmp[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADDW(unsigned int a, unsigned int b)
+{
+    register unsigned long sat; /* a + b after U32 saturation, as written to Rd */
+    __ASM volatile("ukaddw %0, %1, %2" : "=r"(sat) : "r"(a), "r"(b));
+    return sat;
+}
+/* ===== Inline Function End for 3.153. UKADDW ===== */
+
+/* ===== Inline Function Start for 3.154. UKCRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKCRAS16 (SIMD 16-bit Unsigned Saturating Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKCRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 16-bit unsigned integer element saturating addition and one 16-bit unsigned
+ * integer element saturating subtraction in a 32-bit chunk simultaneously. Operands are from crossed
+ * positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2; at the same time, it
+ * subtracts the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit
+ * unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the 16-bit
+ * unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1.
+ * The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit
+ * chunks in Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0];
+ * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16];
+ * if (res1 > (2^16)-1) {
+ *   res1 = (2^16)-1;
+ *   OV = 1;
+ * }
+ * if (res2 < 0) {
+ *   res2 = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKCRAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* per 32-bit chunk: [31:16] saturated add, [15:0] saturated subtract */
+    __ASM volatile("ukcras16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 3.154. UKCRAS16 ===== */
+
+/* ===== Inline Function Start for 3.155. UKCRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKCRSA16 (SIMD 16-bit Unsigned Saturating Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKCRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 16-bit unsigned integer element saturating subtraction and one 16-bit unsigned
+ * integer element saturating addition in a 32-bit chunk simultaneously. Operands are from crossed
+ * positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer element in [15:0] of 32-bit
+ * chunks in Rs2 from the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs1; at the
+ * same time, it adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2 with the 16-
+ * bit unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the
+ * 16-bit unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set
+ * to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of
+ * 32-bit chunks in Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0];
+ * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16];
+ * if (res1 < 0) {
+ *   res1 = 0;
+ *   OV = 1;
+ * }
+ * if (res2 > (2^16)-1) {
+ *   res2 = (2^16)-1; OV = 1;
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKCRSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* per 32-bit chunk: [31:16] saturated subtract, [15:0] saturated add */
+    __ASM volatile("ukcrsa16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 3.155. UKCRSA16 ===== */
+
+/* ===== Inline Function Start for 3.156. UKMAR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief UKMAR64 (Unsigned Multiply and Saturating Add to 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKMAR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned elements in two registers and add the 64-bit multiplication
+ * results to the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). The result is
+ * saturated to the U64 range and written back to the pair of registers (RV32) or the register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
+ * adds the 64-bit multiplication result to the 64-bit unsigned data of an even/odd pair of registers
+ * specified by Rd(4,1) with unlimited precision. If the 64-bit addition result is beyond the U64 number
+ * range (0 <= U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The saturated result is
+ * written back to the even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
+ * It adds the 64-bit multiplication results to the 64-bit unsigned data in Rd with unlimited precision. If
+ * the 64-bit addition result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to the
+ * range and the OV bit is set to 1. The saturated result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * result = R[t_H].R[t_L] + (Rs1 u* Rs2);
+ * if (result > (2^64)-1) {
+ *   result = (2^64)-1; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * * RV64:
+ * // `result` has unlimited precision
+ * result = Rd + (Rs1.W[0] u* Rs2.W[0]) + (Rs1.W[1] u* Rs2.W[1]);
+ * if (result > (2^64)-1) {
+ *   result = (2^64)-1; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  t    unsigned long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UKMAR64(unsigned long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("ukmar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read, then overwritten */
+    return t; /* 64-bit accumulator after the saturating multiply-add */
+}
+/* ===== Inline Function End for 3.156. UKMAR64 ===== */
+
+/* ===== Inline Function Start for 3.157. UKMSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief UKMSR64 (Unsigned Multiply and Saturating Subtract from 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKMSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned elements in two registers and subtract the 64-bit
+ * multiplication results from the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64).
+ * The result is saturated to the U64 range and written back to the pair of registers (RV32) or a register
+ * (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication result from the 64-bit unsigned data of an even/odd pair of
+ * registers specified by Rd(4,1) with unlimited precision. If the 64-bit subtraction result is beyond the
+ * U64 number range (0 <= U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The
+ * saturated result is written back to the even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
+ * It subtracts the 64-bit multiplication results from the 64-bit unsigned data of Rd with unlimited
+ * precision. If the 64-bit subtraction result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is
+ * saturated to the range and the OV bit is set to 1. The saturated result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * result = R[t_H].R[t_L] - (Rs1 u* Rs2);
+ * if (result < 0) {
+ *   result = 0; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * * RV64:
+ * // `result` has unlimited precision
+ * result = Rd - (Rs1.W[0] u* Rs2.W[0]) - (Rs1.W[1] u* Rs2.W[1]);
+ * if (result < 0) {
+ *   result = 0; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  t    unsigned long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UKMSR64(unsigned long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("ukmsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read, then overwritten */
+    return t; /* 64-bit accumulator after the saturating multiply-subtract */
+}
+/* ===== Inline Function End for 3.157. UKMSR64 ===== */
+
+/* ===== Inline Function Start for 3.158. UKSTAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKSTAS16 (SIMD 16-bit Unsigned Saturating Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSTAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 16-bit unsigned integer element saturating addition and one 16-bit unsigned
+ * integer element saturating subtraction in a 32-bit chunk simultaneously. Operands are from
+ * corresponding positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2; at the same time, it
+ * subtracts the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit
+ * unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the 16-bit
+ * unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1.
+ * The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit
+ * chunks in Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16];
+ * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0];
+ * if (res1 > (2^16)-1) {
+ *   res1 = (2^16)-1;
+ *   OV = 1;
+ * }
+ * if (res2 < 0) {
+ *   res2 = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSTAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* per 32-bit chunk: [31:16] saturated add, [15:0] saturated subtract */
+    __ASM volatile("ukstas16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 3.158. UKSTAS16 ===== */
+
+/* ===== Inline Function Start for 3.159. UKSTSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKSTSA16 (SIMD 16-bit Unsigned Saturating Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSTSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 16-bit unsigned integer element saturating subtraction and one 16-bit unsigned
+ * integer element saturating addition in a 32-bit chunk simultaneously. Operands are from
+ * corresponding positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer element in [31:16] of 32-bit
+ * chunks in Rs2 from the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs1; at the
+ * same time, it adds the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2 with the 16-
+ * bit unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the
+ * 16-bit unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set
+ * to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of
+ * 32-bit chunks in Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16];
+ * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0];
+ * if (res1 < 0) {
+ *   res1 = 0;
+ *   OV = 1;
+ * }
+ * if (res2 > (2^16)-1) {
+ *   res2 = (2^16)-1; OV = 1;
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSTSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* per 32-bit chunk: [31:16] saturated subtract, [15:0] saturated add */
+    __ASM volatile("ukstsa16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 3.159. UKSTSA16 ===== */
+
+/* ===== Inline Function Start for 3.160. UKSUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief UKSUB8 (SIMD 8-bit Unsigned Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit unsigned integer elements in Rs2 from the 8-bit
+ * unsigned integer elements in Rs1. If any of the results are beyond the 8-bit unsigned number range
+ * (0 <= RES <= 2^8-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] - Rs2.B[x];
+ * if (res[x] < 0) {
+ *   res[x] = 0;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUB8(unsigned long a, unsigned long b)
+{
+    register unsigned long diffs; /* per-byte a - b, each saturated at 0 */
+    __ASM volatile("uksub8 %0, %1, %2" : "=r"(diffs) : "r"(a), "r"(b));
+    return diffs;
+}
+/* ===== Inline Function End for 3.160. UKSUB8 ===== */
+
+/* ===== Inline Function Start for 3.161. UKSUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKSUB16 (SIMD 16-bit Unsigned Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer elements in Rs2 from the 16-bit
+ * unsigned integer elements in Rs1. If any of the results are beyond the 16-bit unsigned number
+ * range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] - Rs2.H[x];
+ * if (res[x] < 0) {
+ *   res[x] = 0;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUB16(unsigned long a, unsigned long b)
+{
+    register unsigned long diffs; /* per-halfword a - b, each saturated at 0 */
+    __ASM volatile("uksub16 %0, %1, %2" : "=r"(diffs) : "r"(a), "r"(b));
+    return diffs;
+}
+/* ===== Inline Function End for 3.161. UKSUB16 ===== */
+
+/* ===== Inline Function Start for 3.162. UKSUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief UKSUB64 (64-bit Unsigned Saturating Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit unsigned integer subtraction. The result is saturated to the U64 range.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit unsigned integer of an even/odd pair of
+ * registers specified by Rs2(4,1) from the 64-bit unsigned integer of an even/odd pair of registers
+ * specified by Rs1(4,1). If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is
+ * saturated to the range and the OV bit is set to 1. The saturated result is then written to an even/odd
+ * pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit unsigned integer of Rs2 from the 64-bit
+ * unsigned integer of Rs1. If the 64-bit result is beyond the U64 number range (0 <=
+ * U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The saturated result is then written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * result = R[a_H].R[a_L] - R[b_H].R[b_L];
+ * if (result < 0) {
+ *   result = 0; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * * RV64
+ * result = Rs1 - Rs2;
+ * if (result < 0) {
+ *   result = 0; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UKSUB64(unsigned long long a, unsigned long long b)
+{
+    register unsigned long long diff; /* a - b, saturated to the U64 range */
+    __ASM volatile("uksub64 %0, %1, %2" : "=r"(diff) : "r"(a), "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 3.162. UKSUB64 ===== */
+
+/* ===== Inline Function Start for 3.163. UKSUBH ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief UKSUBH (Unsigned Subtraction with U16 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUBH Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract the unsigned lower 32-bit content of two registers with U16 saturation.
+ *
+ * **Description**:\n
+ * The unsigned lower 32-bit content of Rs2 is subtracted from the unsigned lower 32-bit
+ * content of Rs1. And the result is saturated to the 16-bit unsigned integer range of [0, 2^16-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] - Rs2.W[0];
+ * if (tmp > (2^16)-1) {
+ *   tmp = (2^16)-1;
+ *   OV = 1;
+ * }
+ * else if (tmp < 0) {
+ *   tmp = 0;
+ *   OV = 1;
+ * }
+ * Rd = SE(tmp[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUBH(unsigned int a, unsigned int b)
+{
+    unsigned long diff; /* written by uksubh: a - b saturated to the U16 range */
+    __ASM volatile("uksubh %[rd], %[rs1], %[rs2]" : [rd] "=r"(diff) : [rs1] "r"(a), [rs2] "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 3.163. UKSUBH ===== */
+
+/* ===== Inline Function Start for 3.164. UKSUBW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief UKSUBW (Unsigned Subtraction with U32 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUBW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract the unsigned lower 32-bit content of two registers with unsigned 32-bit
+ * saturation.
+ *
+ * **Description**:\n
+ * The unsigned lower 32-bit content of Rs2 is subtracted from the unsigned lower 32-bit
+ * content of Rs1. And the result is saturated to the 32-bit unsigned integer range of [0, 2^32-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] - Rs2.W[0];
+ * if (tmp < 0) {
+ *   tmp[31:0] = 0;
+ *   OV = 1;
+ * }
+ * Rd = tmp[31:0]; // RV32
+ * Rd = SE(tmp[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUBW(unsigned int a, unsigned int b)
+{
+    unsigned long diff; /* written by uksubw: a - b saturated to the U32 range */
+    __ASM volatile("uksubw %[rd], %[rs1], %[rs2]" : [rd] "=r"(diff) : [rs1] "r"(a), [rs2] "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 3.164. UKSUBW ===== */
+
+/* ===== Inline Function Start for 3.165. UMAR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief UMAR64 (Unsigned Multiply and Add to 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned elements in two registers and add the 64-bit multiplication
+ * results to the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). The result is
+ * written back to the pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
+ * adds the 64-bit multiplication result to the 64-bit unsigned data of an even/odd pair of registers
+ * specified by Rd(4,1). The addition result is written back to the even/odd pair of registers specified by
+ * Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
+ * It adds the 64-bit multiplication results to the 64-bit unsigned data of Rd. The addition result is
+ * written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].R[t_L] = R[t_H].R[t_L] + (Rs1 * Rs2);
+ * * RV64:
+ * Rd = Rd + (Rs1.W[0] u* Rs2.W[0]) + (Rs1.W[1] u* Rs2.W[1]);
+ * ~~~
+ *
+ * \param [in]  t    unsigned long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMAR64(unsigned long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("umar64 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: umar64 accumulates into it in place */
+}
+/* ===== Inline Function End for 3.165. UMAR64 ===== */
+
+/* ===== Inline Function Start for 3.166. UMAQA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
+ * \brief UMAQA (Unsigned Multiply Four Bytes with 32-bit Adds)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAQA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do four unsigned 8-bit multiplications from 32-bit chunks of two registers; and then add
+ * the four 16-bit results and the content of corresponding 32-bit chunks of a third register together.
+ *
+ * **Description**:\n
+ * This instruction multiplies the four unsigned 8-bit elements of 32-bit chunks of Rs1 with the four
+ * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the
+ * unsigned content of the corresponding 32-bit chunks of Rd. The final results are written back to the
+ * corresponding 32-bit chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] + (Rs1.W[x].B[3] u* Rs2.W[x].B[3]) +
+ *          (Rs1.W[x].B[2] u* Rs2.W[x].B[2]) + (Rs1.W[x].B[1] u* Rs2.W[x].B[1]) +
+ *          (Rs1.W[x].B[0] u* Rs2.W[x].B[0]);
+ * Rd.W[x] = res[x];
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMAQA(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("umaqa %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: umaqa adds the byte products into it in place */
+}
+/* ===== Inline Function End for 3.166. UMAQA ===== */
+
+/* ===== Inline Function Start for 3.167. UMAX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief UMAX8 (SIMD 8-bit Unsigned Maximum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2 and selects the greater number of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] >u Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMAX8(unsigned long a, unsigned long b)
+{
+    unsigned long maxima; /* written by umax8: per-byte unsigned maximum of a and b */
+    __ASM volatile("umax8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(maxima) : [rs1] "r"(a), [rs2] "r"(b));
+    return maxima;
+}
+/* ===== Inline Function End for 3.167. UMAX8 ===== */
+
+/* ===== Inline Function Start for 3.168. UMAX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief UMAX16 (SIMD 16-bit Unsigned Maximum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2 and selects the greater number of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] >u Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMAX16(unsigned long a, unsigned long b)
+{
+    unsigned long maxima; /* written by umax16: per-halfword unsigned maximum of a and b */
+    __ASM volatile("umax16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(maxima) : [rs1] "r"(a), [rs2] "r"(b));
+    return maxima;
+}
+/* ===== Inline Function End for 3.168. UMAX16 ===== */
+
+/* ===== Inline Function Start for 3.169. UMIN8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief UMIN8 (SIMD 8-bit Unsigned Minimum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMIN8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2 and selects the smaller number of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] <u Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMIN8(unsigned long a, unsigned long b)
+{
+    unsigned long minima; /* written by umin8: per-byte unsigned minimum of a and b */
+    __ASM volatile("umin8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(minima) : [rs1] "r"(a), [rs2] "r"(b));
+    return minima;
+}
+/* ===== Inline Function End for 3.169. UMIN8 ===== */
+
+/* ===== Inline Function Start for 3.170. UMIN16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief UMIN16 (SIMD 16-bit Unsigned Minimum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMIN16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2 and selects the smaller number of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] <u Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMIN16(unsigned long a, unsigned long b)
+{
+    unsigned long minima; /* written by umin16: per-halfword unsigned minimum of a and b */
+    __ASM volatile("umin16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(minima) : [rs1] "r"(a), [rs2] "r"(b));
+    return minima;
+}
+/* ===== Inline Function End for 3.170. UMIN16 ===== */
+
+/* ===== Inline Function Start for 3.171. UMSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief UMSR64 (Unsigned Multiply and Subtract from 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned elements in two registers and subtract the 64-bit
+ * multiplication results from the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64).
+ * The result is written back to the pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication result from the 64-bit unsigned data of an even/odd pair of
+ * registers specified by Rd(4,1). The subtraction result is written back to the even/odd pair of registers
+ * specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
+ * It subtracts the 64-bit multiplication results from the 64-bit unsigned data of Rd. The subtraction
+ * result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].R[t_L] = R[t_H].R[t_L] - (Rs1 * Rs2);
+ * * RV64:
+ * Rd = Rd - (Rs1.W[0] u* Rs2.W[0]) - (Rs1.W[1] u* Rs2.W[1]);
+ * ~~~
+ *
+ * \param [in]  t    unsigned long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMSR64(unsigned long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("umsr64 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: umsr64 subtracts the product from it in place */
+}
+/* ===== Inline Function End for 3.171. UMSR64 ===== */
+
+/* ===== Inline Function Start for 3.172.1. UMUL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief UMUL8 (SIMD Unsigned 8-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMUL8 Rd, Rs1, Rs2
+ * UMULX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do unsigned 8-bit multiplications and generate four 16-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
+ * with the corresponding unsigned 8-bit data elements of Rs2.
+ * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
+ * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
+ * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
+ * elements of Rs2.
+ * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
+ * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
+ * part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
+ * with the corresponding unsigned 8-bit data elements of Rs2.
+ * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
+ * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
+ * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
+ * elements of Rs2.
+ * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
+ * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
+ * the bottom part of Rs1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `UMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `UMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] u* op2t[x/2];
+ * resb[x/2] = op1b[x/2] u* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
+ * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
+ * x = 0 and 2
+ * * RV64:
+ * if (is `UMUL8`) {
+ *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `UMULX8`) {
+ *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] u* op2t[x/2];
+ * resb[x/2] = op1b[x/2] u* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
+ * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0]; x = 0 and 2
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMUL8(unsigned int a, unsigned int b)
+{
+    unsigned long long products; /* written by umul8: four packed 16-bit unsigned products */
+    __ASM volatile("umul8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(products) : [rs1] "r"(a), [rs2] "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.172.1. UMUL8 ===== */
+
+/* ===== Inline Function Start for 3.172.2. UMULX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief UMULX8 (SIMD Unsigned Crossed 8-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMUL8 Rd, Rs1, Rs2
+ * UMULX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do unsigned 8-bit multiplications and generate four 16-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
+ * with the corresponding unsigned 8-bit data elements of Rs2.
+ * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
+ * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
+ * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
+ * elements of Rs2.
+ * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
+ * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
+ * part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
+ * with the corresponding unsigned 8-bit data elements of Rs2.
+ * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
+ * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
+ * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
+ * elements of Rs2.
+ * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
+ * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
+ * the bottom part of Rs1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `UMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `UMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] u* op2t[x/2];
+ * resb[x/2] = op1b[x/2] u* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
+ * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
+ * x = 0 and 2
+ * * RV64:
+ * if (is `UMUL8`) {
+ *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `UMULX8`) {
+ *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] u* op2t[x/2];
+ * resb[x/2] = op1b[x/2] u* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
+ * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0]; x = 0 and 2
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMULX8(unsigned int a, unsigned int b)
+{
+    unsigned long long products; /* written by umulx8: four packed 16-bit crossed unsigned products */
+    __ASM volatile("umulx8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(products) : [rs1] "r"(a), [rs2] "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.172.2. UMULX8 ===== */
+
+/* ===== Inline Function Start for 3.173.1. UMUL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief UMUL16 (SIMD Unsigned 16-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMUL16 Rd, Rs1, Rs2
+ * UMULX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do unsigned 16-bit multiplications and generate two 32-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `UMUL16` instruction, multiply the top 16-bit U16 content of Rs1 with
+ * the top 16-bit U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1
+ * with the bottom 16-bit U16 content of Rs2.
+ * For the `UMULX16` instruction, multiply the top 16-bit U16 content of Rs1 with the bottom 16-bit
+ * U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1 with the top 16-
+ * bit U16 content of Rs2.
+ * The two U32 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
+ * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
+ * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `UMUL16` instruction, multiply the top 16-bit U16 content of the lower
+ * 32-bit word in Rs1 with the top 16-bit U16 content of the lower 32-bit word in Rs2. At the same time,
+ * multiply the bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the bottom 16-bit U16
+ * content of the lower 32-bit word in Rs2.
+ * For the `UMULX16` instruction, multiply the top 16-bit U16 content of the lower 32-bit word in Rs1
+ * with the bottom 16-bit U16 content of the lower 32-bit word in Rs2. At the same time, multiply the
+ * bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the top 16-bit U16 content of the
+ * lower 32-bit word in Rs2.
+ * The two 32-bit U32 results are then written into Rd. The result calculated from the top 16-bit of the
+ * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
+ * the lower 32-bit word in Rs1 is written to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `UMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `UMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop u* bop;
+ * }
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H] = rest;
+ * R[t_L] = resb;
+ * * RV64:
+ * if (is `UMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `UMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop u* bop;
+ * }
+ * Rd.W[1] = rest;
+ * Rd.W[0] = resb;
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMUL16(unsigned int a, unsigned int b)
+{
+    unsigned long long products; /* written by umul16: two packed 32-bit unsigned products */
+    __ASM volatile("umul16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(products) : [rs1] "r"(a), [rs2] "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.173.1. UMUL16 ===== */
+
+/* ===== Inline Function Start for 3.173.2. UMULX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief UMULX16 (SIMD Unsigned Crossed 16-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMUL16 Rd, Rs1, Rs2
+ * UMULX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do unsigned 16-bit multiplications and generate two 32-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `UMUL16` instruction, multiply the top 16-bit U16 content of Rs1 with
+ * the top 16-bit U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1
+ * with the bottom 16-bit U16 content of Rs2.
+ * For the `UMULX16` instruction, multiply the top 16-bit U16 content of Rs1 with the bottom 16-bit
+ * U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1 with the top 16-
+ * bit U16 content of Rs2.
+ * The two U32 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
+ * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
+ * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `UMUL16` instruction, multiply the top 16-bit U16 content of the lower
+ * 32-bit word in Rs1 with the top 16-bit U16 content of the lower 32-bit word in Rs2. At the same time,
+ * multiply the bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the bottom 16-bit U16
+ * content of the lower 32-bit word in Rs2.
+ * For the `UMULX16` instruction, multiply the top 16-bit U16 content of the lower 32-bit word in Rs1
+ * with the bottom 16-bit U16 content of the lower 32-bit word in Rs2. At the same time, multiply the
+ * bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the top 16-bit U16 content of the
+ * lower 32-bit word in Rs2.
+ * The two 32-bit U32 results are then written into Rd. The result calculated from the top 16-bit of the
+ * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
+ * the lower 32-bit word in Rs1 is written to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `UMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `UMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop u* bop;
+ * }
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H] = rest;
+ * R[t_L] = resb;
+ * * RV64:
+ * if (is `UMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `UMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop u* bop;
+ * }
+ * Rd.W[1] = rest;
+ * Rd.W[0] = resb;
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMULX16(unsigned int a, unsigned int b)
+{
+    unsigned long long products; /* written by umulx16: two packed 32-bit crossed unsigned products */
+    __ASM volatile("umulx16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(products) : [rs1] "r"(a), [rs2] "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.173.2. UMULX16 ===== */
+
+/* ===== Inline Function Start for 3.174. URADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief URADD8 (SIMD 8-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer element additions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7F, Rb = 0x7F, Rt = 0x7F
+ * * Ra = 0x80, Rb = 0x80, Rt = 0x80
+ * * Ra = 0x40, Rb = 0x80, Rt = 0x60
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] + Rs2.B[x]) u>> 1;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URADD8(unsigned long a, unsigned long b)
+{
+    unsigned long halved; /* written by uradd8: per-byte (a + b) u>> 1 */
+    __ASM volatile("uradd8 %[rd], %[rs1], %[rs2]" : [rd] "=r"(halved) : [rs1] "r"(a), [rs2] "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.174. URADD8 ===== */
+
+/* ===== Inline Function Start for 3.175. URADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URADD16 (SIMD 16-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element additions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFF, Rb = 0x7FFF, Rt = 0x7FFF
+ * * Ra = 0x8000, Rb = 0x8000, Rt = 0x8000
+ * * Ra = 0x4000, Rb = 0x8000, Rt = 0x6000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] + Rs2.H[x]) u>> 1;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URADD16(unsigned long a, unsigned long b)
+{
+    unsigned long halved; /* written by uradd16: per-halfword (a + b) u>> 1 */
+    __ASM volatile("uradd16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(halved) : [rs1] "r"(a), [rs2] "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 3.175. URADD16 ===== */
+
+/* ===== Inline Function Start for 3.176. URADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief URADD64 (64-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit unsigned integers. The result is halved to avoid overflow or saturation.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit unsigned integer of an even/odd pair of registers
+ * specified by Rs1(4,1) with the 64-bit unsigned integer of an even/odd pair of registers specified by
+ * Rs2(4,1). The 64-bit addition result is first logically right-shifted by 1 bit and then written to an
+ * even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction adds the 64-bit unsigned integer in Rs1 with the 64-bit unsigned
+ * integer Rs2. The 64-bit addition result is first logically right-shifted by 1 bit and then written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rt(4,1),1'b0); t_H = CONCAT(Rt(4,1),1'b1);
+ * a_L = CONCAT(Ra(4,1),1'b0); a_H = CONCAT(Ra(4,1),1'b1);
+ * b_L = CONCAT(Rb(4,1),1'b0); b_H = CONCAT(Rb(4,1),1'b1);
+ * R[t_H].R[t_L] = (R[a_H].R[a_L] + R[b_H].R[b_L]) u>> 1;
+ * * RV64:
+ * Rd = (Rs1 + Rs2) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_URADD64(unsigned long long a, unsigned long long b)
+{
+    register unsigned long long halved; /* 64-bit output operand %0 of uradd64 */
+    __ASM volatile("uradd64 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return halved;
+}
+/* ===== Inline Function End for 3.176. URADD64 ===== */
+
+/* ===== Inline Function Start for 3.177. URADDW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief URADDW (32-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADDW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add 32-bit unsigned integers and the results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the first 32-bit unsigned integer in Rs1 with the first 32-bit
+ * unsigned integer in Rs2. The result is first logically right-shifted by 1 bit and then sign-extended and
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFFFFFF, Rb = 0x7FFFFFFF Rt = 0x7FFFFFFF
+ * * Ra = 0x80000000, Rb = 0x80000000 Rt = 0x80000000
+ * * Ra = 0x40000000, Rb = 0x80000000 Rt = 0x60000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Rd[31:0] = (Rs1[31:0] + Rs2[31:0]) u>> 1;
+ * * RV64:
+ * resw[31:0] = (Rs1[31:0] + Rs2[31:0]) u>> 1;
+ * Rd[63:0] = SE(resw[31:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URADDW(unsigned int a, unsigned int b)
+{
+    register unsigned long rd; /* output operand %0 of uraddw, returned as unsigned long */
+    __ASM volatile("uraddw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return rd;
+}
+/* ===== Inline Function End for 3.177. URADDW ===== */
+
+/* ===== Inline Function Start for 3.178. URCRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URCRAS16 (SIMD 16-bit Unsigned Halving Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URCRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element addition and 16-bit unsigned integer element
+ * subtraction in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
+ * The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1
+ * with the 16-bit unsigned integer in [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit unsigned
+ * integer in [31:16] of 32-bit chunks in Rs2 from the 16-bit unsigned integer in [15:0] of 32-bit chunks
+ * in Rs1. The element results are first logically right-shifted by 1 bit and then written to [31:16] of 32-
+ * bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD16` and `URSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) u>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) u>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URCRAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* output operand %0 of urcras16 */
+    __ASM volatile("urcras16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return packed;
+}
+/* ===== Inline Function End for 3.178. URCRAS16 ===== */
+
+/* ===== Inline Function Start for 3.179. URCRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URCRSA16 (SIMD 16-bit Unsigned Halving Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URCRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element subtraction and 16-bit unsigned integer element
+ * addition in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
+ * The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer in [15:0] of 32-bit chunks in Rs2
+ * from the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit unsigned
+ * integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit unsigned integer in [31:16] of 32-bit chunks
+ * in Rs2. The two results are first logically right-shifted by 1 bit and then written to [31:16] of 32-bit
+ * chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD16` and `URSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) u>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) u>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URCRSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* output operand %0 of urcrsa16 */
+    __ASM volatile("urcrsa16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return packed;
+}
+/* ===== Inline Function End for 3.179. URCRSA16 ===== */
+
+/* ===== Inline Function Start for 3.180. URSTAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URSTAS16 (SIMD 16-bit Unsigned Halving Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSTAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element addition and 16-bit unsigned integer element
+ * subtraction in a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit
+ * chunks. The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1
+ * with the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs2, and subtracts the 16-bit unsigned
+ * integer in [15:0] of 32-bit chunks in Rs2 from the 16-bit unsigned integer in [15:0] of 32-bit chunks
+ * in Rs1. The element results are first logically right-shifted by 1 bit and then written to [31:16] of 32-
+ * bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD16` and `URSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][31:16]) u>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][15:0]) u>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSTAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* output operand %0 of urstas16 */
+    __ASM volatile("urstas16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return packed;
+}
+/* ===== Inline Function End for 3.180. URSTAS16 ===== */
+
+/* ===== Inline Function Start for 3.181. URSTSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URSTSA16 (SIMD 16-bit Unsigned Halving Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSTSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element subtraction and 16-bit unsigned integer element
+ * addition in a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit
+ * chunks. The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs2
+ * from the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit unsigned
+ * integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit unsigned integer in [15:0] of 32-bit chunks in
+ * Rs2. The two results are first logically right-shifted by 1 bit and then written to [31:16] of 32-bit
+ * chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD16` and `URSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][31:16]) u>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][15:0]) u>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSTSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* output operand %0 of urstsa16 */
+    __ASM volatile("urstsa16 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return packed;
+}
+/* ===== Inline Function End for 3.181. URSTSA16 ===== */
+
+/* ===== Inline Function Start for 3.182. URSUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief URSUB8 (SIMD 8-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit unsigned integer elements in Rs2 from the 8-bit
+ * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7F, Rb = 0x80 Rt = 0xFF
+ * * Ra = 0x80, Rb = 0x7F Rt = 0x00
+ * * Ra = 0x80, Rb = 0x40 Rt = 0x20
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] - Rs2.B[x]) u>> 1;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSUB8(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* output operand %0 of ursub8 */
+    __ASM volatile("ursub8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return rd;
+}
+/* ===== Inline Function End for 3.182. URSUB8 ===== */
+
+/* ===== Inline Function Start for 3.183. URSUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URSUB16 (SIMD 16-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer elements in Rs2 from the 16-bit
+ * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFF, Rb = 0x8000 Rt = 0xFFFF
+ * * Ra = 0x8000, Rb = 0x7FFF Rt = 0x0000
+ * * Ra = 0x8000, Rb = 0x4000 Rt = 0x2000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) u>> 1;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSUB16(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* output operand %0 of ursub16 */
+    __ASM volatile("ursub16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return rd;
+}
+/* ===== Inline Function End for 3.183. URSUB16 ===== */
+
+/* ===== Inline Function Start for 3.184. URSUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief URSUB64 (64-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit unsigned integer subtraction. The result is halved to avoid overflow or
+ * saturation.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit unsigned integer of an even/odd pair of
+ * registers specified by Rs2(4,1) from the 64-bit unsigned integer of an even/odd pair of registers
+ * specified by Rs1(4,1). The subtraction result is first logically right-shifted by 1 bit and then written
+ * to an even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit unsigned integer in Rs2 from the 64-bit
+ * unsigned integer in Rs1. The subtraction result is first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rt(4,1),1'b0); t_H = CONCAT(Rt(4,1),1'b1);
+ * a_L = CONCAT(Ra(4,1),1'b0); a_H = CONCAT(Ra(4,1),1'b1);
+ * b_L = CONCAT(Rb(4,1),1'b0); b_H = CONCAT(Rb(4,1),1'b1);
+ * R[t_H].R[t_L] = (R[a_H].R[a_L] - R[b_H].R[b_L]) u>> 1;
+ * * RV64:
+ * Rd = (Rs1 - Rs2) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_URSUB64(unsigned long long a, unsigned long long b)
+{
+    register unsigned long long halved; /* 64-bit output operand %0 of ursub64 */
+    __ASM volatile("ursub64 %0, %1, %2" : "=r"(halved) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return halved;
+}
+/* ===== Inline Function End for 3.184. URSUB64 ===== */
+
+/* ===== Inline Function Start for 3.185. URSUBW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief URSUBW (32-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUBW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract 32-bit unsigned integers and the result is halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the first 32-bit unsigned integer in Rs2 from the first 32-bit
+ * unsigned integer in Rs1. The result is first logically right-shifted by 1 bit and then sign-extended and
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFFFFFF, Rb = 0x80000000 Rt = 0xFFFFFFFF
+ * * Ra = 0x80000000, Rb = 0x7FFFFFFF Rt = 0x00000000
+ * * Ra = 0x80000000, Rb = 0x40000000 Rt = 0x20000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Rd[31:0] = (Rs1[31:0] - Rs2[31:0]) u>> 1;
+ * * RV64:
+ * resw[31:0] = (Rs1[31:0] - Rs2[31:0]) u>> 1;
+ * Rd[63:0] = SE(resw[31:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSUBW(unsigned int a, unsigned int b)
+{
+    register unsigned long rd; /* output operand %0 of ursubw, returned as unsigned long */
+    __ASM volatile("ursubw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return rd;
+}
+/* ===== Inline Function End for 3.185. URSUBW ===== */
+
+/* ===== Inline Function Start for 3.186. WEXTI ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief WEXTI (Extract Word from 64-bit Immediate)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * WEXTI Rd, Rs1, #LSBloc
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Extract a 32-bit word from a 64-bit value stored in an even/odd pair of registers (RV32) or
+ * a register (RV64) starting from a specified immediate LSB bit position.
+ *
+ * **RV32 Description**:\n
+ * This instruction extracts a 32-bit word from a 64-bit value of an even/odd pair of registers specified
+ * by Rs1(4,1) starting from a specified immediate LSB bit position, #LSBloc. The extracted word is
+ * written to Rd.
+ * Rs1(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the 64-bit value and the even `2d`
+ * register of the pair contains the low 32-bit of the 64-bit value.
+ *
+ * **RV64 Description**:\n
+ * This instruction extracts a 32-bit word from a 64-bit value in Rs1 starting from a specified
+ * immediate LSB bit position, #LSBloc. The extracted word is sign-extended and written to lower 32-
+ * bit of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
+ * src[63:0] = Concat(R[Idx1], R[Idx0]);
+ * Rd = src[31+LSBloc:LSBloc];
+ * * RV64:
+ * ExtractW = Rs1[31+LSBloc:LSBloc];
+ * Rd = SE(ExtractW)
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_WEXTI(a, b)    \
+    ({    \
+        register long long __wexti_src = (long long)(a); /* read `a` before any macro local is in scope */    \
+        register unsigned long __wexti_rd; /* output operand %0; `b` goes through the "K" immediate constraint */    \
+        __ASM volatile("wexti %0, %1, %2" : "=r"(__wexti_rd) : "r"(__wexti_src), "K"(b));    \
+        __wexti_rd;    \
+    })
+/* ===== Inline Function End for 3.186. WEXTI ===== */
+
+/* ===== Inline Function Start for 3.187. WEXT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief WEXT (Extract Word from 64-bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * WEXT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Extract a 32-bit word from a 64-bit value stored in an even/odd pair of registers (RV32) or
+ * a register (RV64) starting from a specified LSB bit position in a register.
+ *
+ * **RV32 Description**:\n
+ * This instruction extracts a 32-bit word from a 64-bit value of an even/odd pair of registers specified
+ * by Rs1(4,1) starting from a specified LSB bit position, specified in Rs2[4:0]. The extracted word is
+ * written to Rd.
+ * Rs1(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the 64-bit value and the even `2d`
+ * register of the pair contains the low 32-bit of the 64-bit value.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
+ * src[63:0] = Concat(R[Idx1], R[Idx0]);
+ * LSBloc = Rs2[4:0];
+ * Rd = src[31+LSBloc:LSBloc];
+ * * RV64:
+ * LSBloc = Rs2[4:0];
+ * ExtractW = Rs1[31+LSBloc:LSBloc];
+ * Rd = SE(ExtractW)
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_WEXT(long long a, unsigned int b)
+{
+    register unsigned long word; /* output operand %0 of wext */
+    __ASM volatile("wext %0, %1, %2" : "=r"(word) : "r"(a), "r"(b)); /* %1 = a (64-bit source), %2 = b */
+    return word;
+}
+/* ===== Inline Function End for 3.187. WEXT ===== */
+
+/* ===== Inline Function Start for 3.188.1. ZUNPKD810 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD810 (Unsigned Unpacking Bytes 1 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD810(unsigned long a)
+{
+    register unsigned long unpacked; /* output operand %0 of zunpkd810 */
+    __ASM volatile("zunpkd810 %0, %1" : "=r"(unpacked) : "r"(a)); /* %1 = a */
+    return unpacked;
+}
+/* ===== Inline Function End for 3.188.1. ZUNPKD810 ===== */
+
+/* ===== Inline Function Start for 3.188.2. ZUNPKD820 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD820 (Unsigned Unpacking Bytes 2 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD820(unsigned long a)
+{
+    register unsigned long unpacked; /* output operand %0 of zunpkd820 */
+    __ASM volatile("zunpkd820 %0, %1" : "=r"(unpacked) : "r"(a)); /* %1 = a */
+    return unpacked;
+}
+/* ===== Inline Function End for 3.188.2. ZUNPKD820 ===== */
+
+/* ===== Inline Function Start for 3.188.3. ZUNPKD830 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD830 (Unsigned Unpacking Bytes 3 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD830(unsigned long a)
+{
+    register unsigned long unpacked; /* output operand %0 of zunpkd830 */
+    __ASM volatile("zunpkd830 %0, %1" : "=r"(unpacked) : "r"(a)); /* %1 = a */
+    return unpacked;
+}
+/* ===== Inline Function End for 3.188.3. ZUNPKD830 ===== */
+
+/* ===== Inline Function Start for 3.188.4. ZUNPKD831 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD831 (Unsigned Unpacking Bytes 3 & 1)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD831(unsigned long a)
+{
+    register unsigned long unpacked; /* output operand %0 of zunpkd831 */
+    __ASM volatile("zunpkd831 %0, %1" : "=r"(unpacked) : "r"(a)); /* %1 = a */
+    return unpacked;
+}
+/* ===== Inline Function End for 3.188.4. ZUNPKD831 ===== */
+
+/* ===== Inline Function Start for 3.188.5. ZUNPKD832 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD832 (Unsigned Unpacking Bytes 3 & 2)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD832(unsigned long a)
+{
+    register unsigned long unpacked; /* output operand %0 of zunpkd832 */
+    __ASM volatile("zunpkd832 %0, %1" : "=r"(unpacked) : "r"(a)); /* %1 = a */
+    return unpacked;
+}
+/* ===== Inline Function End for 3.188.5. ZUNPKD832 ===== */
+
+#if (__RISCV_XLEN == 64) || defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
+
+/* ===== Inline Function Start for 4.1. ADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief ADD32 (SIMD 32-bit Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer elements in Rs1 with the 32-bit integer
+ * elements in Rs2, and then writes the 32-bit element results to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x] + Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ADD32(unsigned long a, unsigned long b)
+{
+    register unsigned long sums; /* output operand %0 of add32 */
+    __ASM volatile("add32 %0, %1, %2" : "=r"(sums) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return sums;
+}
+/* ===== Inline Function End for 4.1. ADD32 ===== */
+
+/* ===== Inline Function Start for 4.2. CRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief CRAS32 (SIMD 32-bit Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit
+ * chunk simultaneously. Operands are from crossed 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
+ * integer element in [31:0] of Rs2, and writes the result to [63:32] of Rd; at the same time, it subtracts
+ * the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer element in [31:0] of Rs1, and
+ * writes the result to [31:0] of Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = Rs1.W[1] + Rs2.W[0];
+ * Rd.W[0] = Rs1.W[0] - Rs2.W[1];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CRAS32(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* output operand %0 of cras32 */
+    __ASM volatile("cras32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return packed;
+}
+/* ===== Inline Function End for 4.2. CRAS32 ===== */
+
+/* ===== Inline Function Start for 4.3. CRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief CRSA32 (SIMD 32-bit Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit
+ * chunk simultaneously. Operands are from crossed 32-bit elements.
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element
+ * in [63:32] of Rs1, and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit integer
+ * element in [31:0] of Rs1 with the 32-bit integer element in [63:32] of Rs2, and writes the result to
+ * [31:0] of Rd
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = Rs1.W[1] - Rs2.W[0];
+ * Rd.W[0] = Rs1.W[0] + Rs2.W[1];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CRSA32(unsigned long a, unsigned long b)
+{
+    register unsigned long packed; /* output operand %0 of crsa32 */
+    __ASM volatile("crsa32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b)); /* %1 = a, %2 = b */
+    return packed;
+}
+/* ===== Inline Function End for 4.3. CRSA32 ===== */
+
+/* ===== Inline Function Start for 4.4. KABS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief KABS32 (Scalar 32-bit Absolute Value with Saturation)
+ * \details
+ * **Type**: DSP (RV64 Only)
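+ *
+ * **Format**:\n
+ * ~~~
+ * Bits     Field     Value
+ * -----    ------    -------
+ * 24:20    KABS32    10010
+ * 19:15    Rs1
+ * 14:12    -         000
+ * 11:7     Rd
+ * 6:0      GE80B     1111111
+ * (bits above 24 are not listed in this table)
+ * ~~~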
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KABS32 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of signed 32-bit integer elements in a general register.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of signed 32-bit integer elements stored
+ * in Rs1. The results are written to Rd. This instruction with the minimum negative integer input of
+ * 0x80000000 will produce a saturated output of maximum positive integer of 0x7fffffff and the OV
+ * flag will be set to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[x] >= 0) {
+ *   res[x] = Rs1.W[x];
+ * } else {
+ *   If (Rs1.W[x] == 0x80000000) {
+ *     res[x] = 0x7fffffff;
+ *     OV = 1;
+ *   } else {
+ *     res[x] = -Rs1.W[x];
+ *   }
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KABS32(unsigned long a)
+{
+    unsigned long rd; /* value the kabs32 instruction writes to Rd */
+    __ASM volatile("kabs32 %[rd], %[rs1]" : [rd] "=r"(rd) : [rs1] "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 4.4. KABS32 ===== */
+
+/* ===== Inline Function Start for 4.5. KADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KADD32 (SIMD 32-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1),
+ * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.W[x] + Rs2.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KADD32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the kadd32 instruction writes to Rd */
+    __ASM volatile("kadd32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.5. KADD32 ===== */
+
+/* ===== Inline Function Start for 4.6. KCRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KCRAS32 (SIMD 32-bit Signed Saturating Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KCRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating addition and 32-bit signed integer element
+ * saturating subtraction in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
+ * integer element in [31:0] of Rs2; at the same time, it subtracts the 32-bit integer element in [63:32] of
+ * Rs2 from the 32-bit integer element in [31:0] of Rs1. If any of the results are beyond the Q31 number
+ * range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[1] = Rs1.W[1] + Rs2.W[0];
+ * res[0] = Rs1.W[0] - Rs2.W[1];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res[1];
+ * Rd.W[0] = res[0];
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KCRAS32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the kcras32 instruction writes to Rd */
+    __ASM volatile("kcras32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.6. KCRAS32 ===== */
+
+/* ===== Inline Function Start for 4.7. KCRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KCRSA32 (SIMD 32-bit Signed Saturating Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KCRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element
+ * saturating addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements.
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element
+ * in [63:32] of Rs1; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with the 32-bit
+ * integer element in [63:32] of Rs2. If any of the results are beyond the Q31 number range (-2^31 <= Q31
+ * <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
+ * [63:32] of Rd for subtraction and [31:0] of Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[1] = Rs1.W[1] - Rs2.W[0];
+ * res[0] = Rs1.W[0] + Rs2.W[1];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res[1];
+ * Rd.W[0] = res[0];
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KCRSA32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the kcrsa32 instruction writes to Rd */
+    __ASM volatile("kcrsa32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.7. KCRSA32 ===== */
+
+/* ===== Inline Function Start for 4.8.1. KDMBB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMBB16 (SIMD Signed Saturating Double Multiply B16 x B16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
+ * in the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
+ * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
+ * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
+ * and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resQ31[z];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMBB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the kdmbb16 instruction writes to Rd */
+    __ASM volatile("kdmbb16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.8.1. KDMBB16 ===== */
+
+/* ===== Inline Function Start for 4.8.2. KDMBT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMBT16 (SIMD Signed Saturating Double Multiply B16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
+ * in the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
+ * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
+ * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
+ * and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resQ31[z];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMBT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the kdmbt16 instruction writes to Rd */
+    __ASM volatile("kdmbt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.8.2. KDMBT16 ===== */
+
+/* ===== Inline Function Start for 4.8.3. KDMTT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMTT16 (SIMD Signed Saturating Double Multiply T16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
+ * in the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
+ * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
+ * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
+ * and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resQ31[z];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMTT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the kdmtt16 instruction writes to Rd */
+    __ASM volatile("kdmtt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.8.3. KDMTT16 ===== */
+
+/* ===== Inline Function Start for 4.9.1. KDMABB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMABB16 (SIMD Signed Saturating Double Multiply Addition B16 x B16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
+ * the values of the corresponding 32-bit chunks from the destination register and write the saturated
+ * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
+ * happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
+ * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
+ * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
+ * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
+ * are written back to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd[z] = Rd.W[z] + resQ31[z];
+ * if (resadd[z] > (2^31)-1) {
+ *   resadd[z] = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd[z] < -2^31) {
+ *   resadd[z] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resadd[z];
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMABB16(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kdmabb16 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r" operand: t is read as the Rd input and now holds the instruction's result */
+}
+/* ===== Inline Function End for 4.9.1. KDMABB16 ===== */
+
+/* ===== Inline Function Start for 4.9.2. KDMABT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMABT16 (SIMD Signed Saturating Double Multiply Addition B16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
+ * the values of the corresponding 32-bit chunks from the destination register and write the saturated
+ * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
+ * happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
+ * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
+ * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
+ * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
+ * are written back to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd[z] = Rd.W[z] + resQ31[z];
+ * if (resadd[z] > (2^31)-1) {
+ *   resadd[z] = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd[z] < -2^31) {
+ *   resadd[z] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resadd[z];
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMABT16(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kdmabt16 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r" operand: t is read as the Rd input and now holds the instruction's result */
+}
+/* ===== Inline Function End for 4.9.2. KDMABT16 ===== */
+
+/* ===== Inline Function Start for 4.9.3. KDMATT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMATT16 (SIMD Signed Saturating Double Multiply Addition T16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
+ * the values of the corresponding 32-bit chunks from the destination register and write the saturated
+ * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
+ * happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
+ * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
+ * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
+ * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
+ * are written back to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd[z] = Rd.W[z] + resQ31[z];
+ * if (resadd[z] > (2^31)-1) {
+ *   resadd[z] = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd[z] < -2^31) {
+ *   resadd[z] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resadd[z];
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMATT16(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kdmatt16 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r" operand: t is read as the Rd input and now holds the instruction's result */
+}
+/* ===== Inline Function End for 4.9.3. KDMATT16 ===== */
+
+/* ===== Inline Function Start for 4.10.1. KHMBB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KHMBB16 (SIMD Signed Saturating Half Multiply B16 x B16)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
+ * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted 15-
+ * bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop = Rs1.H[x]; bop = Rs2.H[y];
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = SE32(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMBB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the khmbb16 instruction writes to Rd */
+    __ASM volatile("khmbb16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.10.1. KHMBB16 ===== */
+
+/* ===== Inline Function Start for 4.10.2. KHMBT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KHMBT16 (SIMD Signed Saturating Half Multiply B16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
+ * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted 15-
+ * bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop = Rs1.H[x]; bop = Rs2.H[y];
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = SE32(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMBT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the khmbt16 instruction writes to Rd */
+    __ASM volatile("khmbt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.10.2. KHMBT16 ===== */
+
+/* ===== Inline Function Start for 4.10.3. KHMTT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KHMTT16 (SIMD Signed Saturating Half Multiply T16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
+ * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted 15-
+ * bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop = Rs1.H[x]; bop = Rs2.H[y];
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = SE32(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMTT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* value the khmtt16 instruction writes to Rd */
+    __ASM volatile("khmtt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.10.3. KHMTT16 ===== */
+
+/* ===== Inline Function Start for 4.11.1. KMABB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
+ * \brief KMABB32 (Saturating Signed Multiply Bottom Words & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB32 Rd, Rs1, Rs2
+ * KMABT32 Rd, Rs1, Rs2
+ * KMATT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element in a register with the 32-bit element in another register
+ * and add the result to the content of 64-bit data in the third register. The addition result may be
+ * saturated and is written to the third register.
+ * * KMABB32: rd + bottom*bottom
+ * * KMABT32: rd + bottom*top
+ * * KMATT32: rd + top*top
+ *
+ * **Description**:\n
+ * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2.
+ * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
+ * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
+ * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
+ *  res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
+ *  res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
+ *  if (res > (2^63)-1) {
+ *    res = (2^63)-1;
+ *    OV = 1;
+ *  } else if (res < -2^63) {
+ *    res = -2^63;
+ *    OV = 1;
+ *  }
+ *  Rd = res;
+ * *Exceptions:* None
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMABB32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmabb32 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r" operand: t is read as the Rd input and now holds the instruction's result */
+}
+/* ===== Inline Function End for 4.11.1. KMABB32 ===== */
+
+/* ===== Inline Function Start for 4.11.2. KMABT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
+ * \brief KMABT32 (Saturating Signed Multiply Bottom & Top Words & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB32 Rd, Rs1, Rs2
+ * KMABT32 Rd, Rs1, Rs2
+ * KMATT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element in a register with the 32-bit element in another register
+ * and add the result to the content of 64-bit data in the third register. The addition result may be
+ * saturated and is written to the third register.
+ * * KMABB32: rd + bottom*bottom
+ * * KMABT32: rd + bottom*top
+ * * KMATT32: rd + top*top
+ *
+ * **Description**:\n
+ * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2.
+ * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
+ * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
+ * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
+ *  res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
+ *  res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
+ *  if (res > (2^63)-1) {
+ *    res = (2^63)-1;
+ *    OV = 1;
+ *  } else if (res < -2^63) {
+ *    res = -2^63;
+ *    OV = 1;
+ *  }
+ *  Rd = res;
+ * *Exceptions:* None
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMABT32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmabt32 %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* "+r" operand: t is read as the Rd input and now holds the instruction's result */
+}
+/* ===== Inline Function End for 4.11.2. KMABT32 ===== */
+
+/* ===== Inline Function Start for 4.11.3. KMATT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
+ * \brief KMATT32 (Saturating Signed Multiply Top Words & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB32 Rd, Rs1, Rs2
+ * KMABT32 Rd, Rs1, Rs2
+ * KMATT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element in a register with the 32-bit element in another register
+ * and add the result to the content of 64-bit data in the third register. The addition result may be
+ * saturated and is written to the third register.
+ * * KMABB32: rd + bottom*bottom
+ * * KMABT32: rd + bottom*top
+ * * KMATT32: rd + top*top
+ *
+ * **Description**:\n
+ * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2.
+ * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
+ * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
+ * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
+ * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ * *Exceptions:* None
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMATT32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmatt32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: t + a.W[1] * b.W[1], saturated to Q63 */
+}
+/* ===== Inline Function End for 4.11.3. KMATT32 ===== */
+
+/* ===== Inline Function Start for 4.12.1. KMADA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMADA32 (Saturating Signed Multiply Two Words and Two Adds)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADA32 Rd, Rs1, Rs2
+ * KMAXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit data in two registers; and then adds the
+ * two 64-bit results and 64-bit data in a third register together. The addition result may be saturated.
+ * * KMADA32: rd + top*top + bottom*bottom
+ * * KMAXDA32: rd + top*bottom + bottom*top
+ *
+ * **Description**:\n
+ * For the `KMADA32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-
+ * bit element in Rs2 and then adds the result to the result of multiplying the top 32-bit element in Rs1
+ * with the top 32-bit element in Rs2. It is actually an alias of the `KMAR64` instruction.
+ * For the `KMAXDA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1
+ * with the top 32-bit element in Rs2.
+ * The result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63
+ * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The 64-bit
+ * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMADA32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMAXDA32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADA32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmada32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: t + a.W[1]*b.W[1] + a.W[0]*b.W[0], saturated to Q63 */
+}
+/* ===== Inline Function End for 4.12.1. KMADA32 ===== */
+
+/* ===== Inline Function Start for 4.12.2. KMAXDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMAXDA32 (Saturating Signed Crossed Multiply Two Words and Two Adds)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADA32 Rd, Rs1, Rs2
+ * KMAXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit data in two registers; and then adds the
+ * two 64-bit results and 64-bit data in a third register together. The addition result may be saturated.
+ * * KMADA32: rd + top*top + bottom*bottom
+ * * KMAXDA32: rd + top*bottom + bottom*top
+ *
+ * **Description**:\n
+ * For the `KMADA32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-
+ * bit element in Rs2 and then adds the result to the result of multiplying the top 32-bit element in Rs1
+ * with the top 32-bit element in Rs2. It is actually an alias of the `KMAR64` instruction.
+ * For the `KMAXDA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1
+ * with the top 32-bit element in Rs2.
+ * The result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63
+ * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The 64-bit
+ * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMADA32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMAXDA32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMAXDA32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmaxda32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: t + a.W[1]*b.W[0] + a.W[0]*b.W[1], saturated to Q63 */
+}
+/* ===== Inline Function End for 4.12.2. KMAXDA32 ===== */
+
+/* ===== Inline Function Start for 4.13.1. KMDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMDA32 (Signed Multiply Two Words and Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMDA32 Rd, Rs1, Rs2
+ * KMXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
+ * adds the two 64-bit results together. The addition result may be saturated.
+ * * KMDA32: top*top + bottom*bottom
+ * * KMXDA32: top*bottom + bottom*top
+ *
+ * **Description**:\n
+ * For the `KMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
+ * with the top 32-bit element of Rs2.
+ * For the `KMXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
+ * with the bottom 32-bit element of Rs2.
+ * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^63-1.
+ * The final result is written to Rd. The 32-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1 != 0x8000000080000000) or (Rs2 != 0x8000000080000000)) {
+ *   Rd = (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMDA32
+ *   Rd = (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMXDA32
+ * } else {
+ *   Rd = 0x7fffffffffffffff;
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMDA32(unsigned long a, unsigned long b)
+{
+    long rd; /* write-only output operand: a.W[1]*b.W[1] + a.W[0]*b.W[0], saturated to 2^63-1 */
+    __ASM volatile("kmda32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.13.1. KMDA32 ===== */
+
+/* ===== Inline Function Start for 4.13.2. KMXDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMXDA32 (Signed Crossed Multiply Two Words and Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMDA32 Rd, Rs1, Rs2
+ * KMXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
+ * adds the two 64-bit results together. The addition result may be saturated.
+ * * KMDA32: top*top + bottom*bottom
+ * * KMXDA32: top*bottom + bottom*top
+ *
+ * **Description**:\n
+ * For the `KMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
+ * with the top 32-bit element of Rs2.
+ * For the `KMXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
+ * with the bottom 32-bit element of Rs2.
+ * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^63-1.
+ * The final result is written to Rd. The 32-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1 != 0x8000000080000000) or (Rs2 != 0x8000000080000000)) {
+ *   Rd = (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMDA32
+ *   Rd = (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMXDA32
+ * } else {
+ *   Rd = 0x7fffffffffffffff;
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMXDA32(unsigned long a, unsigned long b)
+{
+    long rd; /* write-only output operand: a.W[1]*b.W[0] + a.W[0]*b.W[1], saturated to 2^63-1 */
+    __ASM volatile("kmxda32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.13.2. KMXDA32 ===== */
+
+/* ===== Inline Function Start for 4.14.1. KMADS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMADS32 (Saturating Signed Multiply Two Words & Subtract & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS32 Rd, Rs1, Rs2
+ * KMADRS32 Rd, Rs1, Rs2
+ * KMAXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
+ * 64-bit data in a third register. The addition result may be saturated.
+ * * KMADS32: rd + (top*top - bottom*bottom)
+ * * KMADRS32: rd + (bottom*bottom - top*top)
+ * * KMAXDS32: rd + (top*bottom - bottom*top)
+ *
+ * **Description**:\n
+ * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the top 32-bit element in Rs2.
+ * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element in Rs1 with the bottom 32-bit element in Rs2.
+ * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the bottom 32-bit element in Rs2.
+ * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
+ * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
+ * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADS32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmads32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: t + a.W[1]*b.W[1] - a.W[0]*b.W[0], saturated to Q63 */
+}
+/* ===== Inline Function End for 4.14.1. KMADS32 ===== */
+
+/* ===== Inline Function Start for 4.14.2. KMADRS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMADRS32 (Saturating Signed Multiply Two Words & Reverse Subtract & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS32 Rd, Rs1, Rs2
+ * KMADRS32 Rd, Rs1, Rs2
+ * KMAXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
+ * 64-bit data in a third register. The addition result may be saturated.
+ * * KMADS32: rd + (top*top - bottom*bottom)
+ * * KMADRS32: rd + (bottom*bottom - top*top)
+ * * KMAXDS32: rd + (top*bottom - bottom*top)
+ *
+ * **Description**:\n
+ * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the top 32-bit element in Rs2.
+ * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element in Rs1 with the bottom 32-bit element in Rs2.
+ * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the bottom 32-bit element in Rs2.
+ * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
+ * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
+ * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADRS32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmadrs32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: t + a.W[0]*b.W[0] - a.W[1]*b.W[1], saturated to Q63 */
+}
+/* ===== Inline Function End for 4.14.2. KMADRS32 ===== */
+
+/* ===== Inline Function Start for 4.14.3. KMAXDS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMAXDS32 (Saturating Signed Crossed Multiply Two Words & Subtract & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS32 Rd, Rs1, Rs2
+ * KMADRS32 Rd, Rs1, Rs2
+ * KMAXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
+ * 64-bit data in a third register. The addition result may be saturated.
+ * * KMADS32: rd + (top*top - bottom*bottom)
+ * * KMADRS32: rd + (bottom*bottom - top*top)
+ * * KMAXDS32: rd + (top*bottom - bottom*top)
+ *
+ * **Description**:\n
+ * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the top 32-bit element in Rs2.
+ * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element in Rs1 with the bottom 32-bit element in Rs2.
+ * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the bottom 32-bit element in Rs2.
+ * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
+ * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
+ * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMAXDS32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmaxds32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: t + a.W[1]*b.W[0] - a.W[0]*b.W[1], saturated to Q63 */
+}
+/* ===== Inline Function End for 4.14.3. KMAXDS32 ===== */
+
+/* ===== Inline Function Start for 4.15.1. KMSDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMSDA32 (Saturating Signed Multiply Two Words & Add & Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSDA32 Rd, Rs1, Rs2
+ * KMSXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
+ * subtracts the two 64-bit results from a third register. The subtraction result may be saturated.
+ * * KMSDA32: rd - top*top - bottom*bottom
+ * * KMSXDA32: rd - top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `KMSDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2.
+ * For the `KMSXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2.
+ * The two 64-bit multiplication results are then subtracted from the content of Rd. If the subtraction
+ * result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit
+ * is set to 1. The result after saturation is written to Rd. The 32-bit contents are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd - (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMSDA32
+ * res = Rd - (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMSXDA32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMSDA32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmsda32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: t - a.W[1]*b.W[1] - a.W[0]*b.W[0], saturated to Q63 */
+}
+/* ===== Inline Function End for 4.15.1. KMSDA32 ===== */
+
+/* ===== Inline Function Start for 4.15.2. KMSXDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMSXDA32 (Saturating Signed Crossed Multiply Two Words & Add & Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSDA32 Rd, Rs1, Rs2
+ * KMSXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
+ * subtracts the two 64-bit results from a third register. The subtraction result may be saturated.
+ * * KMSDA32: rd - top*top - bottom*bottom
+ * * KMSXDA32: rd - top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `KMSDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2.
+ * For the `KMSXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2.
+ * The two 64-bit multiplication results are then subtracted from the content of Rd. If the subtraction
+ * result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit
+ * is set to 1. The result after saturation is written to Rd. The 32-bit contents are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd - (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMSDA32
+ * res = Rd - (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMSXDA32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMSXDA32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmsxda32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: t - a.W[1]*b.W[0] - a.W[0]*b.W[1], saturated to Q63 */
+}
+/* ===== Inline Function End for 4.15.2. KMSXDA32 ===== */
+
+/* ===== Inline Function Start for 4.16. KSLL32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief KSLL32 (SIMD 32-bit Saturating Shift Left Logical)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLL32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is a variable from a GPR.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the low-order 5-bits of the value in the Rs2 register.
+ * Any shifted value greater than 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is
+ * saturated to -2^31. And the saturated results are written to Rd. If any saturation is performed, set OV
+ * bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa != 0) {
+ *   res[(31+sa):0] = Rs1.W[x] << sa;
+ *   if (res > (2^31)-1) {
+ *     res = 0x7fffffff; OV = 1;
+ *   } else if (res < -2^31) {
+ *     res = 0x80000000; OV = 1;
+ *   }
+ *   Rd.W[x] = res[31:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLL32(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* write-only output operand: each 32-bit lane of a shifted left by b[4:0], saturated to Q31 */
+    __ASM volatile("ksll32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.16. KSLL32 ===== */
+
+/* ===== Inline Function Start for 4.17. KSLLI32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief KSLLI32 (SIMD 32-bit Saturating Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLI32 Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is an immediate value.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the imm5u constant. Any shifted value greater than
+ * 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated to -2^31. And the saturated
+ * results are written to Rd. If any saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa != 0) {
+ *   res[(31+sa):0] = Rs1.W[x] << sa;
+ *   if (res > (2^31)-1) {
+ *     res = 0x7fffffff; OV = 1;
+ *   } else if (res < -2^31) {
+ *     res = 0x80000000; OV = 1;
+ *   }
+ *   Rd.W[x] = res[31:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLLI32(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* write-only output operand: each 32-bit lane of a shifted left by b, saturated to Q31 */
+    __ASM volatile("kslli32 %[rd], %[rs1], %[imm]" : [rd] "=r"(rd) : [rs1] "r"(a), [imm] "K"(b));
+    return rd; /* KSLLI32 encodes the shift as imm5u, hence "K" (not "r"): b must be a constant in 0..31 */
+}
+/* ===== Inline Function End for 4.17. KSLLI32 ===== */
+
+/* ===== Inline Function Start for 4.18.1. KSLRA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief KSLRA32 (SIMD 32-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA32 Rd, Rs1, Rs2
+ * KSLRA32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q31 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
+ * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[5:0]. However, the behavior of `Rs2[5:0]==-2^5 (0x20)` is defined to be
+ * equivalent to the behavior of `Rs2[5:0]==-(2^5-1) (0x21)`.
+ * The left-shifted results are saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. For the `.u`
+ * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[5:0] < 0) {
+ *   sa = -Rs2[5:0];
+ *   sa = (sa == 32)? 31 : sa;
+ *   if (`.u` form) {
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else {
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[4:0];
+ *   res[(31+sa):0] = Rs1.W[x] <<(logic) sa;
+ *   if (res > (2^31)-1) {
+ *     res[31:0] = 0x7fffffff; OV = 1;
+ *   } else if (res < -2^31) {
+ *     res[31:0] = 0x80000000; OV = 1;
+ *   }
+ *   Rd.W[x] = res[31:0];
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA32(unsigned long a, int b)
+{
+    unsigned long rd; /* write-only output operand: per 32-bit lane, b[5:0] >= 0 shifts left (Q31-saturated), < 0 shifts right arithmetically */
+    __ASM volatile("kslra32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.18.1. KSLRA32 ===== */
+
+/* ===== Inline Function Start for 4.18.2. KSLRA32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief KSLRA32.u (SIMD 32-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA32 Rd, Rs1, Rs2
+ * KSLRA32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q31 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
+ * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[5:0]. However, the behavior of `Rs2[5:0]==-2^5 (0x20)` is defined to be
+ * equivalent to the behavior of `Rs2[5:0]==-(2^5-1) (0x21)`.
+ * The left-shifted results are saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. For the `.u`
+ * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[5:0] < 0) {
+ *   sa = -Rs2[5:0];
+ *   sa = (sa == 32)? 31 : sa;
+ *   if (`.u` form) {
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else {
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[4:0];
+ *   res[(31+sa):0] = Rs1.W[x] <<(logic) sa;
+ *   if (res > (2^31)-1) {
+ *     res[31:0] = 0x7fffffff; OV = 1;
+ *   } else if (res < -2^31) {
+ *     res[31:0] = 0x80000000; OV = 1;
+ *   }
+ *   Rd.W[x] = res[31:0];
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA32_U(unsigned long a, int b)
+{
+    unsigned long rd; /* write-only output operand: as KSLRA32, but the `.u` form rounds the right-shifted lanes */
+    __ASM volatile("kslra32.u %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.18.2. KSLRA32.u ===== */
+
+/* ===== Inline Function Start for 4.19. KSTAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KSTAS32 (SIMD 32-bit Signed Saturating Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSTAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating addition and 32-bit signed integer element
+ * saturating subtraction in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit
+ * elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
+ * integer element in [63:32] of Rs2; at the same time, it subtracts the 32-bit integer element in [31:0] of
+ * Rs2 from the 32-bit integer element in [31:0] of Rs1. If any of the results are beyond the Q31 number
+ * range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[1] = Rs1.W[1] + Rs2.W[1];
+ * res[0] = Rs1.W[0] - Rs2.W[0];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res[1];
+ * Rd.W[0] = res[0];
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSTAS32(unsigned long a, unsigned long b)
+{
+    /* KSTAS32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; Rd is returned. */
+    register unsigned long rd;
+    __ASM volatile("kstas32 %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.19. KSTAS32 ===== */
+
+/* ===== Inline Function Start for 4.20. KSTSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KSTSA32 (SIMD 32-bit Signed Saturating Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSTSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element
+ * saturating addition in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit
+ * elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer
+ * element in [63:32] of Rs1; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with
+ * the 32-bit integer element in [31:0] of Rs2. If any of the results are beyond the Q31 number range
+ * (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
+ * written to [63:32] of Rd for subtraction and [31:0] of Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[1] = Rs1.W[1] - Rs2.W[1];
+ * res[0] = Rs1.W[0] + Rs2.W[0];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res[1];
+ * Rd.W[0] = res[0];
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSTSA32(unsigned long a, unsigned long b)
+{
+    /* KSTSA32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; Rd is returned. */
+    register unsigned long rd;
+    __ASM volatile("kstsa32 %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.20. KSTSA32 ===== */
+
+/* ===== Inline Function Start for 4.21. KSUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KSUB32 (SIMD 32-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q31 number range (-2^31 <= Q31 <=
+ * 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.W[x] - Rs2.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSUB32(unsigned long a, unsigned long b)
+{
+    /* KSUB32 Rd, Rs1, Rs2: a supplies the minuend words, b the subtrahend words. */
+    register unsigned long diff;
+    __ASM volatile("ksub32 %0, %1, %2"
+                   : "=r"(diff)
+                   : "r"(a), "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 4.21. KSUB32 ===== */
+
+/* ===== Inline Function Start for 4.22.1. PKBB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
+ * \brief PKBB32 (Pack Two 32-bit Data from Both Bottom Half)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB32 Rd, Rs1, Rs2
+ * PKBT32 Rd, Rs1, Rs2
+ * PKTT32 Rd, Rs1, Rs2
+ * PKTB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 32-bit data from 64-bit chunks in two registers.
+ * * PKBB32: bottom.bottom
+ * * PKBT32: bottom.top
+ * * PKTT32: top.top
+ * * PKTB32: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKBB32(unsigned long a, unsigned long b)
+{
+    /* PKBB32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the packed Rd is returned. */
+    register unsigned long packed;
+    __ASM volatile("pkbb32 %0, %1, %2"
+                   : "=r"(packed)
+                   : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.22.1. PKBB32 ===== */
+
+/* ===== Inline Function Start for 4.22.2. PKBT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
+ * \brief PKBT32 (Pack Two 32-bit Data from Bottom and Top Half)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB32 Rd, Rs1, Rs2
+ * PKBT32 Rd, Rs1, Rs2
+ * PKTT32 Rd, Rs1, Rs2
+ * PKTB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 32-bit data from 64-bit chunks in two registers.
+ * * PKBB32: bottom.bottom
+ * * PKBT32: bottom.top
+ * * PKTT32: top.top
+ * * PKTB32: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKBT32(unsigned long a, unsigned long b)
+{
+    /* PKBT32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the packed Rd is returned. */
+    register unsigned long packed;
+    __ASM volatile("pkbt32 %0, %1, %2"
+                   : "=r"(packed)
+                   : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.22.2. PKBT32 ===== */
+
+/* ===== Inline Function Start for 4.22.3. PKTT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
+ * \brief PKTT32 (Pack Two 32-bit Data from Both Top Half)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB32 Rd, Rs1, Rs2
+ * PKBT32 Rd, Rs1, Rs2
+ * PKTT32 Rd, Rs1, Rs2
+ * PKTB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 32-bit data from 64-bit chunks in two registers.
+ * * PKBB32: bottom.bottom
+ * * PKBT32: bottom.top
+ * * PKTT32: top.top
+ * * PKTB32: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKTT32(unsigned long a, unsigned long b)
+{
+    /* PKTT32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the packed Rd is returned. */
+    register unsigned long packed;
+    __ASM volatile("pktt32 %0, %1, %2"
+                   : "=r"(packed)
+                   : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.22.3. PKTT32 ===== */
+
+/* ===== Inline Function Start for 4.22.4. PKTB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
+ * \brief PKTB32 (Pack Two 32-bit Data from Top and Bottom Half)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB32 Rd, Rs1, Rs2
+ * PKBT32 Rd, Rs1, Rs2
+ * PKTT32 Rd, Rs1, Rs2
+ * PKTB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 32-bit data from 64-bit chunks in two registers.
+ * * PKBB32: bottom.bottom
+ * * PKBT32: bottom.top
+ * * PKTT32: top.top
+ * * PKTB32: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKTB32(unsigned long a, unsigned long b)
+{
+    /* PKTB32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the packed Rd is returned. */
+    register unsigned long packed;
+    __ASM volatile("pktb32 %0, %1, %2"
+                   : "=r"(packed)
+                   : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.22.4. PKTB32 ===== */
+
+/* ===== Inline Function Start for 4.23. RADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RADD32 (SIMD 32-bit Signed Halving Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element additions simultaneously. The results are halved to avoid
+ * overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed
+ * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
+ * Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x80000000 Rd = 0x80000000
+ * * Rs1 = 0x40000000, Rs2 = 0x80000000 Rd = 0xE0000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] + Rs2.W[x]) s>> 1;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RADD32(unsigned long a, unsigned long b)
+{
+    /* RADD32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the halved sums in Rd are returned. */
+    register unsigned long halved;
+    __ASM volatile("radd32 %0, %1, %2"
+                   : "=r"(halved)
+                   : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 4.23. RADD32 ===== */
+
+/* ===== Inline Function Start for 4.24. RCRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RCRAS32 (SIMD 32-bit Signed Halving Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RCRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in
+ * a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit
+ * signed integer element in [31:0] of Rs2, and subtracts the 32-bit signed integer element in [63:32] of
+ * Rs2 from the 32-bit signed integer element in [31:0] of Rs1. The element results are first
+ * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd
+ * for subtraction.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD32` and `RSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) s>> 1;
+ * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RCRAS32(unsigned long a, unsigned long b)
+{
+    /* RCRAS32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; Rd is returned. */
+    register unsigned long halved;
+    __ASM volatile("rcras32 %0, %1, %2"
+                   : "=r"(halved)
+                   : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 4.24. RCRAS32 ===== */
+
+/* ===== Inline Function Start for 4.25. RCRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RCRSA32 (SIMD 32-bit Signed Halving Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RCRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in
+ * a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit signed integer element in [31:0] of Rs2 from the
+ * 32-bit signed integer element in [63:32] of Rs1, and adds the 32-bit signed element integer in [31:0]
+ * of Rs1 with the 32-bit signed integer element in [63:32] of Rs2. The two results are first
+ * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of
+ * Rd for addition.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD32` and `RSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) s>> 1;
+ * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RCRSA32(unsigned long a, unsigned long b)
+{
+    /* RCRSA32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; Rd is returned. */
+    register unsigned long halved;
+    __ASM volatile("rcrsa32 %0, %1, %2"
+                   : "=r"(halved)
+                   : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 4.25. RCRSA32 ===== */
+
+/* ===== Inline Function Start for 4.26. RSTAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RSTAS32 (SIMD 32-bit Signed Halving Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSTAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in
+ * a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The results are
+ * halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit
+ * signed integer element in [63:32] of Rs2, and subtracts the 32-bit signed integer element in [31:0] of
+ * Rs2 from the 32-bit signed integer element in [31:0] of Rs1. The element results are first
+ * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd
+ * for subtraction.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD32` and `RSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] + Rs2.W[1]) s>> 1;
+ * Rd.W[0] = (Rs1.W[0] - Rs2.W[0]) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSTAS32(unsigned long a, unsigned long b)
+{
+    /* RSTAS32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; Rd is returned. */
+    register unsigned long halved;
+    __ASM volatile("rstas32 %0, %1, %2"
+                   : "=r"(halved)
+                   : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 4.26. RSTAS32 ===== */
+
+/* ===== Inline Function Start for 4.27. RSTSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RSTSA32 (SIMD 32-bit Signed Halving Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSTSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in
+ * a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The results are
+ * halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit signed integer element in [63:32] of Rs2 from the
+ * 32-bit signed integer element in [63:32] of Rs1, and adds the 32-bit signed element integer in [31:0]
+ * of Rs1 with the 32-bit signed integer element in [31:0] of Rs2. The two results are first arithmetically
+ * right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for addition.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD32` and `RSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] - Rs2.W[1]) s>> 1;
+ * Rd.W[0] = (Rs1.W[0] + Rs2.W[0]) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSTSA32(unsigned long a, unsigned long b)
+{
+    /* RSTSA32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; Rd is returned. */
+    register unsigned long halved;
+    __ASM volatile("rstsa32 %0, %1, %2"
+                   : "=r"(halved)
+                   : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 4.27. RSTSA32 ===== */
+
+/* ===== Inline Function Start for 4.28. RSUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RSUB32 (SIMD 32-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit
+ * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x80000000 Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x7FFFFFFF Rd = 0x80000000
+ * * Rs1 = 0x80000000, Rs2 = 0x40000000 Rd = 0xA0000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) s>> 1;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSUB32(unsigned long a, unsigned long b)
+{
+    /* RSUB32 Rd, Rs1, Rs2: a supplies the minuend words, b the subtrahend words. */
+    register unsigned long halved;
+    __ASM volatile("rsub32 %0, %1, %2"
+                   : "=r"(halved)
+                   : "r"(a), "r"(b));
+    return halved;
+}
+/* ===== Inline Function End for 4.28. RSUB32 ===== */
+
+/* ===== Inline Function Start for 4.29. SLL32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SLL32 (SIMD 32-bit Shift Left Logical)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLL32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left shift operations simultaneously. The shift amount is a
+ * variable from a GPR.
+ *
+ * **Description**:\n
+ * The 32-bit elements in Rs1 are left-shifted logically. And the results are written to Rd.
+ * The shifted out bits are filled with zero and the shift amount is specified by the low-order 5-bits of
+ * the value in the Rs2 register.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * Rd.W[x] = Rs1.W[x] << sa;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SLL32(unsigned long a, unsigned int b)
+{
+    /* SLL32 Rd, Rs1, Rs2: a holds the words to shift, b is passed in a register as the shift amount. */
+    register unsigned long shifted;
+    __ASM volatile("sll32 %0, %1, %2"
+                   : "=r"(shifted)
+                   : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 4.29. SLL32 ===== */
+
+/* ===== Inline Function Start for 4.30. SLLI32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SLLI32 (SIMD 32-bit Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLLI32 Rd, Rs1, imm5u[4:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element logical left shift operations simultaneously. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * The 32-bit elements in Rs1 are left-shifted logically. The shifted out bits are filled with
+ * zero and the shift amount is specified by the imm5u[4:0] constant. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * Rd.W[x] = Rs1.W[x] << sa;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SLLI32(unsigned long a, unsigned int b)
+{
+    register unsigned long result;
+
+    /*
+     * SLLI32 takes its shift amount as the 5-bit immediate imm5u[4:0], so the
+     * operand has to reach the assembler as a literal. The "K" constraint
+     * (5-bit unsigned immediate) does that; an "r" constraint would print a
+     * register name in the immediate slot and the instruction would not assemble.
+     */
+    if (__builtin_constant_p(b) && (b < 32u)) {
+        __ASM volatile("slli32 %0, %1, %2" : "=r"(result) : "r"(a), "K"(b));
+    } else {
+        /*
+         * Shift amount is not a compile-time constant (or does not fit in
+         * 5 bits): use the register form SLL32, which reads sa from Rs2[4:0]
+         * and produces the same Rd.W[x] = Rs1.W[x] << sa result.
+         */
+        __ASM volatile("sll32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    }
+    return result;
+}
+/* ===== Inline Function End for 4.30. SLLI32 ===== */
+
+/* ===== Inline Function Start for 4.31. SMAX32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief SMAX32 (SIMD 32-bit Signed Maximum)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAX32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 32-bit signed integer elements in Rs1 with the 32-bit
+ * signed integer elements in Rs2 and selects the greater number of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] > Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMAX32(unsigned long a, unsigned long b)
+{
+    /* SMAX32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the selected words in Rd are returned. */
+    register unsigned long maxima;
+    __ASM volatile("smax32 %0, %1, %2"
+                   : "=r"(maxima)
+                   : "r"(a), "r"(b));
+    return maxima;
+}
+/* ===== Inline Function End for 4.31. SMAX32 ===== */
+
+/* ===== Inline Function Start for 4.32.1. SMBB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
+ * \brief SMBB32 (Signed Multiply Bottom Word & Bottom Word)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB32 Rd, Rs1, Rs2
+ * SMBT32 Rd, Rs1, Rs2
+ * SMTT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
+ * register and write the 64-bit result to a third register.
+ * * SMBB32: bottom*bottom
+ * * SMBT32: bottom*top
+ * * SMTT32: top*top
+ *
+ * **Description**:\n
+ * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2. It is actually an alias of `MULSR64` instruction.
+ * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2.
+ * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
+ * of Rs2.
+ * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rs1.W[0] * Rs2.W[0]; // SMBB32
+ * res = Rs1.W[0] * Rs2.W[1]; // SMBT32
+ * res = Rs1.W[1] * Rs2.W[1]; // SMTT32
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMBB32(unsigned long a, unsigned long b)
+{
+    /* SMBB32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the signed product in Rd is returned. */
+    register long product;
+    __ASM volatile("smbb32 %0, %1, %2"
+                   : "=r"(product)
+                   : "r"(a), "r"(b));
+    return product;
+}
+/* ===== Inline Function End for 4.32.1. SMBB32 ===== */
+
+/* ===== Inline Function Start for 4.32.2. SMBT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
+ * \brief SMBT32 (Signed Multiply Bottom Word & Top Word)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB32 Rd, Rs1, Rs2
+ * SMBT32 Rd, Rs1, Rs2
+ * SMTT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
+ * register and write the 64-bit result to a third register.
+ * * SMBB32: bottom*bottom
+ * * SMBT32: bottom*top
+ * * SMTT32: top*top
+ *
+ * **Description**:\n
+ * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2. It is actually an alias of `MULSR64` instruction.
+ * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2.
+ * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
+ * of Rs2.
+ * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rs1.W[0] * Rs2.W[0]; // SMBB32
+ * res = Rs1.W[0] * Rs2.W[1]; // SMBT32
+ * res = Rs1.W[1] * Rs2.W[1]; // SMTT32
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMBT32(unsigned long a, unsigned long b)
+{
+    /* SMBT32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the signed product in Rd is returned. */
+    register long product;
+    __ASM volatile("smbt32 %0, %1, %2"
+                   : "=r"(product)
+                   : "r"(a), "r"(b));
+    return product;
+}
+/* ===== Inline Function End for 4.32.2. SMBT32 ===== */
+
+/* ===== Inline Function Start for 4.32.3. SMTT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
+ * \brief SMTT32 (Signed Multiply Top Word & Top Word)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB32 Rd, Rs1, Rs2
+ * SMBT32 Rd, Rs1, Rs2
+ * SMTT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
+ * register and write the 64-bit result to a third register.
+ * * SMBB32: bottom*bottom
+ * * SMBT32: bottom*top
+ * * SMTT32: top*top
+ *
+ * **Description**:\n
+ * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2. It is actually an alias of `MULSR64` instruction.
+ * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2.
+ * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
+ * of Rs2.
+ * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rs1.W[0] * Rs2.W[0]; // SMBB32
+ * res = Rs1.W[0] * Rs2.W[1]; // SMBT32
+ * res = Rs1.W[1] * Rs2.W[1]; // SMTT32
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMTT32(unsigned long a, unsigned long b)
+{
+    /* SMTT32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the signed product in Rd is returned. */
+    register long product;
+    __ASM volatile("smtt32 %0, %1, %2"
+                   : "=r"(product)
+                   : "r"(a), "r"(b));
+    return product;
+}
+/* ===== Inline Function End for 4.32.3. SMTT32 ===== */
+
+/* ===== Inline Function Start for 4.33.1. SMDS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief SMDS32 (Signed Multiply Two Words and Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS32 Rd, Rs1, Rs2
+ * SMDRS32 Rd, Rs1, Rs2
+ * SMXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 64-bit results.
+ * * SMDS32: top*top - bottom*bottom
+ * * SMDRS32: bottom*bottom - top*top
+ * * SMXDS32: top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the top 32-bit element of Rs2.
+ * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element of Rs1 with the bottom 32-bit element of Rs2.
+ * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the bottom 32-bit element of Rs2.
+ * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
+ * Rd = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
+ * Rd = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMDS32(unsigned long a, unsigned long b)
+{
+    /* SMDS32 Rd, Rs1, Rs2 with Rs1 = a and Rs2 = b; the signed difference in Rd is returned. */
+    register long difference;
+    __ASM volatile("smds32 %0, %1, %2"
+                   : "=r"(difference)
+                   : "r"(a), "r"(b));
+    return difference;
+}
+/* ===== Inline Function End for 4.33.1. SMDS32 ===== */
+
+/* ===== Inline Function Start for 4.33.2. SMDRS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief SMDRS32 (Signed Multiply Two Words and Reverse Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS32 Rd, Rs1, Rs2
+ * SMDRS32 Rd, Rs1, Rs2
+ * SMXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 64-bit results.
+ * * SMDS32: top*top - bottom*bottom
+ * * SMDRS32: bottom*bottom - top*top
+ * * SMXDS32: top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the top 32-bit element of Rs2.
+ * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element of Rs1 with the bottom 32-bit element of Rs2.
+ * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the bottom 32-bit element of Rs2.
+ * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rt = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
+ * Rt = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
+ * Rt = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMDRS32(unsigned long a, unsigned long b)
+{
+    long diff; /* Rd: (a.W[0] * b.W[0]) - (a.W[1] * b.W[1]) */
+    __ASM volatile("smdrs32 %0, %1, %2" : "=r"(diff) : "r"(a), "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 4.33.2. SMDRS32 ===== */
+
+/* ===== Inline Function Start for 4.33.3. SMXDS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief SMXDS32 (Signed Crossed Multiply Two Words and Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS32 Rd, Rs1, Rs2
+ * SMDRS32 Rd, Rs1, Rs2
+ * SMXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 64-bit results.
+ * * SMDS32: top*top - bottom*bottom
+ * * SMDRS32: bottom*bottom - top*top
+ * * SMXDS32: top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the top 32-bit element of Rs2.
+ * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element of Rs1 with the bottom 32-bit element of Rs2.
+ * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the bottom 32-bit element of Rs2.
+ * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rt = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
+ * Rt = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
+ * Rt = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMXDS32(unsigned long a, unsigned long b)
+{
+    long diff; /* Rd: (a.W[1] * b.W[0]) - (a.W[0] * b.W[1]) */
+    __ASM volatile("smxds32 %0, %1, %2" : "=r"(diff) : "r"(a), "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 4.33.3. SMXDS32 ===== */
+
+/* ===== Inline Function Start for 4.34. SMIN32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief SMIN32 (SIMD 32-bit Signed Minimum)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMIN32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 32-bit signed integer elements in Rs1 with the 32-bit
+ * signed integer elements in Rs2 and selects the smaller number of each pair. The selected
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] < Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMIN32(unsigned long a, unsigned long b)
+{
+    unsigned long mins; /* Rd: per-word signed minimum of a and b */
+    __ASM volatile("smin32 %0, %1, %2" : "=r"(mins) : "r"(a), "r"(b));
+    return mins;
+}
+/* ===== Inline Function End for 4.34. SMIN32 ===== */
+
+/* ===== Inline Function Start for 4.35.1. SRA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRA32 (SIMD 32-bit Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA32 Rd, Rs1, Rs2
+ * SRA32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 5-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 32-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRA32.u
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRA32
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA32(unsigned long a, unsigned int b)
+{
+    unsigned long shifted; /* Rd: each word of a shifted right arithmetically by b[4:0] */
+    __ASM volatile("sra32 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 4.35.1. SRA32 ===== */
+
+/* ===== Inline Function Start for 4.35.2. SRA32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRA32.u (SIMD 32-bit Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA32 Rd, Rs1, Rs2
+ * SRA32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 5-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 32-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRA32.u
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRA32
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA32_U(unsigned long a, unsigned int b)
+{
+    unsigned long rounded; /* Rd: each word of a shifted right arithmetically by b[4:0], with rounding */
+    __ASM volatile("sra32.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 4.35.2. SRA32.u ===== */
+
+/* ===== Inline Function Start for 4.36.1. SRAI32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRAI32 (SIMD 32-bit Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI32 Rd, Rs1, imm5u
+ * SRAI32.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements arithmetic right shift operations simultaneously. The shift amount is
+ * an immediate value. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the 32-bit data elements. The shift amount is specified by the
+ * imm5u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
+ * significant discarded bit of each 32-bit data to calculate the final results. And the results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI32.u
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRAI32
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRAI32(unsigned long a, unsigned int b)
+{
+    register unsigned long result;
+    __ASM volatile("srai32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.36.1. SRAI32 ===== */
+
+/* ===== Inline Function Start for 4.36.2. SRAI32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRAI32.u (SIMD 32-bit Rounding Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI32 Rd, Rs1, imm5u
+ * SRAI32.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements arithmetic right shift operations simultaneously. The shift amount is
+ * an immediate value. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the 32-bit data elements. The shift amount is specified by the
+ * imm5u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
+ * significant discarded bit of each 32-bit data to calculate the final results. And the results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI32.u
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRAI32
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRAI32_U(unsigned long a, unsigned int b)
+{
+    register unsigned long result;
+    __ASM volatile("srai32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.36.2. SRAI32.u ===== */
+
+/* ===== Inline Function Start for 4.37. SRAIW.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_NON_SIMD_32B_SHIFT
+ * \brief SRAIW.u (Rounding Shift Right Arithmetic Immediate Word)
+ * \details
+ * **Type**: DSP (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAIW.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 32-bit arithmetic right shift operation with rounding. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * This instruction right-shifts the lower 32-bit content of Rs1 arithmetically. The shifted
+ * out bits are filled with the sign-bit Rs1(31) and the shift amount is specified by the imm5u constant.
+ * For the rounding operation, a value of 1 is added to the most significant discarded bit of the data to
+ * calculate the final result. And the result is sign-extended and written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u;
+ * if (sa != 0) {
+ *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
+ *   Rd = SE32(res[31:0]);
+ * } else {
+ *   Rd = SE32(Rs1.W[0]);
+ * }
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SRAIW_U(int a, unsigned int b)
+{
+    register long result;
+    __ASM volatile("sraiw.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.37. SRAIW.u ===== */
+
+/* ===== Inline Function Start for 4.38.1. SRL32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRL32 (SIMD 32-bit Shift Right Logical)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL32 Rd, Rs1, Rs2
+ * SRL32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element logical right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the low-order 5-bits of the value in the Rs2
+ * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 32-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL32.u
+ *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRL32
+ *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL32(unsigned long a, unsigned int b)
+{
+    unsigned long shifted; /* Rd: each word of a shifted right logically by b[4:0] */
+    __ASM volatile("srl32 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 4.38.1. SRL32 ===== */
+
+/* ===== Inline Function Start for 4.38.2. SRL32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRL32.u (SIMD 32-bit Rounding Shift Right Logical)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL32 Rd, Rs1, Rs2
+ * SRL32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element logical right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the low-order 5-bits of the value in the Rs2
+ * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 32-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL32.u
+ *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRL32
+ *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL32_U(unsigned long a, unsigned int b)
+{
+    unsigned long rounded; /* Rd: each word of a shifted right logically by b[4:0], with rounding */
+    __ASM volatile("srl32.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 4.38.2. SRL32.u ===== */
+
+/* ===== Inline Function Start for 4.39.1. SRLI32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRLI32 (SIMD 32-bit Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI32 Rd, Rs1, imm5u
+ * SRLI32.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the imm5u constant. For the rounding
+ * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 32-bit
+ * data to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI32.u
+ *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRLI32
+ *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRLI32(unsigned long a, unsigned int b)
+{
+    register unsigned long result;
+    __ASM volatile("srli32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.39.1. SRLI32 ===== */
+
+/* ===== Inline Function Start for 4.39.2. SRLI32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRLI32.u (SIMD 32-bit Rounding Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI32 Rd, Rs1, imm5u
+ * SRLI32.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the imm5u constant. For the rounding
+ * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 32-bit
+ * data to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI32.u
+ *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRLI32
+ *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRLI32_U(unsigned long a, unsigned int b)
+{
+    register unsigned long result;
+    __ASM volatile("srli32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.39.2. SRLI32.u ===== */
+
+/* ===== Inline Function Start for 4.40. STAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief STAS32 (SIMD 32-bit Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * STAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit
+ * chunk simultaneously. Operands are from corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
+ * integer element in [63:32] of Rs2, and writes the result to [63:32] of Rd; at the same time, it subtracts
+ * the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element in [31:0] of Rs1, and
+ * writes the result to [31:0] of Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = Rs1.W[1] + Rs2.W[1];
+ * Rd.W[0] = Rs1.W[0] - Rs2.W[0];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_STAS32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* Rd: W[1] = a.W[1] + b.W[1], W[0] = a.W[0] - b.W[0] */
+    __ASM volatile("stas32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.40. STAS32 ===== */
+
+/* ===== Inline Function Start for 4.41. STSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief STSA32 (SIMD 32-bit Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * STSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit
+ * chunk simultaneously. Operands are from corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer element in
+ * [63:32] of Rs1, and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit integer element
+ * in [31:0] of Rs1 with the 32-bit integer element in [31:0] of Rs2, and writes the result to [31:0] of Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = Rs1.W[1] - Rs2.W[1];
+ * Rd.W[0] = Rs1.W[0] + Rs2.W[0];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_STSA32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* Rd: W[1] = a.W[1] - b.W[1], W[0] = a.W[0] + b.W[0] */
+    __ASM volatile("stsa32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.41. STSA32 ===== */
+
+/* ===== Inline Function Start for 4.42. SUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief SUB32 (SIMD 32-bit Subtraction)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer elements in Rs2 from the 32-bit integer
+ * elements in Rs1, and then writes the results to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x] - Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUB32(unsigned long a, unsigned long b)
+{
+    unsigned long diffs; /* Rd: W[x] = a.W[x] - b.W[x] for both words */
+    __ASM volatile("sub32 %0, %1, %2" : "=r"(diffs) : "r"(a), "r"(b));
+    return diffs;
+}
+/* ===== Inline Function End for 4.42. SUB32 ===== */
+
+/* ===== Inline Function Start for 4.43. UKADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKADD32 (SIMD 32-bit Unsigned Saturating Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer elements in Rs1 with the 32-bit
+ * unsigned integer elements in Rs2. If any of the results are beyond the 32-bit unsigned number
+ * range (0 <= RES <= 2^32-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.W[x] + Rs2.W[x];
+ * if (res[x] > (2^32)-1) {
+ *   res[x] = (2^32)-1;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADD32(unsigned long a, unsigned long b)
+{
+    unsigned long sums; /* Rd: per-word unsigned saturating a.W[x] + b.W[x] */
+    __ASM volatile("ukadd32 %0, %1, %2" : "=r"(sums) : "r"(a), "r"(b));
+    return sums;
+}
+/* ===== Inline Function End for 4.43. UKADD32 ===== */
+
+/* ===== Inline Function Start for 4.44. UKCRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKCRAS32 (SIMD 32-bit Unsigned Saturating Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKCRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 32-bit unsigned integer element saturating addition and one 32-bit unsigned
+ * integer element saturating subtraction in a 64-bit chunk simultaneously. Operands are from crossed
+ * 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
+ * bit unsigned integer element in [31:0] of Rs2; at the same time, it subtracts the 32-bit unsigned
+ * integer element in [63:32] of Rs2 from the 32-bit unsigned integer element in [31:0] Rs1. If any of the
+ * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
+ * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for addition and
+ * [31:0] of Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[1] + Rs2.W[0];
+ * res2 = Rs1.W[0] - Rs2.W[1];
+ * if (res1 > (2^32)-1) {
+ *   res1 = (2^32)-1;
+ *   OV = 1;
+ * }
+ * if (res2 < 0) {
+ *   res2 = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res1;
+ * Rd.W[0] = res2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKCRAS32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* Rd: W[1] = sat(a.W[1] + b.W[0]), W[0] = sat(a.W[0] - b.W[1]) */
+    __ASM volatile("ukcras32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.44. UKCRAS32 ===== */
+
+/* ===== Inline Function Start for 4.45. UKCRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKCRSA32 (SIMD 32-bit Unsigned Saturating Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKCRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 32-bit unsigned integer element saturating subtraction and one 32-bit unsigned
+ * integer element saturating addition in a 64-bit chunk simultaneously. Operands are from crossed
+ * 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer element in [31:0] of Rs2 from the
+ * 32-bit unsigned integer element in [63:32] of Rs1; at the same time, it adds the 32-bit unsigned
+ * integer element in [63:32] of Rs2 with the 32-bit unsigned integer element in [31:0] Rs1. If any of the
+ * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
+ * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for subtraction and
+ * [31:0] of Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[1] - Rs2.W[0];
+ * res2 = Rs1.W[0] + Rs2.W[1];
+ * if (res1 < 0) {
+ *   res1 = 0;
+ *   OV = 1;
+ * } else if (res2 > (2^32)-1) {
+ *   res2 = (2^32)-1;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res1;
+ * Rd.W[0] = res2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKCRSA32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* Rd: W[1] = sat(a.W[1] - b.W[0]), W[0] = sat(a.W[0] + b.W[1]) */
+    __ASM volatile("ukcrsa32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.45. UKCRSA32 ===== */
+
+/* ===== Inline Function Start for 4.46. UKSTAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKSTAS32 (SIMD 32-bit Unsigned Saturating Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSTAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 32-bit unsigned integer element saturating addition and one 32-bit unsigned
+ * integer element saturating subtraction in a 64-bit chunk simultaneously. Operands are from
+ * corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
+ * bit unsigned integer element in [63:32] of Rs2; at the same time, it subtracts the 32-bit unsigned
+ * integer element in [31:0] of Rs2 from the 32-bit unsigned integer element in [31:0] Rs1. If any of the
+ * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
+ * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for addition and
+ * [31:0] of Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[1] + Rs2.W[1];
+ * res2 = Rs1.W[0] - Rs2.W[0];
+ * if (res1 > (2^32)-1) {
+ *   res1 = (2^32)-1;
+ *   OV = 1;
+ * }
+ * if (res2 < 0) {
+ *   res2 = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res1;
+ * Rd.W[0] = res2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSTAS32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* Rd: W[1] = sat(a.W[1] + b.W[1]), W[0] = sat(a.W[0] - b.W[0]) */
+    __ASM volatile("ukstas32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.46. UKSTAS32 ===== */
+
+/* ===== Inline Function Start for 4.47. UKSTSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKSTSA32 (SIMD 32-bit Unsigned Saturating Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSTSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 32-bit unsigned integer element saturating subtraction and one 32-bit unsigned
+ * integer element saturating addition in a 64-bit chunk simultaneously. Operands are from
+ * corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer element in [63:32] of Rs2 from
+ * the 32-bit unsigned integer element in [63:32] of Rs1; at the same time, it adds the 32-bit unsigned
+ * integer element in [31:0] of Rs2 with the 32-bit unsigned integer element in [31:0] of Rs1. If any of the
+ * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
+ * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for subtraction and
+ * [31:0] of Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[1] - Rs2.W[1];
+ * res2 = Rs1.W[0] + Rs2.W[0];
+ * if (res1 < 0) {
+ *   res1 = 0;
+ *   OV = 1;
+ * }
+ * if (res2 > (2^32)-1) {
+ *   res2 = (2^32)-1;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res1;
+ * Rd.W[0] = res2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSTSA32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /*
+     * %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b).
+     * volatile keeps the instruction even when the result is unused; per the
+     * description above it can set the OV bit.
+     */
+    __ASM volatile("ukstsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.47. UKSTSA32 ===== */
+
+/* ===== Inline Function Start for 4.48. UKSUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKSUB32 (SIMD 32-bit Unsigned Saturating Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer elements in Rs2 from the 32-bit
+ * unsigned integer elements in Rs1. If any of the results are beyond the 32-bit unsigned number
+ * range (0 <= RES <= 2^32-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.W[x] - Rs2.W[x];
+ * if (res[x] < 0) {
+ *   res[x] = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUB32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /*
+     * %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b).
+     * volatile keeps the instruction even when the result is unused; per the
+     * description above it can set the OV bit.
+     */
+    __ASM volatile("uksub32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.48. UKSUB32 ===== */
+
+/* ===== Inline Function Start for 4.49. UMAX32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief UMAX32 (SIMD 32-bit Unsigned Maximum)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAX32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 32-bit unsigned integer elements in Rs1 with the 32-bit
+ * unsigned integer elements in Rs2 and selects the greater number of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] u> Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMAX32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /* Emits one `umax32`: %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b). */
+    __ASM volatile("umax32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.49. UMAX32 ===== */
+
+/* ===== Inline Function Start for 4.50. UMIN32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief UMIN32 (SIMD 32-bit Unsigned Minimum)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMIN32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 32-bit unsigned integer elements in Rs1 with the 32-bit
+ * unsigned integer elements in Rs2 and selects the lesser number of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] <u Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMIN32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /* Emits one `umin32`: %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b). */
+    __ASM volatile("umin32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.50. UMIN32 ===== */
+
+/* ===== Inline Function Start for 4.51. URADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URADD32 (SIMD 32-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element additions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer elements in Rs1 with the 32-bit
+ * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFFFFFF, Rb = 0x7FFFFFFF, Rt = 0x7FFFFFFF
+ * * Ra = 0x80000000, Rb = 0x80000000, Rt = 0x80000000
+ * * Ra = 0x40000000, Rb = 0x80000000, Rt = 0x60000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] + Rs2.W[x]) u>> 1;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URADD32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /* Emits one `uradd32`: %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b). */
+    __ASM volatile("uradd32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.51. URADD32 ===== */
+
+/* ===== Inline Function Start for 4.52. URCRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URCRAS32 (SIMD 32-bit Unsigned Halving Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URCRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element addition and 32-bit unsigned integer element
+ * subtraction in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The
+ * results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
+ * bit unsigned integer element in [31:0] of Rs2, and subtracts the 32-bit unsigned integer element in
+ * [63:32] of Rs2 from the 32-bit unsigned integer element in [31:0] of Rs1. The element results are first
+ * logically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd for
+ * subtraction.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD32` and `URSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) u>> 1;
+ * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URCRAS32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /* Emits one `urcras32`: %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b). */
+    __ASM volatile("urcras32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.52. URCRAS32 ===== */
+
+/* ===== Inline Function Start for 4.53. URCRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URCRSA32 (SIMD 32-bit Unsigned Halving Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URCRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element subtraction and 32-bit unsigned integer element
+ * addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results
+ * are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer element in [31:0] of Rs2 from the
+ * 32-bit unsigned integer element in [63:32] of Rs1, and adds the 32-bit unsigned integer element in
+ * [31:0] of Rs1 with the 32-bit unsigned integer element in [63:32] of Rs2. The two results are first
+ * logically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for
+ * addition.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD32` and `URSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) u>> 1;
+ * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URCRSA32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /* Emits one `urcrsa32`: %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b). */
+    __ASM volatile("urcrsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.53. URCRSA32 ===== */
+
+/* ===== Inline Function Start for 4.54. URSTAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URSTAS32 (SIMD 32-bit Unsigned Halving Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSTAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element addition and 32-bit unsigned integer element
+ * subtraction in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements.
+ * The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
+ * bit unsigned integer element in [63:32] of Rs2, and subtracts the 32-bit unsigned integer element in
+ * [31:0] of Rs2 from the 32-bit unsigned integer element in [31:0] of Rs1. The element results are first
+ * logically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd for
+ * subtraction.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD32` and `URSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] + Rs2.W[1]) u>> 1;
+ * Rd.W[0] = (Rs1.W[0] - Rs2.W[0]) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSTAS32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /* Emits one `urstas32`: %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b). */
+    __ASM volatile("urstas32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.54. URSTAS32 ===== */
+
+/* ===== Inline Function Start for 4.55. URSTSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URSTSA32 (SIMD 32-bit Unsigned Halving Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSTSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element subtraction and 32-bit unsigned integer element
+ * addition in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The
+ * results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer element in [63:32] of Rs2 from
+ * the 32-bit unsigned integer element in [63:32] of Rs1, and adds the 32-bit unsigned integer element
+ * in [31:0] of Rs1 with the 32-bit unsigned integer element in [31:0] of Rs2. The two results are first
+ * logically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for
+ * addition.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD32` and `URSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] - Rs2.W[1]) u>> 1;
+ * Rd.W[0] = (Rs1.W[0] + Rs2.W[0]) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSTSA32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /* Emits one `urstsa32`: %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b). */
+    __ASM volatile("urstsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.55. URSTSA32 ===== */
+
+/* ===== Inline Function Start for 4.56. URSUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URSUB32 (SIMD 32-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer elements in Rs2 from the 32-bit
+ * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFFFFFF, Rb = 0x80000000, Rt = 0xFFFFFFFF
+ * * Ra = 0x80000000, Rb = 0x7FFFFFFF, Rt = 0x00000000
+ * * Ra = 0x80000000, Rb = 0x40000000, Rt = 0x20000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) u>> 1;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSUB32(unsigned long a, unsigned long b)
+{
+    /* No `register` qualifier: "=r" already selects a general register, and the keyword is removed in C++17. */
+    unsigned long result;
+
+    /* Emits one `ursub32`: %0 = Rd (result), %1 = Rs1 (a), %2 = Rs2 (b). */
+    __ASM volatile("ursub32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.56. URSUB32 ===== */
+
+#endif /* __RISCV_XLEN == 64 */
+
+
+#if (__RISCV_XLEN == 32) || defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
+/* XXXXX Nuclei Extended DSP Instructions for RV32 XXXXX */
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM      Nuclei Customized DSP Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    (RV32 only) Nuclei Customized DSP Instructions
+ * \details  These are Nuclei customized DSP instructions, available only on RV32
+ */
+/* ===== Inline Function Start for A.1. DKHM8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKHM8 (64-bit SIMD Signed Saturating Q7 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKHM8 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
+ * numbers again.
+ *
+ * **Description**:\n
+ * For the `DKHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
+ * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
+ *
+ * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then
+ * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen.
+ * The result will be saturated to 0x7F and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
+ * op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x80 != aop | 0x80 != bop) {
+ *     res = (aop s* bop) >> 7;
+ *   } else {
+ *     res= 0x7F;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.H[x/2] = concat(rest, resb);
+ * for RV32, x=0,2,4,6
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKHM8(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd;
+
+    /* volatile keeps the instruction even if rd is unused; per the description above it can set OV. */
+    __ASM volatile("dkhm8 %0, %1, %2"
+                   : "=r"(rd)           /* %0: Rd */
+                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for A.1. DKHM8 ===== */
+
+/* ===== Inline Function Start for A.2. DKHM16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKHM16 (64-bit SIMD Signed Saturating Q15 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKHM16 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
+ * Q15 numbers again.
+ *
+ * **Description**:\n
+ * For the `DKHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
+ * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
+ * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
+ * Rs2.
+ *
+ * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are
+ * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will
+ * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
+ * op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x8000 != aop | 0x8000 != bop) {
+ *     res = (aop s* bop) >> 15;
+ *   } else {
+ *     res= 0x7FFF;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x/2] = concat(rest, resb);
+ * for RV32: x=0, 2
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKHM16(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd;
+
+    /* volatile keeps the instruction even if rd is unused; per the description above it can set OV. */
+    __ASM volatile("dkhm16 %0, %1, %2"
+                   : "=r"(rd)           /* %0: Rd */
+                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for A.2. DKHM16 ===== */
+
+/* ===== Inline Function Start for A.3. DKABS8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKABS8 (64-bit SIMD 8-bit Saturating Absolute)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKABS8 Rd, Rs1
+ * # Rd, Rs1 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of 8-bit signed integer elements simultaneously.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of 8-bit signed integer elements stored
+ * in Rs1 and writes the element results to Rd. If the input number is 0x80, this instruction generates
+ * 0x7f as the output and sets the OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.B[x];
+ * if (src == 0x80) {
+ *   src = 0x7f;
+ *   OV = 1;
+ * } else if (src[7] == 1) {
+ *   src = -src;
+ * }
+ * Rd.B[x] = src;
+ * for RV32: x=7...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKABS8(unsigned long long a)
+{
+    unsigned long long rd;
+
+    /* volatile keeps the instruction even if rd is unused; per the description above it can set OV. */
+    __ASM volatile("dkabs8 %0, %1"
+                   : "=r"(rd)   /* %0: Rd */
+                   : "r"(a));   /* %1: Rs1 */
+    return rd;
+}
+/* ===== Inline Function End for A.3. DKABS8 ===== */
+
+/* ===== Inline Function Start for A.4. DKABS16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKABS16 (64-bit SIMD 16-bit Saturating Absolute)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKABS16 Rd, Rs1
+ * # Rd, Rs1 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of 16-bit signed integer elements simultaneously.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of 16-bit signed integer elements stored
+ * in Rs1 and writes the element results to Rd. If the input number is 0x8000, this instruction
+ * generates 0x7fff as the output and sets the OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.H[x];
+ * if (src == 0x8000) {
+ *   src = 0x7fff;
+ *   OV = 1;
+ * } else if (src[15] == 1) {
+ *   src = -src;
+ * }
+ * Rd.H[x] = src;
+ * for RV32: x=3...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKABS16(unsigned long long a)
+{
+    unsigned long long rd;
+
+    /* volatile keeps the instruction even if rd is unused; per the description above it can set OV. */
+    __ASM volatile("dkabs16 %0, %1"
+                   : "=r"(rd)   /* %0: Rd */
+                   : "r"(a));   /* %1: Rs1 */
+    return rd;
+}
+/* ===== Inline Function End for A.4. DKABS16 ===== */
+
+/* ===== Inline Function Start for A.5. DKSLRA8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKSLRA8 (64-bit SIMD 8-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKSLRA8 Rd, Rs1, Rs2
+ * # Rd, Rs1 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q7 saturation for the left shift.
+ *
+ * **Description**:\n
+ * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
+ * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
+ * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
+ * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1].
+ * If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[3:0] < 0) {
+ *   sa = -Rs2[3:0];
+ *   sa = (sa == 8)? 7 : sa;
+ *   Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ * } else {
+ *   sa = Rs2[2:0];
+ *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
+ *   if (res > (2^7)-1) {
+ *     res[7:0] = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res[7:0] = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * }
+ * for RV32: x=7...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA8(unsigned long long a, int b)
+{
+    unsigned long long rd;
+
+    /*
+     * a carries the packed data elements (Rs1); b is the shift control (Rs2).
+     * volatile keeps the instruction even if rd is unused; per the description
+     * above it can set OV.
+     */
+    __ASM volatile("dkslra8 %0, %1, %2"
+                   : "=r"(rd)           /* %0: Rd */
+                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for A.5. DKSLRA8 ===== */
+
+/* ===== Inline Function Start for A.6. DKSLRA16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKSLRA16 (64-bit SIMD 16-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKSLRA16 Rd, Rs1, Rs2
+ * # Rd, Rs1 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q15 saturation for the left shift.
+ *
+ * **Description**:\n
+ * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
+ * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
+ * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
+ * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1].
+ * After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[4:0] < 0) {
+ *   sa = -Rs2[4:0];
+ *   sa = (sa == 16)? 15 : sa;
+ *   Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ * } else {
+ *   sa = Rs2[3:0];
+ *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
+ *   if (res > (2^15)-1) {
+ *     res[15:0] = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res[15:0] = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * }
+ * for RV32: x=3...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA16(unsigned long long a, int b)
+{
+    unsigned long long rd;
+
+    /*
+     * a carries the packed data elements (Rs1); b is the shift control (Rs2).
+     * volatile keeps the instruction even if rd is unused; per the description
+     * above it can set OV.
+     */
+    __ASM volatile("dkslra16 %0, %1, %2"
+                   : "=r"(rd)           /* %0: Rd */
+                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for A.6. DKSLRA16 ===== */
+
+/* ===== Inline Function Start for A.7. DKADD8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKADD8 (64-bit SIMD 8-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKADD8 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), they
+ * are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] + Rs2.B[x];
+ * if (res[x] > 127) {
+ *   res[x] = 127;
+ *   OV = 1;
+ * } else if (res[x] < -128) {
+ *   res[x] = -128;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=7...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKADD8(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd;
+
+    /* volatile keeps the instruction even if rd is unused; per the description above it can set OV. */
+    __ASM volatile("dkadd8 %0, %1, %2"
+                   : "=r"(rd)           /* %0: Rd */
+                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for A.7. DKADD8 ===== */
+
+/* ===== Inline Function Start for A.8. DKADD16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKADD16 (64-bit SIMD 16-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKADD16 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1),
+ * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] + Rs2.H[x];
+ * if (res[x] > 32767) {
+ *   res[x] = 32767;
+ *   OV = 1;
+ * } else if (res[x] < -32768) {
+ *   res[x] = -32768;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=3...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKADD16(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd;
+
+    /* volatile keeps the instruction even if rd is unused; per the description above it can set OV. */
+    __ASM volatile("dkadd16 %0, %1, %2"
+                   : "=r"(rd)           /* %0: Rd */
+                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for A.8. DKADD16 ===== */
+
+/* ===== Inline Function Start for A.9. DKSUB8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKSUB8 (64-bit SIMD 8-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKSUB8 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1),
+ * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] - Rs2.B[x];
+ * if (res[x] > (2^7)-1) {
+ *   res[x] = (2^7)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^7) {
+ *   res[x] = -2^7;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=7...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKSUB8(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd;
+
+    /* volatile keeps the instruction even if rd is unused; per the description above it can set OV. */
+    __ASM volatile("dksub8 %0, %1, %2"
+                   : "=r"(rd)           /* %0: Rd */
+                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for A.9. DKSUB8 ===== */
+
+/* ===== Inline Function Start for A.10. DKSUB16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKSUB16 (64-bit SIMD 16-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKSUB16 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <=
+ * 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] - Rs2.H[x];
+ * if (res[x] > (2^15)-1) {
+ *   res[x] = (2^15)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^15) {
+ *   res[x] = -2^15;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=3...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKSUB16(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd;
+
+    /* volatile keeps the instruction even if rd is unused; per the description above it can set OV. */
+    __ASM volatile("dksub16 %0, %1, %2"
+                   : "=r"(rd)           /* %0: Rd */
+                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for A.10. DKSUB16 ===== */
+
+/* ===== Inline Function Start for A.11.1. EXPD80 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief EXPD80 (Expand and Copy Byte 0 to 32bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * EXPD80 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
+ *
+ * **Description**:\n
+ * Moves Rs1.B[0][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0]
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.B[0][7:0], Rs1.B[0][7:0], Rs1.B[0][7:0], Rs1.B[0][7:0]);
+ * for RV32: x=0
+ * ~~~
+ *
+ * \param [in]  a unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_EXPD80(unsigned long a)
+{
+    unsigned long expanded;     /* receives Rd as written by the expd80 instruction */
+    __ASM volatile("expd80 %0, %1" : "=r"(expanded) : "r"(a));
+    return expanded;
+}
+/* ===== Inline Function End for A.11.1. EXPD80 ===== */
+
+/* ===== Inline Function Start for A.11.2. EXPD81 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief EXPD81 (Expand and Copy Byte 1 to 32bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * EXPD81 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
+ *
+ * **Description**:\n
+ * Moves Rs1.B[1][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0]
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.B[1][7:0], Rs1.B[1][7:0], Rs1.B[1][7:0], Rs1.B[1][7:0]);
+ * for RV32: x=0
+ * ~~~
+ *
+ * \param [in]  a unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_EXPD81(unsigned long a)
+{
+    unsigned long expanded;     /* receives Rd as written by the expd81 instruction */
+    __ASM volatile("expd81 %0, %1" : "=r"(expanded) : "r"(a));
+    return expanded;
+}
+/* ===== Inline Function End for A.11.2. EXPD81 ===== */
+
+/* ===== Inline Function Start for A.11.3. EXPD82 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief EXPD82 (Expand and Copy Byte 2 to 32bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * EXPD82 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
+ *
+ * **Description**:\n
+ * Moves Rs1.B[2][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0]
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.B[2][7:0], Rs1.B[2][7:0], Rs1.B[2][7:0], Rs1.B[2][7:0]);
+ * for RV32: x=0
+ * ~~~
+ *
+ * \param [in]  a unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_EXPD82(unsigned long a)
+{
+    unsigned long expanded;     /* receives Rd as written by the expd82 instruction */
+    __ASM volatile("expd82 %0, %1" : "=r"(expanded) : "r"(a));
+    return expanded;
+}
+/* ===== Inline Function End for A.11.3. EXPD82 ===== */
+
+/* ===== Inline Function Start for A.11.4. EXPD83 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief EXPD83 (Expand and Copy Byte 3 to 32bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * EXPD83 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
+ *
+ * **Description**:\n
+ * Moves Rs1.B[3][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0]
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.B[3][7:0], Rs1.B[3][7:0], Rs1.B[3][7:0], Rs1.B[3][7:0]);
+ * for RV32: x=0
+ * ~~~
+ *
+ * \param [in]  a unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_EXPD83(unsigned long a)
+{
+    unsigned long expanded;     /* receives Rd as written by the expd83 instruction */
+    __ASM volatile("expd83 %0, %1" : "=r"(expanded) : "r"(a));
+    return expanded;
+}
+/* ===== Inline Function End for A.11.4. EXPD83 ===== */
+#endif /* __RISCV_XLEN == 32 */
+
+#if defined(__RISCV_FEATURE_DSP) && (__RISCV_FEATURE_DSP == 1)
+/* XXXXX ARM Compatible SIMD API XXXXX */
+/** \brief Q setting quad 8-bit saturating addition. */
+#define __QADD8(x, y)               __RV_KADD8((x), (y))
+/** \brief Q setting quad 8-bit saturating subtract. */
+#define __QSUB8(x, y)               __RV_KSUB8((x), (y))
+/** \brief Q setting dual 16-bit saturating addition. */
+#define __QADD16(x, y)              __RV_KADD16((x), (y))
+/** \brief Dual 16-bit signed addition with halved results. */
+#define __SHADD16(x, y)             __RV_RADD16((x), (y))
+/** \brief Q setting dual 16-bit saturating subtract. */
+#define __QSUB16(x, y)              __RV_KSUB16((x), (y))
+/** \brief Dual 16-bit signed subtraction with halved results. */
+#define __SHSUB16(x, y)             __RV_RSUB16((x), (y))
+/** \brief Q setting dual 16-bit add and subtract with exchange. */
+#define __QASX(x, y)                __RV_KCRAS16((x), (y))
+/** \brief Dual 16-bit signed addition and subtraction with halved results. */
+#define __SHASX(x, y)               __RV_RCRAS16((x), (y))
+/** \brief Q setting dual 16-bit subtract and add with exchange. */
+#define __QSAX(x, y)                __RV_KCRSA16((x), (y))
+/** \brief Dual 16-bit signed subtraction and addition with halved results. */
+#define __SHSAX(x, y)               __RV_RCRSA16((x), (y))
+/** \brief Dual 16-bit signed multiply with exchange returning difference (operands are passed to SMXDS swapped). */
+#define __SMUSDX(x, y)              __RV_SMXDS((y), (x))
+/** \brief Q setting sum of dual 16-bit signed multiply with exchange; forwards (op1, op2) to KMXDA. */
+__STATIC_FORCEINLINE int32_t __SMUADX(int32_t op1, int32_t op2)
+{
+    return (int32_t)(__RV_KMXDA(op1, op2));
+}
+/** \brief Q setting saturating add (maps to KADDW). */
+#define __QADD(x, y)                __RV_KADDW((x), (y))
+/** \brief Q setting saturating subtract (maps to KSUBW). */
+#define __QSUB(x, y)                __RV_KSUBW((x), (y))
+/** \brief Q setting dual 16-bit signed multiply with single 32-bit accumulator; op3 is passed as the first KMADA operand. */
+__STATIC_FORCEINLINE int32_t __SMLAD(int32_t op1, int32_t op2, int32_t op3)
+{
+    return (int32_t)(__RV_KMADA(op3, op1, op2));
+}
+/** \brief Q setting pre-exchanged dual 16-bit signed multiply with single 32-bit accumulator; op3 is the first KMAXDA operand. */
+__STATIC_FORCEINLINE int32_t __SMLADX(int32_t op1, int32_t op2, int32_t op3)
+{
+    return (int32_t)(__RV_KMAXDA(op3, op1, op2));
+}
+/** \brief Q setting dual 16-bit signed multiply with exchange subtract with 32-bit accumulate: op3 minus SMXDS(op1, op2). */
+__STATIC_FORCEINLINE int32_t __SMLSDX(int32_t op1, int32_t op2, int32_t op3)
+{
+    return op3 - (int32_t)(__RV_SMXDS(op1, op2));
+}
+/** \brief Dual 16-bit signed multiply with single 64-bit accumulator; acc is passed as the first SMALDA operand. */
+__STATIC_FORCEINLINE int64_t __SMLALD(int32_t op1, int32_t op2, int64_t acc)
+{
+    return (int64_t)(__RV_SMALDA(acc, op1, op2));
+}
+/** \brief Dual 16-bit signed multiply with exchange with single 64-bit accumulator; acc is the first SMALXDA operand. */
+__STATIC_FORCEINLINE int64_t __SMLALDX(int32_t op1, int32_t op2, int64_t acc)
+{
+    return (int64_t)(__RV_SMALXDA(acc, op1, op2));
+}
+/** \brief Q setting sum of dual 16-bit signed multiply; forwards (op1, op2) to KMDA. */
+__STATIC_FORCEINLINE int32_t __SMUAD(int32_t op1, int32_t op2)
+{
+    return (int32_t)(__RV_KMDA(op1, op2));
+}
+/** \brief Dual 16-bit signed multiply returning difference; forwards (op1, op2) to SMDRS. */
+__STATIC_FORCEINLINE int32_t __SMUSD(int32_t op1, int32_t op2)
+{
+    return (int32_t)(__RV_SMDRS(op1, op2));
+}
+/** \brief Dual extract 8-bits and sign extend each to 16-bits (maps to SUNPKD820). */
+#define __SXTB16(x)             __RV_SUNPKD820(x)
+/** \brief Dual extracted 8-bit to 16-bit signed addition: sign-extended bytes 0 and 2 of op2 are added to the halfwords of op1. */
+__STATIC_FORCEINLINE int32_t __SXTAB16(uint32_t op1, uint32_t op2)
+{
+    return (int32_t)__RV_ADD16(op1, __RV_SUNPKD820(op2)); /* 820, not 830: same byte pair that __SXTB16 extracts */
+}
+/** \brief 32-bit signed multiply with 32-bit truncated accumulator: op3 plus the SMMUL result of op1 and op2. */
+__STATIC_FORCEINLINE int32_t __SMMLA(int32_t op1, int32_t op2, int32_t op3)
+{
+    /* The SMMUL result is narrowed to int32_t before the plain C addition. */
+    const int32_t smmul_res = (int32_t)__RV_SMMUL(op1, op2);
+    return op3 + smmul_res;
+}
+#define __DKHM8                 __RV_DKHM8      /* short aliases: same intrinsic without the _RV infix unless noted */
+#define __DKHM16                __RV_DKHM16
+#define __DKSUB16               __RV_DKSUB16
+#define __SMAQA                 __RV_SMAQA
+#define __MULSR64               __RV_MULSR64
+#define __DQADD8                __RV_DKADD8     /* renamed: "Q" alias for DKADD8 */
+#define __DQSUB8                __RV_DKSUB8     /* renamed: "Q" alias for DKSUB8 */
+#define __DKADD16               __RV_DKADD16
+#define __PKBB16                __RV_PKBB16
+#define __DKSLRA16              __RV_DKSLRA16
+#define __DKSLRA8               __RV_DKSLRA8
+#define __KABSW                 __RV_KABSW
+#define __DKABS8                __RV_DKABS8
+#define __DKABS16               __RV_DKABS16
+#define __SMALDA                __RV_SMALDA
+#define __SMSLDA                __RV_SMSLDA
+#define __SMALBB                __RV_SMALBB
+#define __SUB64                 __RV_SUB64
+#define __ADD64                 __RV_ADD64
+#define __SMBB16                __RV_SMBB16
+#define __SMBT16                __RV_SMBT16
+#define __SMTT16                __RV_SMTT16
+#define __EXPD80                __RV_EXPD80
+#define __SMAX8                 __RV_SMAX8
+#define __SMAX16                __RV_SMAX16
+#define __PKTT16                __RV_PKTT16
+#define __KADD16                __RV_KADD16
+#define __SADD16                __RV_ADD16      /* renamed: __SADD16 maps to ADD16 */
+
+#endif /* (__RISCV_FEATURE_DSP == 1) */
+
+#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */
+
+/** \brief Halfword packing. Combines bits[15:0] of ARG1 with bits[31:16] of ARG2 left-shifted by ARG3 (both cast to uint32_t). */
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+/** \brief Halfword packing. Combines bits[31:16] of ARG1 with bits[15:0] of ARG2 right-shifted by ARG3 (unsigned shift after the uint32_t cast). */
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_FEATURE_DSP__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_eclic.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_eclic.h
new file mode 100644
index 00000000..c579c17e
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_eclic.h
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_ECLIC__
+#define __CORE_FEATURE_ECLIC__
+/*!
+ * @file     core_feature_eclic.h
+ * @brief    ECLIC feature API header file for Nuclei N/NX Core
+ */
+/*
+ * ECLIC Feature Configuration Macro:
+ * 1. __ECLIC_PRESENT:  Define whether Enhanced Core Local Interrupt Controller (ECLIC) Unit is present or not
+ *   * 0: Not present
+ *   * 1: Present
+ * 2. __ECLIC_BASEADDR:  Base address of the ECLIC unit.
+ * 3. __ECLIC_INTCTLBITS:  Number of hardware bits actually implemented in the clicintctl registers;
+ *   read via __ECLIC_GetInfoCtlbits() when not defined. Valid number is 1 - 8.
+ * 4. __ECLIC_INTNUM  : Define the external interrupt number of ECLIC Unit
+ *
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1)
+/**
+ * \defgroup NMSIS_Core_ECLIC_Registers     Register Define and Type Definitions Of ECLIC
+ * \ingroup NMSIS_Core_Registers
+ * \brief   Type definitions and defines for eclic registers.
+ *
+ * @{
+ */
+
+/**
+ * \brief  Union type to access CLICCFG configure register.
+ */
+typedef union
+{
+    struct {
+        uint8_t _reserved0:1;                   /*!< bit:     0   Reserved */
+        uint8_t nlbits:4;                       /*!< bit:  1..4   nlbits, width of the level field in CLICINTCTL[i] */
+        uint8_t _reserved1:2;                   /*!< bit:  5..6   Reserved */
+        uint8_t _reserved2:1;                   /*!< bit:     7   Reserved */
+    } b;                                        /*!< Structure used for bit  access */
+    uint8_t w;                                  /*!< Type      used for byte access */
+} CLICCFG_Type;
+
+/**
+ * \brief  Union type to access CLICINFO information register, as bit-fields (b) or as the raw 32-bit word (w).
+ */
+typedef union {
+    struct {
+        uint32_t numint:13;                     /*!< bit:  0..12   number of maximum interrupt inputs supported */
+        uint32_t version:8;                     /*!< bit:  13..20  20:17 for architecture version, 16:13 for implementation version */
+        uint32_t intctlbits:4;                  /*!< bit:  21..24  number of hardware bits actually implemented in the clicintctl registers */
+        uint32_t _reserved0:7;                  /*!< bit:  25..31  Reserved */
+    } b;                                        /*!< Structure used for bit  access */
+    uint32_t w;                                 /*!< Type      used for word access */
+} CLICINFO_Type;
+
+/**
+ * \brief Per-interrupt ECLIC control registers: four byte-wide registers, one record per CLIC_Type::CTRL entry.
+ */
+typedef struct {
+    __IOM uint8_t  INTIP;                       /*!< Offset: 0x000 (R/W)  Interrupt pending register */
+    __IOM uint8_t  INTIE;                       /*!< Offset: 0x001 (R/W)  Interrupt enable register */
+    __IOM uint8_t  INTATTR;                     /*!< Offset: 0x002 (R/W)  Interrupt attributes register (SHV, TRIG) */
+    __IOM uint8_t  INTCTRL;                     /*!< Offset: 0x003 (R/W)  Interrupt input control register */
+} CLIC_CTRL_Type;
+
+typedef struct {                                /* ECLIC register block; offsets below follow from the member sizes */
+    __IOM uint8_t   CFG;                        /*!< Offset: 0x000 (R/W)  CLIC configuration register */
+    uint8_t RESERVED0[3];                       /*!< Offset: 0x001        padding up to INFO */
+    __IM uint32_t  INFO;                        /*!< Offset: 0x004 (R/ )  CLIC information register */
+    uint8_t RESERVED1[3];                       /*!< Offset: 0x008        padding up to MTH */
+    __IOM uint8_t  MTH;                         /*!< Offset: 0x00B (R/W)  CLIC machine mode threshold register */
+    uint32_t RESERVED2[0x3FD];                  /*!< Offset: 0x00C        padding up to CTRL at 0x1000 */
+    CLIC_CTRL_Type CTRL[4096];                  /*!< Offset: 0x1000 (R/W) CLIC register structure for INTIP, INTIE, INTATTR, INTCTRL */
+} CLIC_Type;
+
+#define CLIC_CLICCFG_NLBIT_Pos                 1U                                       /*!< CLIC CLICCFG: NLBIT Position */
+#define CLIC_CLICCFG_NLBIT_Msk                 (0xFUL << CLIC_CLICCFG_NLBIT_Pos)        /*!< CLIC CLICCFG: NLBIT Mask */
+
+#define CLIC_CLICINFO_CTLBIT_Pos                21U                                     /*!< CLIC CLICINFO: CLICINTCTLBITS Position */
+#define CLIC_CLICINFO_CTLBIT_Msk                (0xFUL << CLIC_CLICINFO_CTLBIT_Pos)     /*!< CLIC CLICINFO: CLICINTCTLBITS Mask */
+
+#define CLIC_CLICINFO_VER_Pos                  13U                                      /*!< CLIC CLICINFO: VERSION Position */
+#define CLIC_CLICINFO_VER_Msk                  (0xFFUL << CLIC_CLICINFO_VER_Pos)        /*!< CLIC CLICINFO: VERSION Mask (bits 20:13) */
+
+#define CLIC_CLICINFO_NUM_Pos                  0U                                       /*!< CLIC CLICINFO: NUM Position */
+#define CLIC_CLICINFO_NUM_Msk                  (0x1FFFUL << CLIC_CLICINFO_NUM_Pos)      /*!< CLIC CLICINFO: NUM Mask (13 bits, 12:0) */
+
+#define CLIC_INTIP_IP_Pos                      0U                                       /*!< CLIC INTIP: IP Position */
+#define CLIC_INTIP_IP_Msk                      (0x1UL << CLIC_INTIP_IP_Pos)             /*!< CLIC INTIP: IP Mask (bit 0) */
+
+#define CLIC_INTIE_IE_Pos                      0U                                       /*!< CLIC INTIE: IE Position */
+#define CLIC_INTIE_IE_Msk                      (0x1UL << CLIC_INTIE_IE_Pos)             /*!< CLIC INTIE: IE Mask (bit 0) */
+
+#define CLIC_INTATTR_TRIG_Pos                  1U                                       /*!< CLIC INTATTR: TRIG Position */
+#define CLIC_INTATTR_TRIG_Msk                  (0x3UL << CLIC_INTATTR_TRIG_Pos)         /*!< CLIC INTATTR: TRIG Mask (bits 2:1) */
+
+#define CLIC_INTATTR_SHV_Pos                   0U                                       /*!< CLIC INTATTR: SHV Position */
+#define CLIC_INTATTR_SHV_Msk                   (0x1UL << CLIC_INTATTR_SHV_Pos)          /*!< CLIC INTATTR: SHV Mask (bit 0) */
+
+#define ECLIC_MAX_NLBITS                       8U                                       /*!< Max nlbit of the CLICINTCTLBITS */
+#define ECLIC_MODE_MTVEC_Msk                   3U                                       /*!< ECLIC Mode mask for MTVT CSR Register */
+
+#define ECLIC_NON_VECTOR_INTERRUPT             0x0                                      /*!< Non-Vector Interrupt Mode of ECLIC (shv = 0) */
+#define ECLIC_VECTOR_INTERRUPT                 0x1                                      /*!< Vector Interrupt Mode of ECLIC (shv = 1) */
+
+/**\brief ECLIC Trigger Enum for different Trigger Types (values of the 2-bit trig field) */
+typedef enum ECLIC_TRIGGER {
+    ECLIC_LEVEL_TRIGGER = 0x0,          /*!< Level Triggered, trig[0] = 0 */
+    ECLIC_POSTIVE_EDGE_TRIGGER = 0x1,   /*!< Positive/Rising Edge Triggered, trig[1] = 0, trig[0] = 1 */
+    ECLIC_NEGTIVE_EDGE_TRIGGER = 0x3,   /*!< Negative/Falling Edge Triggered, trig[1] = 1, trig[0] = 1 */
+    ECLIC_MAX_TRIGGER = 0x3             /*!< MAX Supported Trigger Mode */
+} ECLIC_TRIGGER_Type;
+
+#ifndef __ECLIC_BASEADDR
+/* The ECLIC base address (__ECLIC_BASEADDR) must be defined in <Device.h>; the build stops here otherwise */
+#error "__ECLIC_BASEADDR is not defined, please check!"
+#endif
+
+#ifndef __ECLIC_INTCTLBITS
+/* When not defined by the device header, __ECLIC_INTCTLBITS is read at run time from ECLIC->INFO */
+#define __ECLIC_INTCTLBITS                  (__ECLIC_GetInfoCtlbits())
+#endif
+
+/* ECLIC Memory mapping of Device */
+#define ECLIC_BASE                          __ECLIC_BASEADDR                            /*!< ECLIC Base Address */
+#define ECLIC                               ((CLIC_Type *) ECLIC_BASE)                  /*!< ECLIC register block viewed as CLIC_Type */
+
+/** @} */ /* end of group NMSIS_Core_ECLIC_Registers */
+
+/* ##########################   ECLIC functions  #################################### */
+/**
+ * \defgroup   NMSIS_Core_IntExc        Interrupts and Exceptions
+ * \brief Functions that manage interrupts and exceptions via the ECLIC.
+ *
+ * @{
+ */
+
+/**
+ * \brief  Definition of IRQn numbers
+ * \details
+ * The core interrupt enumeration names for IRQn values are defined in the file <b><Device>.h</b>.
+ * - Interrupt ID(IRQn) from 0 to 18 are reserved for core internal interrupts.
+ * - Interrupt ID(IRQn) start from 19 represent device-specific external interrupts.
+ * - The first device-specific interrupt has the IRQn value 19.
+ *
+ * The table below describes the core interrupt names and their availability in various Nuclei Cores.
+ */
+/* The following enum IRQn definition in this file
+ * is only used for doxygen documentation generation,
+ * The <Device>.h is the real file to define it by vendor
+ */
+#if defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
+typedef enum IRQn {
+    /* ========= Nuclei N/NX Core Specific Interrupt Numbers  =========== */
+    /* Core Internal Interrupt IRQn definitions */
+    Reserved0_IRQn            =   0,              /*!<  Internal reserved */
+    Reserved1_IRQn            =   1,              /*!<  Internal reserved */
+    Reserved2_IRQn            =   2,              /*!<  Internal reserved */
+    SysTimerSW_IRQn           =   3,              /*!<  System Timer SW interrupt */
+    Reserved3_IRQn            =   4,              /*!<  Internal reserved */
+    Reserved4_IRQn            =   5,              /*!<  Internal reserved */
+    Reserved5_IRQn            =   6,              /*!<  Internal reserved */
+    SysTimer_IRQn             =   7,              /*!<  System Timer Interrupt */
+    Reserved6_IRQn            =   8,              /*!<  Internal reserved */
+    Reserved7_IRQn            =   9,              /*!<  Internal reserved */
+    Reserved8_IRQn            =  10,              /*!<  Internal reserved */
+    Reserved9_IRQn            =  11,              /*!<  Internal reserved */
+    Reserved10_IRQn           =  12,              /*!<  Internal reserved */
+    Reserved11_IRQn           =  13,              /*!<  Internal reserved */
+    Reserved12_IRQn           =  14,              /*!<  Internal reserved */
+    Reserved13_IRQn           =  15,              /*!<  Internal reserved */
+    Reserved14_IRQn           =  16,              /*!<  Internal reserved */
+    Reserved15_IRQn           =  17,              /*!<  Internal reserved */
+    Reserved16_IRQn           =  18,              /*!<  Internal reserved */
+
+    /* ========= Device Specific Interrupt Numbers  =================== */
+    /* ToDo: add your device-specific external interrupt numbers here.
+     * 19~max(NUM_INTERRUPT, 1023) is the number range reserved for the user.
+     * The maximum number of interrupts supported can be read from clicinfo.NUM_INTERRUPT.
+     * Names follow the interrupt handlers defined in startup_Device.S,
+     * e.g.: Interrupt for Timer#1       eclic_tim1_handler   ->   TIM1_IRQn */
+    FirstDeviceSpecificInterrupt_IRQn    = 19,    /*!< First Device Specific Interrupt */
+    SOC_INT_MAX,                                  /*!< Number of total interrupts */
+} IRQn_Type;
+#endif /* __ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__ */
+
+#ifdef NMSIS_ECLIC_VIRTUAL                /* the included header takes the place of the ECLIC_* mapping below */
+    #ifndef NMSIS_ECLIC_VIRTUAL_HEADER_FILE
+        #define NMSIS_ECLIC_VIRTUAL_HEADER_FILE "nmsis_eclic_virtual.h"
+    #endif
+    #include NMSIS_ECLIC_VIRTUAL_HEADER_FILE
+#else                                     /* default: each ECLIC_* name is the __ECLIC_* inline function of this file */
+    #define ECLIC_SetCfgNlbits            __ECLIC_SetCfgNlbits
+    #define ECLIC_GetCfgNlbits            __ECLIC_GetCfgNlbits
+    #define ECLIC_GetInfoVer              __ECLIC_GetInfoVer
+    #define ECLIC_GetInfoCtlbits          __ECLIC_GetInfoCtlbits
+    #define ECLIC_GetInfoNum              __ECLIC_GetInfoNum
+    #define ECLIC_SetMth                  __ECLIC_SetMth
+    #define ECLIC_GetMth                  __ECLIC_GetMth
+    #define ECLIC_EnableIRQ               __ECLIC_EnableIRQ
+    #define ECLIC_GetEnableIRQ            __ECLIC_GetEnableIRQ
+    #define ECLIC_DisableIRQ              __ECLIC_DisableIRQ
+    #define ECLIC_SetPendingIRQ           __ECLIC_SetPendingIRQ
+    #define ECLIC_GetPendingIRQ           __ECLIC_GetPendingIRQ
+    #define ECLIC_ClearPendingIRQ         __ECLIC_ClearPendingIRQ
+    #define ECLIC_SetTrigIRQ              __ECLIC_SetTrigIRQ
+    #define ECLIC_GetTrigIRQ              __ECLIC_GetTrigIRQ
+    #define ECLIC_SetShvIRQ               __ECLIC_SetShvIRQ
+    #define ECLIC_GetShvIRQ               __ECLIC_GetShvIRQ
+    #define ECLIC_SetCtrlIRQ              __ECLIC_SetCtrlIRQ
+    #define ECLIC_GetCtrlIRQ              __ECLIC_GetCtrlIRQ
+    #define ECLIC_SetLevelIRQ             __ECLIC_SetLevelIRQ
+    #define ECLIC_GetLevelIRQ             __ECLIC_GetLevelIRQ
+    #define ECLIC_SetPriorityIRQ          __ECLIC_SetPriorityIRQ
+    #define ECLIC_GetPriorityIRQ          __ECLIC_GetPriorityIRQ
+
+#endif /* NMSIS_ECLIC_VIRTUAL */
+
+#ifdef NMSIS_VECTAB_VIRTUAL               /* the included header takes the place of the vector-table mapping below */
+    #ifndef NMSIS_VECTAB_VIRTUAL_HEADER_FILE
+        #define NMSIS_VECTAB_VIRTUAL_HEADER_FILE "nmsis_vectab_virtual.h"
+    #endif
+    #include NMSIS_VECTAB_VIRTUAL_HEADER_FILE
+#else                                     /* default: map onto the __ECLIC_* names */
+    #define ECLIC_SetVector              __ECLIC_SetVector
+    #define ECLIC_GetVector              __ECLIC_GetVector
+#endif  /* (NMSIS_VECTAB_VIRTUAL) */
+
+/**
+ * \brief  Program the nlbits field of CLICCFG
+ * \details
+ * Clears the NLBIT field of ECLIC->CFG, then ORs in \em nlbits shifted to the field position and masked to its width.
+ * \param [in]    nlbits    nlbits value
+ * \remarks
+ * - nlbits is used to set the width of level in the CLICINTCTL[i].
+ * \sa \ref ECLIC_GetCfgNlbits
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetCfgNlbits(uint32_t nlbits)
+{
+    const uint8_t field = (uint8_t)((nlbits << CLIC_CLICCFG_NLBIT_Pos) & CLIC_CLICCFG_NLBIT_Msk);
+    ECLIC->CFG &= ~CLIC_CLICCFG_NLBIT_Msk;
+    ECLIC->CFG |= field;
+}
+
+/**
+ * \brief  Read the nlbits field of CLICCFG
+ * \details
+ * Reads ECLIC->CFG once and extracts the NLBIT field.
+ * \return   nlbits value of CLICCFG register
+ * \remarks
+ * - nlbits is used to set the width of level in the CLICINTCTL[i].
+ * \sa \ref ECLIC_SetCfgNlbits
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetCfgNlbits(void)
+{
+    const uint32_t cfg = ECLIC->CFG;
+    return (uint32_t)((cfg & CLIC_CLICCFG_NLBIT_Msk) >> CLIC_CLICCFG_NLBIT_Pos);
+}
+
+/**
+ * \brief  Read the ECLIC version field
+ * \details
+ * Reads ECLIC->INFO once and extracts the field selected by CLIC_CLICINFO_VER_Msk / CLIC_CLICINFO_VER_Pos.
+ * \return   hardware version number in CLICINFO register.
+ * \remarks
+ * - CLICINFO bit 20:17 for architecture version, bit 16:13 for implementation version.
+ * \sa
+ * - \ref ECLIC_GetInfoNum
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoVer(void)
+{
+    const uint32_t info = ECLIC->INFO;
+    return (uint32_t)((info & CLIC_CLICINFO_VER_Msk) >> CLIC_CLICINFO_VER_Pos);
+}
+
+/**
+ * \brief  Read CLICINTCTLBITS
+ * \details
+ * Reads ECLIC->INFO once and extracts the CTLBIT field.
+ * \return  CLICINTCTLBITS from CLICINFO register.
+ * \remarks
+ * - In the CLICINTCTL[i] registers, with 2 <= CLICINTCTLBITS <= 8.
+ * - The implemented bits are kept left-justified in the most-significant bits of each 8-bit
+ *   CLICINTCTL[I] register, with the lower unimplemented bits treated as hardwired to 1.
+ * \sa \ref ECLIC_GetInfoNum
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoCtlbits(void)
+{
+    const uint32_t info = ECLIC->INFO;
+    return (uint32_t)((info & CLIC_CLICINFO_CTLBIT_Msk) >> CLIC_CLICINFO_CTLBIT_Pos);
+}
+
+/**
+ * \brief  Read the number of maximum interrupt inputs supported
+ * \details
+ * Reads ECLIC->INFO once and extracts the NUM field.
+ * \return  number of maximum interrupt inputs supported from CLICINFO register.
+ * \remarks
+ * - The num_interrupt field specifies the actual number of maximum interrupt inputs supported in this implementation.
+ * \sa
+ * - \ref ECLIC_GetInfoCtlbits
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoNum(void)
+{
+    const uint32_t info = ECLIC->INFO;
+    return (uint32_t)((info & CLIC_CLICINFO_NUM_Msk) >> CLIC_CLICINFO_NUM_Pos);
+}
+
+/**
+ * \brief  Set Machine Mode Interrupt Level Threshold
+ * \details
+ * Stores \em mth into the MTH register of the ECLIC register block.
+ * \param [in]  mth       Interrupt Level Threshold.
+ * \sa \ref ECLIC_GetMth
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetMth(uint8_t mth)
+{
+    CLIC_Type *const eclic = ECLIC;
+    eclic->MTH = mth;
+}
+
+/**
+ * \brief  Get Machine Mode Interrupt Level Threshold
+ * \details
+ * Reads the MTH register of the ECLIC register block.
+ * \return       Interrupt Level Threshold.
+ * \sa \ref ECLIC_SetMth
+ */
+__STATIC_FORCEINLINE uint8_t __ECLIC_GetMth(void)
+{
+    const uint8_t mth = ECLIC->MTH;
+    return mth;
+}
+
+
+/**
+ * \brief  Enable a specific interrupt
+ * \details
+ * Sets the IE bit in the INTIE register of interrupt \em IRQn (read-modify-write).
+ * \param [in]  IRQn  Interrupt number
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa \ref ECLIC_DisableIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_EnableIRQ(IRQn_Type IRQn)
+{
+    __IOM uint8_t *const intie = &ECLIC->CTRL[IRQn].INTIE;
+    *intie |= CLIC_INTIE_IE_Msk;
+}
+
+/**
+ * \brief  Query whether a specific interrupt is enabled
+ * \details
+ * Returns the IE bit of the INTIE register of interrupt \em IRQn.
+ * \param [in]  IRQn  Interrupt number
+ * \returns
+ * - 0  Interrupt is not enabled
+ * - 1  Interrupt is enabled
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_EnableIRQ, \ref ECLIC_DisableIRQ
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetEnableIRQ(IRQn_Type IRQn)
+{
+    const uint32_t intie = ECLIC->CTRL[IRQn].INTIE;
+    return (uint32_t)(intie & CLIC_INTIE_IE_Msk);
+}
+
+/**
+ * \brief  Disable a specific interrupt
+ * \details
+ * Clears the IE bit in the INTIE register of interrupt \em IRQn (read-modify-write).
+ * \param [in]  IRQn  Number of the external interrupt to disable
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa \ref ECLIC_EnableIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_DisableIRQ(IRQn_Type IRQn)
+{
+    __IOM uint8_t *const intie = &ECLIC->CTRL[IRQn].INTIE;
+    *intie &= ~CLIC_INTIE_IE_Msk;
+}
+
+/**
+ * \brief  Get the pending status of a specific interrupt
+ * \details
+ * Returns the IP bit of the INTIP register of interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \returns
+ * - 0  Interrupt is not pending
+ * - 1  Interrupt is pending
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetPendingIRQ, \ref ECLIC_ClearPendingIRQ
+ */
+__STATIC_FORCEINLINE int32_t __ECLIC_GetPendingIRQ(IRQn_Type IRQn)
+{
+    const uint32_t intip = ECLIC->CTRL[IRQn].INTIP;
+    return (int32_t)(intip & CLIC_INTIP_IP_Msk);
+}
+
+/**
+ * \brief  Set a specific interrupt to pending
+ * \details
+ * Sets the IP bit in the INTIP register of interrupt \em IRQn (read-modify-write).
+ * \param [in]      IRQn  Interrupt number
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_GetPendingIRQ, \ref ECLIC_ClearPendingIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetPendingIRQ(IRQn_Type IRQn)
+{
+    __IOM uint8_t *const intip = &ECLIC->CTRL[IRQn].INTIP;
+    *intip |= CLIC_INTIP_IP_Msk;
+}
+
+/**
+ * \brief  Clear a specific interrupt from pending
+ * \details
+ * Clears the IP bit in the INTIP register of interrupt \em IRQn (read-modify-write).
+ * \param [in]      IRQn  Interrupt number
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetPendingIRQ
+ * - \ref ECLIC_GetPendingIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_ClearPendingIRQ(IRQn_Type IRQn)
+{
+    __IOM uint8_t *const intip = &ECLIC->CTRL[IRQn].INTIP;
+    *intip &= ~CLIC_INTIP_IP_Msk;
+}
+
+/**
+ * \brief  Set trigger mode and polarity for a specific interrupt
+ * \details
+ * Clears the TRIG field of the INTATTR register of interrupt \em IRQn, then ORs in \em trig at the field position.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      trig
+ *                   - 00  level trigger, \ref ECLIC_LEVEL_TRIGGER
+ *                   - 01  positive edge trigger, \ref ECLIC_POSTIVE_EDGE_TRIGGER
+ *                   - 02  level trigger, \ref ECLIC_LEVEL_TRIGGER
+ *                   - 03  negative edge trigger, \ref ECLIC_NEGTIVE_EDGE_TRIGGER
+ * \remarks
+ * - IRQn must not be negative.
+ * - Only the low two bits of \em trig are used; higher bits are masked off so they cannot reach other INTATTR bits.
+ * \sa
+ * - \ref ECLIC_GetTrigIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetTrigIRQ(IRQn_Type IRQn, uint32_t trig)
+{
+    ECLIC->CTRL[IRQn].INTATTR &= ~CLIC_INTATTR_TRIG_Msk;
+    ECLIC->CTRL[IRQn].INTATTR |= (uint8_t)((trig << CLIC_INTATTR_TRIG_Pos) & CLIC_INTATTR_TRIG_Msk);
+}
+
+/**
+ * \brief  Get trigger mode and polarity for a specific interrupt
+ * \details
+ * Reads the INTATTR register of interrupt \em IRQn once and returns its TRIG field.
+ * \param [in]      IRQn  Interrupt number
+ * \return
+ *                 - 00  level trigger, \ref ECLIC_LEVEL_TRIGGER
+ *                 - 01  positive edge trigger, \ref ECLIC_POSTIVE_EDGE_TRIGGER
+ *                 - 02  level trigger, \ref ECLIC_LEVEL_TRIGGER
+ *                 - 03  negative edge trigger, \ref ECLIC_NEGTIVE_EDGE_TRIGGER
+ * \remarks
+ *     - IRQn must not be negative.
+ * \sa \ref ECLIC_SetTrigIRQ
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetTrigIRQ(IRQn_Type IRQn)
+{
+    const uint32_t attr = ECLIC->CTRL[IRQn].INTATTR;
+    return (uint32_t)((attr & CLIC_INTATTR_TRIG_Msk) >> CLIC_INTATTR_TRIG_Pos);
+}
+
+/**
+ * \brief  Set interrupt working mode for a specific interrupt
+ * \details
+ * Clears the SHV bit of the INTATTR register of interrupt \em IRQn, then ORs in bit 0 of \em shv.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      shv
+ *                        - 0  non-vector mode, \ref ECLIC_NON_VECTOR_INTERRUPT
+ *                        - 1  vector mode, \ref ECLIC_VECTOR_INTERRUPT
+ * \remarks
+ * - IRQn must not be negative.
+ * - Only bit 0 of \em shv is used; higher bits are masked off so they cannot overwrite the TRIG field.
+ * \sa \ref ECLIC_GetShvIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetShvIRQ(IRQn_Type IRQn, uint32_t shv)
+{
+    ECLIC->CTRL[IRQn].INTATTR &= ~CLIC_INTATTR_SHV_Msk;
+    ECLIC->CTRL[IRQn].INTATTR |= (uint8_t)((shv << CLIC_INTATTR_SHV_Pos) & CLIC_INTATTR_SHV_Msk);
+}
+
+/**
+ * \brief  Get interrupt working mode for a specific interrupt
+ * \details
+ * This function gets the selective hardware vector or non-vector working mode of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return       shv   The SHV field of the interrupt attribute register, shifted down to bit 0
+ *                        - 0  non-vector mode, \ref ECLIC_NON_VECTOR_INTERRUPT
+ *                        - 1  vector mode, \ref ECLIC_VECTOR_INTERRUPT
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetShvIRQ
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetShvIRQ(IRQn_Type IRQn)
+{
+    return (uint32_t)((ECLIC->CTRL[IRQn].INTATTR & CLIC_INTATTR_SHV_Msk) >> CLIC_INTATTR_SHV_Pos);
+}
+
+/**
+ * \brief  Modify ECLIC Interrupt Input Control Register for a specific interrupt
+ * \details
+ * This function writes the ECLIC Interrupt Input Control(CLICINTCTL[i]) register of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      intctrl  Set value for CLICINTCTL[i] register
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_GetCtrlIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetCtrlIRQ(IRQn_Type IRQn, uint8_t intctrl)
+{
+    ECLIC->CTRL[IRQn].INTCTRL = intctrl; /* plain overwrite: previous contents are not preserved */
+}
+
+/**
+ * \brief  Get ECLIC Interrupt Input Control Register value for a specific interrupt
+ * \details
+ * This function reads the ECLIC Interrupt Input Control register of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return       value of ECLIC Interrupt Input Control register
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetCtrlIRQ
+ */
+__STATIC_FORCEINLINE uint8_t __ECLIC_GetCtrlIRQ(IRQn_Type IRQn)
+{
+    return (ECLIC->CTRL[IRQn].INTCTRL);
+}
+
+/**
+ * \brief  Set ECLIC Interrupt level of a specific interrupt
+ * \details
+ * This function sets the interrupt level of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      lvl_abs   Interrupt level
+ * \remarks
+ * - IRQn must not be negative.
+ * - If lvl_abs to be set is larger than the max level allowed, it is clamped to the max level.
+ * - The width of the level field is cliccfg.nlbits (read via __ECLIC_GetCfgNlbits), capped at
+ *   __ECLIC_INTCTLBITS. CLICINTCTLBITS is how many total bits are present in the CLICINTCTL
+ *   register. If the width is 0, this function returns without writing anything.
+ * \sa
+ * - \ref ECLIC_GetLevelIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetLevelIRQ(IRQn_Type IRQn, uint8_t lvl_abs)
+{
+    uint8_t nlbits = __ECLIC_GetCfgNlbits();
+    uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS;
+
+    if (nlbits == 0) {
+        return;
+    }
+
+    if (nlbits > intctlbits) {
+        nlbits = intctlbits;
+    }
+    uint8_t maxlvl = ((1 << nlbits) - 1);
+    if (lvl_abs > maxlvl) {
+        lvl_abs = maxlvl;
+    }
+    uint8_t lvl = lvl_abs << (ECLIC_MAX_NLBITS - nlbits); /* left-align the level within ECLIC_MAX_NLBITS bits */
+    uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn);
+    cur_ctrl = cur_ctrl << nlbits; /* uint8_t truncation drops the top nlbits (the old level) ... */
+    cur_ctrl = cur_ctrl >> nlbits; /* ... and the remaining bits return to their original position */
+    __ECLIC_SetCtrlIRQ(IRQn, (cur_ctrl | lvl));
+}
+
+/**
+ * \brief  Get ECLIC Interrupt level of a specific interrupt
+ * \details
+ * This function gets the interrupt level of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return         Interrupt level, or 0 when the configured level width (nlbits) is 0
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetLevelIRQ
+ */
+__STATIC_FORCEINLINE uint8_t __ECLIC_GetLevelIRQ(IRQn_Type IRQn)
+{
+    uint8_t nlbits = __ECLIC_GetCfgNlbits();
+    uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS;
+
+    if (nlbits == 0) {
+        return 0;
+    }
+
+    if (nlbits > intctlbits) {
+        nlbits = intctlbits; /* same cap as applied in __ECLIC_SetLevelIRQ */
+    }
+    uint8_t intctrl = __ECLIC_GetCtrlIRQ(IRQn);
+    uint8_t lvl_abs = intctrl >> (ECLIC_MAX_NLBITS - nlbits); /* discard the bits below the level field */
+    return lvl_abs;
+}
+
+/**
+ * \brief  Set ECLIC Interrupt priority of a specific interrupt
+ * \details
+ * This function sets the interrupt priority of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      pri   Interrupt priority
+ * \remarks
+ * - IRQn must not be negative.
+ * - If pri to be set is larger than the max priority allowed, it is clamped to the max priority.
+ * - Priority width is CLICINTCTLBITS minus cliccfg.nlbits if cliccfg.nlbits
+ *   is less than CLICINTCTLBITS. Otherwise priority width is 0 and nothing is written.
+ * \sa
+ * - \ref ECLIC_GetPriorityIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetPriorityIRQ(IRQn_Type IRQn, uint8_t pri)
+{
+    uint8_t nlbits = __ECLIC_GetCfgNlbits();
+    uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS;
+    if (nlbits < intctlbits) {
+        uint8_t maxpri = ((1 << (intctlbits - nlbits)) - 1);
+        if (pri > maxpri) {
+            pri = maxpri;
+        }
+        pri = pri << (ECLIC_MAX_NLBITS - intctlbits);
+        uint8_t mask = ((uint8_t)(-1)) >> intctlbits; /* 0xFF >> intctlbits: ones in the low bits */
+        pri = pri | mask;
+        uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn);
+        cur_ctrl = cur_ctrl >> (ECLIC_MAX_NLBITS - nlbits); /* clear the low (ECLIC_MAX_NLBITS - nlbits) bits ... */
+        cur_ctrl = cur_ctrl << (ECLIC_MAX_NLBITS - nlbits); /* ... while keeping the bits above them */
+        __ECLIC_SetCtrlIRQ(IRQn, (cur_ctrl | pri));
+    }
+}
+
+/**
+ * \brief  Get ECLIC Interrupt priority of a specific interrupt
+ * \details
+ * This function gets the interrupt priority of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return   Interrupt priority, or 0 when nlbits is not less than __ECLIC_INTCTLBITS
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetPriorityIRQ
+ */
+__STATIC_FORCEINLINE uint8_t __ECLIC_GetPriorityIRQ(IRQn_Type IRQn)
+{
+    uint8_t nlbits = __ECLIC_GetCfgNlbits();
+    uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS;
+    if (nlbits < intctlbits) {
+        uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn);
+        uint8_t pri = cur_ctrl << nlbits; /* uint8_t truncation drops the top nlbits */
+        pri = pri >> nlbits;
+        pri = pri >> (ECLIC_MAX_NLBITS - intctlbits); /* discard the low (ECLIC_MAX_NLBITS - intctlbits) bits */
+        return pri;
+    } else {
+        return 0;
+    }
+}
+
+/**
+ * \brief  Set Interrupt Vector of a specific interrupt
+ * \details
+ * This function sets the interrupt handler address of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      vector   Interrupt handler address
+ * \remarks
+ * - IRQn must not be negative.
+ * - You can set \ref CSR_MTVT to set the interrupt vector table entry address.
+ * - If your vector table is placed in a readonly section, the vector for IRQn will not be modified.
+ *   For this case, you need to use the correct irq handler name defined in your vector table as
+ *   your irq handler function name.
+ * - This function will only work correctly when the vector table is placed in a read-write enabled section.
+ * \sa
+ * - \ref ECLIC_GetVector
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetVector(IRQn_Type IRQn, rv_csr_t vector)
+{
+#if __RISCV_XLEN == 32
+    volatile uint32_t vec_base;
+    vec_base = ((uint32_t)__RV_CSR_READ(CSR_MTVT));
+    (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 4)) = vector; /* 4-byte stride per table entry */
+#elif __RISCV_XLEN == 64
+    volatile uint64_t vec_base;
+    vec_base = ((uint64_t)__RV_CSR_READ(CSR_MTVT));
+    (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 8)) = vector; /* 8-byte stride per table entry */
+#else // TODO Need cover for XLEN=128 case in future
+    volatile uint64_t vec_base;
+    vec_base = ((uint64_t)__RV_CSR_READ(CSR_MTVT));
+    (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 8)) = vector;
+#endif
+}
+
+/**
+ * \brief  Get Interrupt Vector of a specific interrupt
+ * \details
+ * This function gets the interrupt handler address of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return        Interrupt handler address
+ * \remarks
+ * - IRQn must not be negative.
+ * - You can read \ref CSR_MTVT to get the interrupt vector table entry address.
+ * \sa
+ *     - \ref ECLIC_SetVector
+ */
+__STATIC_FORCEINLINE rv_csr_t __ECLIC_GetVector(IRQn_Type IRQn)
+{
+#if __RISCV_XLEN == 32
+    return (*(uint32_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*4));
+#elif __RISCV_XLEN == 64
+    return (*(uint64_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*8));
+#else // TODO Need cover for XLEN=128 case in future
+    return (*(uint64_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*8));
+#endif
+}
+
+/**
+ * \brief  Set Exception entry address
+ * \details
+ * This function sets the exception handler address in 'CSR_MTVEC'.
+ * \param [in]      addr  Exception handler address
+ * \remarks
+ * - The low 6 bits of addr are cleared (64-byte alignment) and ECLIC_MODE_MTVEC_Msk is OR-ed in before the write.
+ * \sa
+ * - \ref __get_exc_entry
+ */
+__STATIC_FORCEINLINE void __set_exc_entry(rv_csr_t addr)
+{
+    addr &= (rv_csr_t)(~0x3F);
+    addr |= ECLIC_MODE_MTVEC_Msk;
+    __RV_CSR_WRITE(CSR_MTVEC, addr);
+}
+
+/**
+ * \brief  Get Exception entry address
+ * \details
+ * This function gets the exception handler address from 'CSR_MTVEC'.
+ * \return       Exception handler address
+ * \remarks
+ * - The bits selected by ECLIC_MODE_MTVEC_Msk are cleared from the value read from 'CSR_MTVEC'.
+ * \sa
+ * - \ref __set_exc_entry
+ */
+__STATIC_FORCEINLINE rv_csr_t __get_exc_entry(void)
+{
+    rv_csr_t addr = __RV_CSR_READ(CSR_MTVEC);
+    return (addr & ~(rv_csr_t)ECLIC_MODE_MTVEC_Msk); /* widen before ~ so upper address bits survive */
+}
+
+/**
+ * \brief  Set Non-vector interrupt entry address
+ * \details
+ * This function sets the entry address used by non-vector interrupts.
+ * \param [in]      addr  Non-vector interrupt entry address
+ * \remarks
+ * - When bit0 of 'CSR_MTVT2' is 1, addr (with bit0 set) is written to 'CSR_MTVT2'.
+ * - When bit0 of 'CSR_MTVT2' is 0, addr is written to 'CSR_MTVEC' the same way __set_exc_entry does.
+ * \sa
+ * - \ref __get_nonvec_entry
+ */
+__STATIC_FORCEINLINE void __set_nonvec_entry(rv_csr_t addr)
+{
+    if ((__RV_CSR_READ(CSR_MTVT2) & 0x1) == 0) {
+        /* MTVT2 bit0 is clear: the entry is taken from MTVEC */
+        rv_csr_t mtvec = (addr & (rv_csr_t)(~0x3F)) | ECLIC_MODE_MTVEC_Msk;
+        __RV_CSR_WRITE(CSR_MTVEC, mtvec);
+    } else {
+        __RV_CSR_WRITE(CSR_MTVT2, addr | 0x01);
+    }
+}
+
+/**
+ * \brief  Get Non-vector interrupt entry address
+ * \details
+ * This function gets the entry address used by non-vector interrupts.
+ * \return      Non-vector interrupt handler address
+ * \remarks
+ * - When bit0 of 'CSR_MTVT2' is 1, the address is read from 'CSR_MTVT2' with bit0 cleared.
+ * - When bit0 of 'CSR_MTVT2' is 0, it is read from 'CSR_MTVEC' with the ECLIC_MODE_MTVEC_Msk bits cleared.
+ * \sa
+ * - \ref __set_nonvec_entry
+ */
+__STATIC_FORCEINLINE rv_csr_t __get_nonvec_entry(void)
+{
+    if (__RV_CSR_READ(CSR_MTVT2) & 0x1) {
+        return __RV_CSR_READ(CSR_MTVT2) & (~(rv_csr_t)(0x1));
+    } else {
+        rv_csr_t addr = __RV_CSR_READ(CSR_MTVEC);
+        return (addr & ~(rv_csr_t)ECLIC_MODE_MTVEC_Msk); /* widen before ~ so upper address bits survive */
+    }
+}
+
+/**
+ * \brief  Get NMI interrupt entry from 'CSR_MNVEC'
+ * \details
+ * This function gets the NMI interrupt address from 'CSR_MNVEC'.
+ * \return      NMI interrupt handler address
+ * \remarks
+ * - This function is used to get the NMI interrupt handler address from 'CSR_MNVEC'. If CSR_MMISC_CTL[9] = 1
+ *   'CSR_MNVEC' will be equal to mtvec. If CSR_MMISC_CTL[9] = 0 'CSR_MNVEC' will be equal to the reset vector.
+ * - NMI entry is defined via \ref CSR_MMISC_CTL, writing to \ref CSR_MNVEC will be ignored.
+ */
+__STATIC_FORCEINLINE rv_csr_t __get_nmi_entry(void)
+{
+    return __RV_CSR_READ(CSR_MNVEC);
+}
+
+/**
+ * \brief   Save necessary CSRs into variables for vector interrupt nesting
+ * \details
+ * This macro declares the variables used for saving the
+ * CSRs (MCAUSE, MEPC, MSUBM), reads the content of these CSRs into
+ * them and then enables interrupts. It needs to be used in a
+ * vector interrupt if nesting is required.
+ * \remarks
+ * - Interrupt will be enabled after this macro is called
+ * - It needs to be used together with \ref RESTORE_IRQ_CSR_CONTEXT
+ * - Don't use variable names __mcause, __mepc, __msubm in your ISR code
+ * - If you want to enable the interrupt nesting feature for a vector interrupt,
+ * you can do it like this:
+ * \code
+ * // __INTERRUPT attribute will generate function entry and exit sequences suitable
+ * // for use in an interrupt handler when this attribute is present
+ * __INTERRUPT void eclic_mtip_handler(void)
+ * {
+ *     // Must call this to save CSRs
+ *     SAVE_IRQ_CSR_CONTEXT();
+ *     // !!!Interrupt is enabled here!!!
+ *     // !!!Higher priority interrupt could nest it!!!
+ *
+ *     // put your own interrupt handling code here
+ *
+ *     // Must call this to restore CSRs
+ *     RESTORE_IRQ_CSR_CONTEXT();
+ * }
+ * \endcode
+ */
+#define SAVE_IRQ_CSR_CONTEXT()                                              \
+        rv_csr_t __mcause = __RV_CSR_READ(CSR_MCAUSE);                      \
+        rv_csr_t __mepc = __RV_CSR_READ(CSR_MEPC);                          \
+        rv_csr_t __msubm = __RV_CSR_READ(CSR_MSUBM);                        \
+        __enable_irq();
+
+/**
+ * \brief   Restore necessary CSRs from variables for vector interrupt nesting
+ * \details
+ * This macro disables interrupts and then restores the CSRs (MSUBM, MEPC, MCAUSE) from the
+ * variables __msubm, __mepc and __mcause declared by the \ref SAVE_IRQ_CSR_CONTEXT macro.
+ * \remarks
+ * - Interrupt will be disabled after this macro is called
+ * - It needs to be used together with \ref SAVE_IRQ_CSR_CONTEXT
+ */
+#define RESTORE_IRQ_CSR_CONTEXT()                                           \
+        __disable_irq();                                                    \
+        __RV_CSR_WRITE(CSR_MSUBM, __msubm);                                 \
+        __RV_CSR_WRITE(CSR_MEPC, __mepc);                                   \
+        __RV_CSR_WRITE(CSR_MCAUSE, __mcause);
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_IntExc */
+
+#endif /* defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_ECLIC__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_fpu.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_fpu.h
new file mode 100644
index 00000000..c9e13b79
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_fpu.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_FPU_H__
+#define __CORE_FEATURE_FPU_H__
+/*!
+ * @file     core_feature_fpu.h
+ * @brief    FPU feature API header file for Nuclei N/NX Core
+ */
+/*
+ * FPU Feature Configuration Macro:
+ * 1. __FPU_PRESENT:  Define whether Floating Point Unit(FPU) is present or not
+ *   * 0: Not present
+ *   * 1: Single precision FPU present, __RISCV_FLEN == 32
+ *   * 2: Double precision FPU present, __RISCV_FLEN == 64
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* ===== FPU Operations ===== */
+/**
+ * \defgroup NMSIS_Core_FPU_Functions   FPU Functions
+ * \ingroup  NMSIS_Core
+ * \brief    Functions that related to the RISC-V FPU (F and D extension).
+ * \details
+ *
+ * Nuclei provides the floating point unit through the RISC-V F and D extensions.
+ * * `F extension` adds single-precision floating-point computational
+ * instructions compliant with the IEEE 754-2008 arithmetic standard, __RISCV_FLEN = 32.
+ *   The F extension adds 32 floating-point registers, f0-f31, each 32 bits wide,
+ *   and a floating-point control and status register fcsr, which contains the
+ *   operating mode and exception status of the floating-point unit.
+ * * `D extension` adds double-precision floating-point computational instructions
+ * compliant with the IEEE 754-2008 arithmetic standard.
+ *   The D extension widens the 32 floating-point registers, f0-f31, to 64 bits, __RISCV_FLEN = 64
+ *   @{
+ */
+#if defined(__FPU_PRESENT) && (__FPU_PRESENT > 0)
+
+#if __FPU_PRESENT == 1
+  /** \brief Refer to the width of the floating point register in bits(either 32 or 64) */
+  #define __RISCV_FLEN          32
+#elif __FPU_PRESENT == 2
+  #define __RISCV_FLEN          64
+#else
+  #define __RISCV_FLEN          __riscv_flen
+#endif /* __FPU_PRESENT == 1 */
+
+/** \brief Get FCSR CSR Register */
+#define __get_FCSR()            __RV_CSR_READ(CSR_FCSR)
+/** \brief Set FCSR CSR Register with val */
+#define __set_FCSR(val)         __RV_CSR_WRITE(CSR_FCSR, (val))
+/** \brief Get FRM CSR Register */
+#define __get_FRM()             __RV_CSR_READ(CSR_FRM)
+/** \brief Set FRM CSR Register with val */
+#define __set_FRM(val)          __RV_CSR_WRITE(CSR_FRM, (val))
+/** \brief Get FFLAGS CSR Register */
+#define __get_FFLAGS()          __RV_CSR_READ(CSR_FFLAGS)
+/** \brief Set FFLAGS CSR Register with val */
+#define __set_FFLAGS(val)       __RV_CSR_WRITE(CSR_FFLAGS, (val))
+
+/** \brief Enable FPU Unit by setting the MSTATUS_FS bits in CSR_MSTATUS */
+#define __enable_FPU()          __RV_CSR_SET(CSR_MSTATUS, MSTATUS_FS)
+/**
+ * \brief Disable FPU Unit by clearing the MSTATUS_FS bits in CSR_MSTATUS
+ * \details
+ * * We can save power by disabling the FPU Unit.
+ * * When the FPU Unit is disabled, any access to FPU related CSR registers
+ * and FPU instructions will cause an Illegal Instruction Exception.
+ * */
+#define __disable_FPU()         __RV_CSR_CLEAR(CSR_MSTATUS, MSTATUS_FS)
+
+
+/**
+ * \brief   Load a single-precision value from memory into floating point register freg using flw instruction
+ * \details The FLW instruction loads a single-precision floating point value from memory
+ * address (addr + ofs) into floating point register freg(f0-f31)
+ * \param [in]    freg   The floating point register, eg. FREG(0), f0
+ * \param [in]    addr   The memory base address, 4 byte alignment required
+ * \param [in]    ofs    a 12-bit signed immediate byte offset, must be a constant ("I" asm constraint)
+ * \remarks
+ * * FLW and FSW operations need to make sure the address is 4 bytes aligned,
+ *   otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned)
+ * * FLW and FSW do not modify the bits being transferred; in particular, the payloads of non-canonical
+ * NaNs are preserved
+ *
+ */
+#define __RV_FLW(freg, addr, ofs)                              \
+    ({                                                         \
+        register rv_csr_t __addr = (rv_csr_t)(addr);           \
+        __ASM volatile("flw " STRINGIFY(freg) ", %0(%1)  "     \
+                     : : "I"(ofs), "r"(__addr)                 \
+                     : "memory");                              \
+    })
+
+/**
+ * \brief   Store a single-precision value from floating point register freg into memory using fsw instruction
+ * \details The FSW instruction stores a single-precision value from floating point register to memory
+ * \param [in]    freg   The floating point register(f0-f31), eg. FREG(0), f0
+ * \param [in]    addr   The memory base address, 4 byte alignment required
+ * \param [in]    ofs    a 12-bit signed immediate byte offset, must be a constant ("I" asm constraint)
+ * \remarks
+ * * FLW and FSW operations need to make sure the address is 4 bytes aligned,
+ *   otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned)
+ * * FLW and FSW do not modify the bits being transferred; in particular, the payloads of non-canonical
+ * NaNs are preserved
+ *
+ */
+#define __RV_FSW(freg, addr, ofs)                              \
+    ({                                                         \
+        register rv_csr_t __addr = (rv_csr_t)(addr);           \
+        __ASM volatile("fsw " STRINGIFY(freg) ", %0(%1)  "     \
+                     : : "I"(ofs), "r"(__addr)                 \
+                     : "memory");                              \
+    })
+
+/**
+ * \brief   Load a double-precision value from memory into floating point register freg using fld instruction
+ * \details The FLD instruction loads a double-precision floating point value from memory
+ * address (addr + ofs) into floating point register freg(f0-f31)
+ * \param [in]    freg   The floating point register, eg. FREG(0), f0
+ * \param [in]    addr   The memory base address, 8 byte alignment required
+ * \param [in]    ofs    a 12-bit signed immediate byte offset, must be a constant ("I" asm constraint)
+ * \attention
+ * * Function only available for double precision floating point unit, FLEN = 64
+ * \remarks
+ * * FLD and FSD operations need to make sure the address is 8 bytes aligned,
+ *   otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned)
+ * * FLD and FSD do not modify the bits being transferred; in particular, the payloads of non-canonical
+ * NaNs are preserved.
+ */
+#define __RV_FLD(freg, addr, ofs)                              \
+    ({                                                         \
+        register rv_csr_t __addr = (rv_csr_t)(addr);           \
+        __ASM volatile("fld " STRINGIFY(freg) ", %0(%1)  "     \
+                     : : "I"(ofs), "r"(__addr)                 \
+                     : "memory");                              \
+    })
+
+/**
+ * \brief   Store a double-precision value from floating point register freg into memory using fsd instruction
+ * \details The FSD instruction stores a double-precision value from floating point register to memory
+ * \param [in]    freg   The floating point register(f0-f31), eg. FREG(0), f0
+ * \param [in]    addr   The memory base address, 8 byte alignment required
+ * \param [in]    ofs    a 12-bit signed immediate byte offset, must be a constant ("I" asm constraint)
+ * \attention
+ * * Function only available for double precision floating point unit, FLEN = 64
+ * \remarks
+ * * FLD and FSD operations need to make sure the address is 8 bytes aligned,
+ *   otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned)
+ * * FLD and FSD do not modify the bits being transferred; in particular, the payloads of non-canonical
+ * NaNs are preserved.
+ *
+ */
+#define __RV_FSD(freg, addr, ofs)                              \
+    ({                                                         \
+        register rv_csr_t __addr = (rv_csr_t)(addr);           \
+        __ASM volatile("fsd " STRINGIFY(freg) ", %0(%1)  "     \
+                     : : "I"(ofs), "r"(__addr)                 \
+                     : "memory");                              \
+    })
+
+/**
+ * \def __RV_FLOAD
+ * \brief   Load a floating point value from memory into floating point register freg using flw/fld instruction
+ * \details
+ * * For Single-Precision Floating-Point Mode(__FPU_PRESENT == 1, __RISCV_FLEN == 32):
+ *   It will call \ref __RV_FLW to load a single-precision floating point value from memory to floating point register
+ * * For Double-Precision Floating-Point Mode(__FPU_PRESENT == 2, __RISCV_FLEN == 64):
+ *   It will call \ref __RV_FLD to load a double-precision floating point value from memory to floating point register
+ *
+ * \attention
+ * Function behaviour is different for __FPU_PRESENT = 1 or 2, please see the real function this macro represents
+ */
+/**
+ * \def __RV_FSTORE
+ * \brief   Store a floating point value from floating point register freg into memory using fsw/fsd instruction
+ * \details
+ * * For Single-Precision Floating-Point Mode(__FPU_PRESENT == 1, __RISCV_FLEN == 32):
+ *   It will call \ref __RV_FSW to store floating point register into memory
+ * * For Double-Precision Floating-Point Mode(__FPU_PRESENT == 2, __RISCV_FLEN == 64):
+ *   It will call \ref __RV_FSD to store floating point register into memory
+ *
+ * \attention
+ * Function behaviour is different for __FPU_PRESENT = 1 or 2, please see the real function this macro represents
+ */
+#if __FPU_PRESENT == 1
+#define __RV_FLOAD              __RV_FLW
+#define __RV_FSTORE             __RV_FSW
+/** \brief Type of FPU register, depends on the FLEN defined in RISC-V */
+typedef uint32_t rv_fpu_t;
+#elif __FPU_PRESENT == 2
+#define __RV_FLOAD              __RV_FLD
+#define __RV_FSTORE             __RV_FSD
+/** \brief Type of FPU register, depends on the FLEN defined in RISC-V */
+typedef uint64_t rv_fpu_t;
+#endif /* __FPU_PRESENT == 2 */
+
+/**
+ * \brief   Save FPU context into variables for interrupt nesting
+ * \details
+ * This macro declares the array __fpu_context and stores 20 FPU registers
+ * (FREG 0-7, 10-17 and 28-31) into it using \ref __RV_FSTORE.
+ * It needs to be used in an interrupt handler when FPU registers are used
+ * in that handler.
+ * \remarks
+ * - It needs to be used together with \ref RESTORE_FPU_CONTEXT
+ * - Don't use the variable name __fpu_context in your ISR code
+ * - If your ISR code uses fpu registers and this interrupt can be nested,
+ * then you can do it like this:
+ * \code
+ * void eclic_mtip_handler(void)
+ * {
+ *     // !!!Interrupt is enabled here!!!
+ *     // !!!Higher priority interrupt could nest it!!!
+ *
+ *     // Necessary only when you need to use fpu registers
+ *     // in this isr handler functions
+ *     SAVE_FPU_CONTEXT();
+ *
+ *     // put your own interrupt handling code here
+ *
+ *     // pair of SAVE_FPU_CONTEXT()
+ *     RESTORE_FPU_CONTEXT();
+ * }
+ * \endcode
+ */
+#define SAVE_FPU_CONTEXT()                                                  \
+        rv_fpu_t __fpu_context[20];                                         \
+        __RV_FSTORE(FREG(0),  __fpu_context, 0  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(1),  __fpu_context, 1  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(2),  __fpu_context, 2  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(3),  __fpu_context, 3  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(4),  __fpu_context, 4  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(5),  __fpu_context, 5  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(6),  __fpu_context, 6  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(7),  __fpu_context, 7  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(10), __fpu_context, 8  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(11), __fpu_context, 9  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(12), __fpu_context, 10 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(13), __fpu_context, 11 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(14), __fpu_context, 12 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(15), __fpu_context, 13 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(16), __fpu_context, 14 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(17), __fpu_context, 15 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(28), __fpu_context, 16 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(29), __fpu_context, 17 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(30), __fpu_context, 18 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(31), __fpu_context, 19 << LOG_FPREGBYTES);
+
+/**
+ * \brief   Restore necessary fpu registers from variables for interrupt nesting
+ * \details
+ * This macro reloads the fpu registers (FREG 0-7, 10-17 and 28-31) from the __fpu_context
+ * array declared by the \ref SAVE_FPU_CONTEXT macro, using \ref __RV_FLOAD.
+ * \remarks
+ * - It needs to be used together with \ref SAVE_FPU_CONTEXT
+ */
+#define RESTORE_FPU_CONTEXT()                                               \
+        __RV_FLOAD(FREG(0),  __fpu_context, 0  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(1),  __fpu_context, 1  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(2),  __fpu_context, 2  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(3),  __fpu_context, 3  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(4),  __fpu_context, 4  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(5),  __fpu_context, 5  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(6),  __fpu_context, 6  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(7),  __fpu_context, 7  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(10), __fpu_context, 8  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(11), __fpu_context, 9  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(12), __fpu_context, 10 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(13), __fpu_context, 11 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(14), __fpu_context, 12 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(15), __fpu_context, 13 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(16), __fpu_context, 14 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(17), __fpu_context, 15 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(28), __fpu_context, 16 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(29), __fpu_context, 17 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(30), __fpu_context, 18 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(31), __fpu_context, 19 << LOG_FPREGBYTES);
+#else
+#define SAVE_FPU_CONTEXT()
+#define RESTORE_FPU_CONTEXT()
+#endif /* __FPU_PRESENT > 0 */
+/** @} */ /* End of Doxygen Group NMSIS_Core_FPU_Functions */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_FPU_H__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_pmp.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_pmp.h
new file mode 100644
index 00000000..997dfaee
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_pmp.h
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_PMP_H__
+#define __CORE_FEATURE_PMP_H__
+/*!
+ * @file     core_feature_pmp.h
+ * @brief    PMP feature API header file for Nuclei N/NX Core
+ */
+/*
+ * PMP Feature Configuration Macro:
+ * 1. __PMP_PRESENT:  Define whether Physical Memory Protection(PMP) is present or not
+ *   * 0: Not present
+ *   * 1: Present
+ * 2. __PMP_ENTRY_NUM:  Define the number of PMP entries; only 8 or 16 can be configured.
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__PMP_PRESENT) && (__PMP_PRESENT == 1)
+/* ===== PMP Operations ===== */
+/**
+ * \defgroup NMSIS_Core_PMP_Functions   PMP Functions
+ * \ingroup  NMSIS_Core
+ * \brief    Functions related to the RISC-V Physical Memory Protection.
+ * \details
+ * Optional physical memory protection (PMP) unit provides per-hart machine-mode
+ * control registers to allow physical memory access privileges (read, write, execute)
+ * to be specified for each physical memory region.
+ *
+ * The PMP supports region access control settings as small as four bytes.
+ *
+ *   @{
+ */
+#ifndef __PMP_ENTRY_NUM
+/* numbers of PMP entries(__PMP_ENTRY_NUM) should be defined in <Device.h> */
+#error "__PMP_ENTRY_NUM is not defined, please check!"
+#endif
+
+/**
+ * \brief   Read the 8-bit PMPxCFG field of one PMP entry
+ * \details Reads the PMPCFG CSR that packs the requested entry and extracts
+ *          the byte that belongs to it.
+ * \param [in]    idx    PMP region index(0-15)
+ * \return               PMPxCFG field value, or 0 when idx >= __PMP_ENTRY_NUM
+ */
+__STATIC_INLINE uint8_t __get_PMPxCFG(uint32_t idx)
+{
+    rv_csr_t packed = 0;
+    uint32_t slot = idx;    /* position of the entry inside its CSR */
+
+    if (idx >= __PMP_ENTRY_NUM) {
+        return 0;
+    }
+#if __RISCV_XLEN == 32
+    /* RV32: four 8-bit fields per CSR, spread over pmpcfg0..pmpcfg3 */
+    switch (idx >> 2) {
+        case 0:
+            packed = __RV_CSR_READ(CSR_PMPCFG0);
+            break;
+        case 1:
+            slot = idx - 4;
+            packed = __RV_CSR_READ(CSR_PMPCFG1);
+            break;
+        case 2:
+            slot = idx - 8;
+            packed = __RV_CSR_READ(CSR_PMPCFG2);
+            break;
+        default:
+            slot = idx - 12;
+            packed = __RV_CSR_READ(CSR_PMPCFG3);
+            break;
+    }
+    return (uint8_t)((packed >> (slot << 3)) & 0xFF);
+#elif __RISCV_XLEN == 64
+    /* RV64: eight 8-bit fields per CSR, held in pmpcfg0 and pmpcfg2 */
+    if (idx < 8) {
+        packed = __RV_CSR_READ(CSR_PMPCFG0);
+    } else {
+        slot = idx - 8;
+        packed = __RV_CSR_READ(CSR_PMPCFG2);
+    }
+    return (uint8_t)((packed >> (slot << 3)) & 0xFF);
+#else
+    // TODO Add RV128 Handling
+    return 0;
+#endif
+}
+
+/**
+ * \brief   Write the 8-bit PMPxCFG field of one PMP entry
+ * \details Read-modify-writes the PMPCFG CSR that packs the requested entry:
+ *          only the byte that belongs to idx is replaced, the other fields
+ *          of that CSR are written back as read.
+ *          Nothing is written when idx >= __PMP_ENTRY_NUM.
+ * \param [in]    idx      PMPx region index(0-15)
+ * \param [in]    pmpxcfg  PMPxCFG register value to set
+ */
+__STATIC_INLINE void __set_PMPxCFG(uint32_t idx, uint8_t pmpxcfg)
+{
+    rv_csr_t merged = 0;
+
+    if (idx >= __PMP_ENTRY_NUM) {
+        return;
+    }
+#if __RISCV_XLEN == 32
+    /* RV32: four fields per CSR; every index from 12 upwards maps to pmpcfg3 */
+    const uint32_t bank = (idx < 12) ? (idx >> 2) : 3;
+    const uint32_t shift = (idx - (bank << 2)) << 3;
+    const rv_csr_t keep = ~(0xFFUL << shift);
+    const rv_csr_t field = (rv_csr_t)pmpxcfg << shift;
+
+    switch (bank) {
+        case 0:
+            merged = (__RV_CSR_READ(CSR_PMPCFG0) & keep) | field;
+            __RV_CSR_WRITE(CSR_PMPCFG0, merged);
+            break;
+        case 1:
+            merged = (__RV_CSR_READ(CSR_PMPCFG1) & keep) | field;
+            __RV_CSR_WRITE(CSR_PMPCFG1, merged);
+            break;
+        case 2:
+            merged = (__RV_CSR_READ(CSR_PMPCFG2) & keep) | field;
+            __RV_CSR_WRITE(CSR_PMPCFG2, merged);
+            break;
+        default:
+            merged = (__RV_CSR_READ(CSR_PMPCFG3) & keep) | field;
+            __RV_CSR_WRITE(CSR_PMPCFG3, merged);
+            break;
+    }
+#elif __RISCV_XLEN == 64
+    /* RV64: eight fields per CSR, held in pmpcfg0 and pmpcfg2 */
+    const uint32_t shift = ((idx < 8) ? idx : (idx - 8)) << 3;
+    const rv_csr_t keep = ~(0xFFULL << shift);
+    const rv_csr_t field = (rv_csr_t)pmpxcfg << shift;
+
+    if (idx < 8) {
+        merged = (__RV_CSR_READ(CSR_PMPCFG0) & keep) | field;
+        __RV_CSR_WRITE(CSR_PMPCFG0, merged);
+    } else {
+        merged = (__RV_CSR_READ(CSR_PMPCFG2) & keep) | field;
+        __RV_CSR_WRITE(CSR_PMPCFG2, merged);
+    }
+#else
+    // TODO Add RV128 Handling
+#endif
+}
+
+/**
+ * \brief   Get PMPCFGx Register by index
+ * \details Return the content of the PMPCFGx Register.
+ * \param [in]    idx    PMPCFG CSR index(0-3)
+ * \return               PMPCFGx Register value
+ * \remark
+ * - For RV64, only idx = 0 and idx = 2 is allowed.
+ *   pmpcfg0 and pmpcfg2 hold the configurations
+ *   for the 16 PMP entries, pmpcfg1 and pmpcfg3 are illegal
+ * - For RV32, pmpcfg0–pmpcfg3, hold the configurations
+ *   pmp0cfg–pmp15cfg for the 16 PMP entries
+ */
+__STATIC_INLINE rv_csr_t __get_PMPCFGx(uint32_t idx)
+{
+    switch (idx) {
+        case 0: return __RV_CSR_READ(CSR_PMPCFG0);
+        case 1: return __RV_CSR_READ(CSR_PMPCFG1);
+        case 2: return __RV_CSR_READ(CSR_PMPCFG2);
+        case 3: return __RV_CSR_READ(CSR_PMPCFG3);
+        default: return 0;
+    }
+}
+
+/**
+ * \brief   Set PMPCFGx by index
+ * \details Write the given value to the PMPCFGx Register.
+ * \param [in]    idx      PMPCFG CSR index(0-3)
+ * \param [in]    pmpcfg   PMPCFGx Register value to set
+ * \remark
+ * - For RV64, only idx = 0 and idx = 2 is allowed.
+ *   pmpcfg0 and pmpcfg2 hold the configurations
+ *   for the 16 PMP entries, pmpcfg1 and pmpcfg3 are illegal
+ * - For RV32, pmpcfg0–pmpcfg3, hold the configurations
+ *   pmp0cfg–pmp15cfg for the 16 PMP entries
+ */
+__STATIC_INLINE void __set_PMPCFGx(uint32_t idx, rv_csr_t pmpcfg)
+{
+    switch (idx) {
+        case 0: __RV_CSR_WRITE(CSR_PMPCFG0, pmpcfg); break;
+        case 1: __RV_CSR_WRITE(CSR_PMPCFG1, pmpcfg); break;
+        case 2: __RV_CSR_WRITE(CSR_PMPCFG2, pmpcfg); break;
+        case 3: __RV_CSR_WRITE(CSR_PMPCFG3, pmpcfg); break;
+        default: return;
+    }
+}
+
+/**
+ * \brief   Get PMPADDRx Register by index
+ * \details Return the content of the PMPADDRx Register.
+ *          In line with \ref __get_PMPxCFG, an index at or beyond
+ *          __PMP_ENTRY_NUM is rejected before any CSR is accessed.
+ * \param [in]    idx    PMP region index(0-15)
+ * \return               PMPADDRx Register value, or 0 when idx is not below
+ *                       __PMP_ENTRY_NUM or is larger than 15
+ */
+__STATIC_INLINE rv_csr_t __get_PMPADDRx(uint32_t idx)
+{
+    /* Same bound as the PMPxCFG accessors: only configured entries are read */
+    if (idx >= __PMP_ENTRY_NUM) {
+        return 0;
+    }
+    switch (idx) {
+        case 0: return __RV_CSR_READ(CSR_PMPADDR0);
+        case 1: return __RV_CSR_READ(CSR_PMPADDR1);
+        case 2: return __RV_CSR_READ(CSR_PMPADDR2);
+        case 3: return __RV_CSR_READ(CSR_PMPADDR3);
+        case 4: return __RV_CSR_READ(CSR_PMPADDR4);
+        case 5: return __RV_CSR_READ(CSR_PMPADDR5);
+        case 6: return __RV_CSR_READ(CSR_PMPADDR6);
+        case 7: return __RV_CSR_READ(CSR_PMPADDR7);
+        case 8: return __RV_CSR_READ(CSR_PMPADDR8);
+        case 9: return __RV_CSR_READ(CSR_PMPADDR9);
+        case 10: return __RV_CSR_READ(CSR_PMPADDR10);
+        case 11: return __RV_CSR_READ(CSR_PMPADDR11);
+        case 12: return __RV_CSR_READ(CSR_PMPADDR12);
+        case 13: return __RV_CSR_READ(CSR_PMPADDR13);
+        case 14: return __RV_CSR_READ(CSR_PMPADDR14);
+        case 15: return __RV_CSR_READ(CSR_PMPADDR15);
+        default: return 0;
+    }
+}
+
+/**
+ * \brief   Set PMPADDRx by index
+ * \details Write the given value to the PMPADDRx Register.
+ *          In line with \ref __set_PMPxCFG, an index at or beyond
+ *          __PMP_ENTRY_NUM is rejected before any CSR is accessed;
+ *          an index larger than 15 is ignored as well.
+ * \param [in]    idx      PMP region index(0-15)
+ * \param [in]    pmpaddr  PMPADDRx Register value to set
+ */
+__STATIC_INLINE void __set_PMPADDRx(uint32_t idx, rv_csr_t pmpaddr)
+{
+    /* Same bound as the PMPxCFG accessors: only configured entries are written */
+    if (idx >= __PMP_ENTRY_NUM) {
+        return;
+    }
+    switch (idx) {
+        case 0: __RV_CSR_WRITE(CSR_PMPADDR0, pmpaddr); break;
+        case 1: __RV_CSR_WRITE(CSR_PMPADDR1, pmpaddr); break;
+        case 2: __RV_CSR_WRITE(CSR_PMPADDR2, pmpaddr); break;
+        case 3: __RV_CSR_WRITE(CSR_PMPADDR3, pmpaddr); break;
+        case 4: __RV_CSR_WRITE(CSR_PMPADDR4, pmpaddr); break;
+        case 5: __RV_CSR_WRITE(CSR_PMPADDR5, pmpaddr); break;
+        case 6: __RV_CSR_WRITE(CSR_PMPADDR6, pmpaddr); break;
+        case 7: __RV_CSR_WRITE(CSR_PMPADDR7, pmpaddr); break;
+        case 8: __RV_CSR_WRITE(CSR_PMPADDR8, pmpaddr); break;
+        case 9: __RV_CSR_WRITE(CSR_PMPADDR9, pmpaddr); break;
+        case 10: __RV_CSR_WRITE(CSR_PMPADDR10, pmpaddr); break;
+        case 11: __RV_CSR_WRITE(CSR_PMPADDR11, pmpaddr); break;
+        case 12: __RV_CSR_WRITE(CSR_PMPADDR12, pmpaddr); break;
+        case 13: __RV_CSR_WRITE(CSR_PMPADDR13, pmpaddr); break;
+        case 14: __RV_CSR_WRITE(CSR_PMPADDR14, pmpaddr); break;
+        case 15: __RV_CSR_WRITE(CSR_PMPADDR15, pmpaddr); break;
+        default: return;
+    }
+}
+/** @} */ /* End of Doxygen Group NMSIS_Core_PMP_Functions */
+#endif /* defined(__PMP_PRESENT) && (__PMP_PRESENT == 1) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_PMP_H__  */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_timer.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_timer.h
new file mode 100644
index 00000000..6e9b7af3
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/core_feature_timer.h
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_TIMER_H__
+#define __CORE_FEATURE_TIMER_H__
+/*!
+ * @file     core_feature_timer.h
+ * @brief    System Timer feature API header file for Nuclei N/NX Core
+ */
+/*
+ * System Timer Feature Configuration Macro:
+ * 1. __SYSTIMER_PRESENT:  Define whether Private System Timer is present or not.
+ *   * 0: Not present
+ *   * 1: Present
+ * 2. __SYSTIMER_BASEADDR:  Define the base address of the System Timer.
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__SYSTIMER_PRESENT) && (__SYSTIMER_PRESENT == 1)
+/**
+ * \defgroup NMSIS_Core_SysTimer_Registers     Register Define and Type Definitions Of System Timer
+ * \ingroup NMSIS_Core_Registers
+ * \brief   Type definitions and defines for system timer registers.
+ *
+ * @{
+ */
+/**
+ * \brief  Structure type to access the System Timer (SysTimer).
+ * \details
+ * Structure definition to access the system timer(SysTimer).
+ * The members follow the register offsets given in the per-member comments:
+ * two 64-bit registers starting at offset 0x000, a reserved gap, and four
+ * 32-bit words at offsets 0xFF0-0xFFC.
+ * \remarks
+ * - MSFTRST register is introduced in Nuclei N Core version 1.3(\ref __NUCLEI_N_REV >= 0x0103)
+ * - MSTOP register is renamed to MTIMECTL register in Nuclei N Core version 1.4(\ref __NUCLEI_N_REV >= 0x0104)
+ * - CMPCLREN and CLKSRC bit in MTIMECTL register is introduced in Nuclei N Core version 1.4(\ref __NUCLEI_N_REV >= 0x0104)
+ */
+typedef struct {
+    __IOM uint64_t MTIMER;                  /*!< Offset: 0x000 (R/W)  System Timer current value 64bits Register */
+    __IOM uint64_t MTIMERCMP;               /*!< Offset: 0x008 (R/W)  System Timer compare Value 64bits Register */
+    __IOM uint32_t RESERVED0[0x3F8];        /*!< Offset: 0x010 - 0xFEC Reserved (0x3F8 words, padding up to MSFTRST) */
+    __IOM uint32_t MSFTRST;                 /*!< Offset: 0xFF0 (R/W)  System Timer Software Core Reset Register */
+    __IOM uint32_t RESERVED1;               /*!< Offset: 0xFF4 Reserved */
+    __IOM uint32_t MTIMECTL;                /*!< Offset: 0xFF8 (R/W)  System Timer Control Register, previously MSTOP register */
+    __IOM uint32_t MSIP;                    /*!< Offset: 0xFFC (R/W)  System Timer SW interrupt Register */
+} SysTimer_Type;
+
+/* Timer Control / Status Register Definitions */
+/* MTIMECTL register: bit positions and single-bit masks */
+#define SysTimer_MTIMECTL_TIMESTOP_Pos      0U                                          /*!< SysTick Timer MTIMECTL: TIMESTOP bit Position */
+#define SysTimer_MTIMECTL_TIMESTOP_Msk      (1UL << SysTimer_MTIMECTL_TIMESTOP_Pos)     /*!< SysTick Timer MTIMECTL: TIMESTOP Mask */
+#define SysTimer_MTIMECTL_CMPCLREN_Pos      1U                                          /*!< SysTick Timer MTIMECTL: CMPCLREN bit Position */
+#define SysTimer_MTIMECTL_CMPCLREN_Msk      (1UL << SysTimer_MTIMECTL_CMPCLREN_Pos)     /*!< SysTick Timer MTIMECTL: CMPCLREN Mask */
+#define SysTimer_MTIMECTL_CLKSRC_Pos        2U                                          /*!< SysTick Timer MTIMECTL: CLKSRC bit Position */
+#define SysTimer_MTIMECTL_CLKSRC_Msk        (1UL << SysTimer_MTIMECTL_CLKSRC_Pos)       /*!< SysTick Timer MTIMECTL: CLKSRC Mask */
+
+/* MSIP register: bit position and single-bit mask */
+#define SysTimer_MSIP_MSIP_Pos              0U                                          /*!< SysTick Timer MSIP: MSIP bit Position */
+#define SysTimer_MSIP_MSIP_Msk              (1UL << SysTimer_MSIP_MSIP_Pos)             /*!< SysTick Timer MSIP: MSIP Mask */
+
+/* Whole-register value masks: 64 bits for MTIMER/MTIMERCMP, 32 bits for the others */
+#define SysTimer_MTIMER_Msk                 (0xFFFFFFFFFFFFFFFFULL)                     /*!< SysTick Timer MTIMER value Mask */
+#define SysTimer_MTIMERCMP_Msk              (0xFFFFFFFFFFFFFFFFULL)                     /*!< SysTick Timer MTIMERCMP value Mask */
+#define SysTimer_MTIMECTL_Msk               (0xFFFFFFFFUL)                              /*!< SysTick Timer MTIMECTL/MSTOP value Mask */
+#define SysTimer_MSIP_Msk                   (0xFFFFFFFFUL)                              /*!< SysTick Timer MSIP   value Mask */
+#define SysTimer_MSFTRST_Msk                (0xFFFFFFFFUL)                              /*!< SysTick Timer MSFTRST value Mask */
+
+/* Key that SysTimer_SoftwareReset writes to MSFTRST to request a software reset */
+#define SysTimer_MSFRST_KEY                 (0x80000A5FUL)                              /*!< SysTick Timer Software Reset Request Key */
+
+/* The register block has no default location: the device header must supply it */
+#ifndef __SYSTIMER_BASEADDR
+/* Base address of SYSTIMER(__SYSTIMER_BASEADDR) should be defined in <Device.h> */
+#error "__SYSTIMER_BASEADDR is not defined, please check!"
+#endif
+/* System Timer Memory mapping of Device  */
+#define SysTimer_BASE                       __SYSTIMER_BASEADDR                         /*!< SysTick Base Address */
+#define SysTimer                            ((SysTimer_Type *) SysTimer_BASE)           /*!< SysTick configuration struct */
+/** @} */ /* end of group NMSIS_Core_SysTimer_Registers */
+
+/* ##################################    SysTimer function  ############################################ */
+/**
+ * \defgroup NMSIS_Core_SysTimer SysTimer Functions
+ * \brief    Functions that configure the Core System Timer.
+ * @{
+ */
+/**
+ * \brief  Set system timer load value
+ * \details
+ * This function set the system timer load value in MTIMER register.
+ * \param [in]  value   value to set system timer MTIMER register.
+ * \remarks
+ * - Load value is 64bits wide.
+ * - \ref SysTimer_GetLoadValue
+ */
+__STATIC_FORCEINLINE void SysTimer_SetLoadValue(uint64_t value)
+{
+    SysTimer->MTIMER = value;
+}
+
+/**
+ * \brief  Get system timer load value
+ * \details
+ * This function get the system timer current value in MTIMER register.
+ * \return  current value(64bit) of system timer MTIMER register.
+ * \remarks
+ * - Load value is 64bits wide.
+ * - \ref SysTimer_SetLoadValue
+ */
+__STATIC_FORCEINLINE uint64_t SysTimer_GetLoadValue(void)
+{
+    return SysTimer->MTIMER;
+}
+
+/**
+ * \brief  Set system timer compare value
+ * \details
+ * This function set the system Timer compare value in MTIMERCMP register.
+ * \param [in]  value   compare value to set system timer MTIMERCMP register.
+ * \remarks
+ * - Compare value is 64bits wide.
+ * - If compare value is larger than current value timer interrupt generate.
+ * - Modify the load value or compare value less to clear the interrupt.
+ * - \ref SysTimer_GetCompareValue
+ */
+__STATIC_FORCEINLINE void SysTimer_SetCompareValue(uint64_t value)
+{
+    SysTimer->MTIMERCMP = value;
+}
+
+/**
+ * \brief  Read back the system timer compare value
+ * \details
+ * Returns the content of the MTIMERCMP register.
+ * \return  compare value of system timer MTIMERCMP register.
+ * \remarks
+ * - Compare value is 64bits wide.
+ * - \ref SysTimer_SetCompareValue
+ */
+__STATIC_FORCEINLINE uint64_t SysTimer_GetCompareValue(void)
+{
+    const uint64_t compare = SysTimer->MTIMERCMP;
+
+    return compare;
+}
+
+/**
+ * \brief  Let the system timer counter run
+ * \details
+ * Clears the TIMESTOP bit in the MTIMECTL register and leaves
+ * the other bits as they were read.
+ */
+__STATIC_FORCEINLINE void SysTimer_Start(void)
+{
+    uint32_t ctl = SysTimer->MTIMECTL;
+
+    ctl &= ~(SysTimer_MTIMECTL_TIMESTOP_Msk);
+    SysTimer->MTIMECTL = ctl;
+}
+
+/**
+ * \brief  Halt the system timer counter
+ * \details
+ * Sets the TIMESTOP bit in the MTIMECTL register and leaves
+ * the other bits as they were read.
+ */
+__STATIC_FORCEINLINE void SysTimer_Stop(void)
+{
+    uint32_t ctl = SysTimer->MTIMECTL;
+
+    ctl |= SysTimer_MTIMECTL_TIMESTOP_Msk;
+    SysTimer->MTIMECTL = ctl;
+}
+
+/**
+ * \brief  Set system timer control value
+ * \details
+ * Writes the given value, masked with \ref SysTimer_MTIMECTL_Msk, to the
+ * MTIMECTL register.
+ * \param [in]  mctl    value to set MTIMECTL register
+ * \remarks
+ * - Bit TIMESTOP is used to start and stop timer.
+ *   Clear TIMESTOP bit to 0 to start timer, otherwise to stop timer.
+ * - Bit CMPCLREN is used to enable auto MTIMER clear to zero when MTIMER >= MTIMERCMP.
+ *   Clear CMPCLREN bit to 0 to stop auto clear MTIMER feature, otherwise to enable it.
+ * - Bit CLKSRC is used to select timer clock source.
+ *   Clear CLKSRC bit to 0 to use *mtime_toggle_a*, otherwise use *core_clk_aon*
+ * - \ref SysTimer_GetControlValue
+ */
+__STATIC_FORCEINLINE void SysTimer_SetControlValue(uint32_t mctl)
+{
+    const uint32_t masked = (uint32_t)(mctl & SysTimer_MTIMECTL_Msk);
+
+    SysTimer->MTIMECTL = masked;
+}
+
+/**
+ * \brief  Get system timer control value
+ * \details
+ * Reads the MTIMECTL register and returns it masked with
+ * \ref SysTimer_MTIMECTL_Msk.
+ * \return  MTIMECTL register value
+ * \remarks
+ * - \ref SysTimer_SetControlValue
+ */
+__STATIC_FORCEINLINE uint32_t SysTimer_GetControlValue(void)
+{
+    const uint32_t ctl = SysTimer->MTIMECTL;
+
+    return (uint32_t)(ctl & SysTimer_MTIMECTL_Msk);
+}
+
+/**
+ * \brief  Trigger or set software interrupt via system timer
+ * \details
+ * Sets the MSIP bit in the MSIP register; the other bits are written back as read.
+ * \remarks
+ * - Set system timer MSIP bit and generate a SW interrupt.
+ * - \ref SysTimer_ClearSWIRQ
+ * - \ref SysTimer_GetMsipValue
+ */
+__STATIC_FORCEINLINE void SysTimer_SetSWIRQ(void)
+{
+    uint32_t msip = SysTimer->MSIP;
+
+    msip |= SysTimer_MSIP_MSIP_Msk;
+    SysTimer->MSIP = msip;
+}
+
+/**
+ * \brief  Clear system timer software interrupt pending request
+ * \details
+ * Clears the MSIP bit in the MSIP register; the other bits are written back as read.
+ * \remarks
+ * - Clear system timer MSIP bit in MSIP register to clear the software interrupt pending.
+ * - \ref SysTimer_SetSWIRQ
+ * - \ref SysTimer_GetMsipValue
+ */
+__STATIC_FORCEINLINE void SysTimer_ClearSWIRQ(void)
+{
+    uint32_t msip = SysTimer->MSIP;
+
+    msip &= ~SysTimer_MSIP_MSIP_Msk;
+    SysTimer->MSIP = msip;
+}
+
+/**
+ * \brief  Get system timer MSIP register value
+ * \details
+ * Reads the MSIP register and returns it masked with \ref SysTimer_MSIP_Msk.
+ * \return    Value of Timer MSIP register.
+ * \remarks
+ * - Bit0 is SW interrupt flag.
+ *   Bit0 is 1 then SW interrupt set. Bit0 is 0 then SW interrupt clear.
+ * - \ref SysTimer_SetSWIRQ
+ * - \ref SysTimer_ClearSWIRQ
+ */
+__STATIC_FORCEINLINE uint32_t SysTimer_GetMsipValue(void)
+{
+    const uint32_t msip = SysTimer->MSIP;
+
+    return (uint32_t)(msip & SysTimer_MSIP_Msk);
+}
+
+/**
+ * \brief  Set system timer MSIP register value
+ * \details
+ * Writes the given value, masked with \ref SysTimer_MSIP_Msk, to the MSIP register.
+ * \param [in]  msip   value to set MSIP register
+ */
+__STATIC_FORCEINLINE void SysTimer_SetMsipValue(uint32_t msip)
+{
+    const uint32_t masked = (uint32_t)(msip & SysTimer_MSIP_Msk);
+
+    SysTimer->MSIP = masked;
+}
+
+/**
+ * \brief  Do software reset request
+ * \details
+ * This function requests a software reset through the system timer:
+ * - It writes \ref SysTimer_MSFRST_KEY to the MSFTRST register to generate the request
+ * - The software request flag can be cleared by reset operation to clear
+ * \remarks
+ * - The software reset is sent to SoC, SoC need to generate reset signal and send back to Core
+ * - This function will not return, it loops forever to wait for the Core reset to happen
+ */
+__STATIC_FORCEINLINE void SysTimer_SoftwareReset(void)
+{
+    SysTimer->MSFTRST = SysTimer_MSFRST_KEY;
+    for (;;) {
+        /* spin here until the reset takes effect */
+    }
+}
+
+#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U) && defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1)
+/**
+ * \brief   System Tick Configuration
+ * \details Initializes the System Timer and its non-vector interrupt, and starts the System Tick Timer.
+ *
+ *  In our default implementation, the timer counter will be set to zero, and it will start a timer compare non-vector interrupt
+ *  when it matches the ticks the user set. During the timer interrupt the user should reload the system tick using the \ref SysTick_Reload function
+ *  or a similar function written by the user, so it can produce a periodic timer interrupt.
+ * \param [in]  ticks  Number of ticks between two interrupts.
+ * \return          0  Function succeeded.
+ * \return          1  Function failed (this default implementation always returns 0).
+ * \remarks
+ * - For \ref __NUCLEI_N_REV >= 0x0104, the CMPCLREN bit in MTIMECTL is introduced,
+ *   but we assume that the CMPCLREN bit is set to 0, so MTIMER register will not be
+ *   auto cleared to 0 when MTIMER >= MTIMERCMP.
+ * - When the variable \ref __Vendor_SysTickConfig is set to 1, then the
+ *   function \ref SysTick_Config is not included.
+ * - In this case, the file <b><Device>.h</b> must contain a vendor-specific implementation
+ *   of this function.
+ * - If the user needs this function to start a periodic timer interrupt, then in the timer interrupt handler
+ *   routine code, the user should call \ref SysTick_Reload with ticks to reload the timer.
+ * - This function is only available when __SYSTIMER_PRESENT == 1 and __ECLIC_PRESENT == 1 and __Vendor_SysTickConfig == 0
+ * \sa
+ * - \ref SysTimer_SetCompareValue; SysTimer_SetLoadValue
+ */
+__STATIC_INLINE uint32_t SysTick_Config(uint64_t ticks)
+{
+    /* Restart the counter from zero and arm the compare register with the period */
+    SysTimer_SetLoadValue(0);
+    SysTimer_SetCompareValue(ticks);
+    /* Route the timer interrupt as non-vector at level 0, then enable it in the ECLIC */
+    ECLIC_SetShvIRQ(SysTimer_IRQn, ECLIC_NON_VECTOR_INTERRUPT);
+    ECLIC_SetLevelIRQ(SysTimer_IRQn, 0);
+    ECLIC_EnableIRQ(SysTimer_IRQn);
+    return (0UL);
+}
+
+/**
+ * \brief   System Tick Reload
+ * \details Re-arms the System Timer for the next tick: MTIMERCMP is moved
+ *          `ticks` ahead of the current MTIMER value.
+ *
+ * \param [in]  ticks  Number of ticks between two interrupts.
+ * \return          0  Function succeeded.
+ * \return          1  Function failed.
+ * \remarks
+ * - For \ref __NUCLEI_N_REV >= 0x0104, the CMPCLREN bit in MTIMECTL is introduced,
+ *   but for this \ref SysTick_Config function, we assume this CMPCLREN bit is set to 0,
+ *   so in interrupt handler function, user still need to set the MTIMERCMP or MTIMER to reload
+ *   the system tick, if vendor want to use this timer's auto clear feature, they can define
+ *   \ref __Vendor_SysTickConfig to 1, and implement \ref SysTick_Config and \ref SysTick_Reload functions.
+ * - When the variable \ref __Vendor_SysTickConfig is set to 1, then the
+ *   function \ref SysTick_Reload is not included.
+ * - In this case, the file <b><Device>.h</b> must contain a vendor-specific implementation
+ *   of this function.
+ * - This function only available when __SYSTIMER_PRESENT == 1 and __ECLIC_PRESENT == 1 and __Vendor_SysTickConfig == 0
+ * - Since the MTIMERCMP value might overflow, if overflowed, MTIMER will be set to 0, and MTIMERCMP set to ticks
+ * \sa
+ * - \ref SysTimer_SetCompareValue
+ * - \ref SysTimer_SetLoadValue
+ */
+__STATIC_FORCEINLINE uint32_t SysTick_Reload(uint64_t ticks)
+{
+    const uint64_t now = SysTimer_GetLoadValue();
+    const uint64_t next_compare = ticks + now;
+
+    if (__USUALLY(next_compare > now)) {
+        SysTimer_SetCompareValue(next_compare);
+        return (0UL);
+    }
+
+    /* The sum did not end up above the current count, i.e. the 64-bit
+     * addition wrapped: restart the counter from zero and use `ticks`
+     * itself as the compare value */
+    SysTimer_SetLoadValue(0);
+    SysTimer_SetCompareValue(ticks);
+    return (0UL);
+}
+
+#endif /* defined(__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U) */
+/** @} */ /* End of Doxygen Group NMSIS_Core_SysTimer */
+
+#endif /* defined(__SYSTIMER_PRESENT) && (__SYSTIMER_PRESENT == 1) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_TIMER_H__  */
+
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_compiler.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_compiler.h
new file mode 100644
index 00000000..c5278db1
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_compiler.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NMSIS_COMPILER_H
+#define __NMSIS_COMPILER_H
+
+#include <stdint.h>
+
+/*!
+ * @file     nmsis_compiler.h
+ * @brief    NMSIS compiler generic header file
+ */
+#if defined ( __GNUC__ )
+  /** GNU GCC Compiler */
+  #include "nmsis_gcc.h"
+#else
+  #error Unknown compiler.
+#endif
+
+
+#endif /* __NMSIS_COMPILER_H */
+
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_core.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_core.h
new file mode 100644
index 00000000..fa7821da
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_core.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
+ * -- Adaptable modifications made for Nuclei Processors. --
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __NMSIS_CORE_H__
+#define __NMSIS_CORE_H__
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#include "nmsis_version.h"
+
+/**
+ * \ingroup NMSIS_Core_VersionControl
+ * @{
+ */
+/* The following enum __NUCLEI_N_REV/__NUCLEI_NX_REV definition in this file
+ * is only used for doxygen documentation generation,
+ * The <device>.h is the real file to define it by vendor
+ */
+#if defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
+/**
+ * \brief Nuclei N class core revision number
+ * \details
+ * Revision number format: [15:8] revision number, [7:0] patch number
+ * (the documentation-only value 0x0104 below therefore reads as revision 0x01, patch 0x04)
+ * \attention
+ * This define is exclusive with \ref __NUCLEI_NX_REV
+ */
+#define __NUCLEI_N_REV                   (0x0104)
+/**
+ * \brief Nuclei NX class core revision number
+ * \details
+ * Revision number format: [15:8] revision number, [7:0] patch number
+ * (the documentation-only value 0x0100 below therefore reads as revision 0x01, patch 0x00)
+ * \attention
+ * This define is exclusive with \ref __NUCLEI_N_REV
+ */
+#define __NUCLEI_NX_REV                  (0x0100)
+#endif /* __ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__ */
+/** @} */ /* End of Group NMSIS_Core_VersionControl */
+
+#include "nmsis_compiler.h"     /* NMSIS compiler specific defines */
+
+/* === Include Nuclei Core Related Headers === */
+/* Include core base feature header file */
+#include "core_feature_base.h"
+
+#ifndef __NMSIS_GENERIC
+/* Include core eclic feature header file */
+#include "core_feature_eclic.h"
+/* Include core systimer feature header file */
+#include "core_feature_timer.h"
+#endif
+
+/* Include core fpu feature header file */
+#include "core_feature_fpu.h"
+/* Include core dsp feature header file */
+#include "core_feature_dsp.h"
+/* Include core pmp feature header file */
+#include "core_feature_pmp.h"
+/* Include core cache feature header file */
+#include "core_feature_cache.h"
+
+/* Include compatible functions header file */
+#include "core_compatiable.h"
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __NMSIS_CORE_H__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_gcc.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_gcc.h
new file mode 100644
index 00000000..9f7eb9d2
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_gcc.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NMSIS_GCC_H__
+#define __NMSIS_GCC_H__
+/*!
+ * @file     nmsis_gcc.h
+ * @brief    NMSIS compiler GCC header file
+ */
+#include <stdint.h>
+#include "riscv_encoding.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+/**
+ * \defgroup NMSIS_Core_CompilerControl    Compiler Control
+ * \ingroup  NMSIS_Core
+ * \brief    Compiler agnostic \#define symbols for generic c/c++ source code
+ * \details
+ *
+ * The NMSIS-Core provides the header file <b>nmsis_compiler.h</b> with consistent \#define symbols for generating C or C++ source files that should be compiler agnostic.
+ * Each NMSIS compliant compiler should support the functionality described in this section.
+ *
+ * The header file <b>nmsis_compiler.h</b> is also included by each Device Header File <device.h> so that these definitions are available.
+ *   @{
+ */
+/* Ignore some GCC warnings. NOTE(review): no matching `#pragma GCC diagnostic pop` for this push appears in this header, so these ignores stay in effect for code following the include - confirm this is intended. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-conversion"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+
+/* Fallback for __has_builtin */
+#ifndef __has_builtin
+  #define __has_builtin(x) (0)
+#endif
+
+/* NMSIS compiler specific defines */
+/** \brief Pass information from the compiler to the assembler. */
+#ifndef   __ASM
+  #define __ASM                                  __asm
+#endif
+
+/** \brief Recommend that function should be inlined by the compiler. */
+#ifndef   __INLINE
+  #define __INLINE                               inline
+#endif
+
+/** \brief Define a static function that may be inlined by the compiler. */
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE                        static inline
+#endif
+
+/** \brief Define a static function that should be always inlined by the compiler. */
+#ifndef   __STATIC_FORCEINLINE
+  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static inline
+#endif
+
+/** \brief Inform the compiler that a function does not return. */
+#ifndef   __NO_RETURN
+  #define __NO_RETURN                            __attribute__((__noreturn__))
+#endif
+
+/** \brief Inform that a variable shall be retained in executable image. */
+#ifndef   __USED
+  #define __USED                                 __attribute__((used))
+#endif
+
+/** \brief Declare a symbol as weak so that a non-weak definition can override it at link time. */
+#ifndef   __WEAK
+  #define __WEAK                                 __attribute__((weak))
+#endif
+
+/** \brief Specify the vector size of the variable, measured in bytes. */
+#ifndef   __VECTOR_SIZE
+  #define __VECTOR_SIZE(x)                       __attribute__((vector_size(x)))
+#endif
+
+/** \brief Request smallest possible alignment. */
+#ifndef   __PACKED
+  #define __PACKED                               __attribute__((packed, aligned(1)))
+#endif
+
+/** \brief Request smallest possible alignment for a structure. */
+#ifndef   __PACKED_STRUCT
+  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
+#endif
+
+/** \brief Request smallest possible alignment for a union. */
+#ifndef   __PACKED_UNION
+  #define __PACKED_UNION                         union __attribute__((packed, aligned(1)))
+#endif
+
+#ifndef   __UNALIGNED_UINT16_WRITE
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  /** \brief Packed struct for unaligned uint16_t write access */
+  __PACKED_STRUCT T_UINT16_WRITE {
+      uint16_t v;
+  };
+  #pragma GCC diagnostic pop
+  /** \brief Pointer for unaligned write of a uint16_t variable. */
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+
+#ifndef   __UNALIGNED_UINT16_READ
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  /** \brief Packed struct for unaligned uint16_t read access */
+  __PACKED_STRUCT T_UINT16_READ {
+      uint16_t v;
+  };
+  #pragma GCC diagnostic pop
+  /** \brief Pointer for unaligned read of a uint16_t variable. */
+  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+
+#ifndef   __UNALIGNED_UINT32_WRITE
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  /** \brief Packed struct for unaligned uint32_t write access */
+  __PACKED_STRUCT T_UINT32_WRITE {
+      uint32_t v;
+  };
+  #pragma GCC diagnostic pop
+  /** \brief Pointer for unaligned write of a uint32_t variable. */
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+
+#ifndef   __UNALIGNED_UINT32_READ
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  /** \brief Packed struct for unaligned uint32_t read access */
+  __PACKED_STRUCT T_UINT32_READ {
+      uint32_t v;
+  };
+  #pragma GCC diagnostic pop
+  /** \brief Pointer for unaligned read of a uint32_t variable. */
+  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+
+/** \brief Minimum `x` bytes alignment for a variable. */
+#ifndef   __ALIGNED
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+
+/** \brief restrict pointer qualifier to enable additional optimizations. */
+#ifndef   __RESTRICT
+  #define __RESTRICT                             __restrict
+#endif
+
+/** \brief Barrier to prevent compiler from reordering instructions. */
+#ifndef   __COMPILER_BARRIER
+  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
+#endif
+
+/** \brief provide the compiler with branch prediction information, the branch is usually true */
+#ifndef   __USUALLY
+  #define __USUALLY(exp)                         __builtin_expect((exp), 1)
+#endif
+
+/** \brief provide the compiler with branch prediction information, the branch is rarely true */
+#ifndef   __RARELY
+  #define __RARELY(exp)                          __builtin_expect((exp), 0)
+#endif
+
+/** \brief Use this attribute to indicate that the specified function is an interrupt handler. */
+#ifndef   __INTERRUPT
+  #define __INTERRUPT                            __attribute__((interrupt))
+#endif
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_CompilerControl */
+
+/* IO definitions (access restrictions to peripheral registers) */
+/**
+ * \defgroup NMSIS_Core_PeriphAccess     Peripheral Access
+ * \brief  Naming conventions and optional features for accessing peripherals.
+ *
+ * The section below describes the naming conventions, requirements, and optional features
+ * for accessing device specific peripherals.
+ * Most of the rules also apply to the core peripherals.
+ *
+ * The **Device Header File <device.h>** typically contains these definitions
+ * and also includes the core specific header files.
+ *
+ * @{
+ */
+/** \brief Defines 'read only' permissions */
+#ifdef __cplusplus
+  #define   __I     volatile
+#else
+  #define   __I     volatile const
+#endif
+/** \brief Defines 'write only' permissions */
+#define     __O     volatile
+/** \brief Defines 'read / write' permissions */
+#define     __IO    volatile
+
+/* following defines should be used for structure members */
+/** \brief Defines 'read only' structure member permissions */
+#define     __IM     volatile const
+/** \brief Defines 'write only' structure member permissions */
+#define     __OM     volatile
+/** \brief Defines 'read/write' structure member permissions */
+#define     __IOM    volatile
+
+/**
+ * \brief   Mask and shift a bit field value for use in a register bit range.
+ * \details The macro \ref _VAL2FLD uses the #define's _Pos and _Msk of the related bit
+ * field to shift bit-field values for assigning to a register.
+ *
+ * **Example**:
+ * \code
+ * ECLIC->CFG = _VAL2FLD(CLIC_CLICCFG_NLBIT, 3);
+ * \endcode
+ * \param[in] field  Name of the register bit field.
+ * \param[in] value  Value of the bit field. This parameter is interpreted as an uint32_t type.
+ * \return           Masked and shifted value.
+ */
+#define _VAL2FLD(field, value)    (((uint32_t)(value) << field ## _Pos) & field ## _Msk)
+
+/**
+ * \brief   Mask and shift a register value to extract a bit field value.
+ * \details The macro \ref _FLD2VAL uses the #define's _Pos and _Msk of the related bit
+ * field to extract the value of a bit field from a register.
+ *
+ * **Example**:
+ * \code
+ * nlbits = _FLD2VAL(CLIC_CLICCFG_NLBIT, ECLIC->CFG);
+ * \endcode
+ * \param[in] field  Name of the register bit field.
+ * \param[in] value  Value of register. This parameter is interpreted as an uint32_t type.
+ * \return           Masked and shifted bit field value.
+ */
+#define _FLD2VAL(field, value)    (((uint32_t)(value) & field ## _Msk) >> field ## _Pos)
+
+/** @} */ /* end of group NMSIS_Core_PeriphAccess */
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __NMSIS_GCC_H__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_version.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_version.h
new file mode 100644
index 00000000..16507998
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/nmsis_version.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __NMSIS_VERSION_H
+#define __NMSIS_VERSION_H
+
+/**
+ * \defgroup NMSIS_Core_VersionControl    Version Control
+ * \ingroup  NMSIS_Core
+ * \brief    Version \#define symbols for NMSIS release specific C/C++ source code
+ * \details
+ *
+ * NMSIS follows [semantic versioning 2.0.0](https://semver.org/) to control its version numbers.
+ * The version format is **MAJOR.MINOR.PATCH**, increment the:
+ * 1. MAJOR version when you make incompatible API changes,
+ * 2. MINOR version when you add functionality in a backwards compatible manner, and
+ * 3. PATCH version when you make backwards compatible bug fixes.
+ *
+ * The header file `nmsis_version.h` is included by each core header so that these definitions are available.
+ *
+ * **Example Usage for NMSIS Version Check**:
+ * \code
+ *   #if defined(__NMSIS_VERSION) && (__NMSIS_VERSION >= 0x00010105)
+ *      #warning "Yes, we have NMSIS 1.1.5 or later"
+ *   #else
+ *      #error "We need NMSIS 1.1.5 or later!"
+ *   #endif
+ * \endcode
+ *
+ * @{
+ */
+
+/*!
+ * \file     nmsis_version.h
+ * \brief    NMSIS Version definitions
+ **/
+
+/**
+ * \brief   Represent the NMSIS major version
+ * \details
+ * The NMSIS major version can be used to
+ * differentiate between NMSIS major releases.
+ * */
+#define __NMSIS_VERSION_MAJOR            (1U)
+
+/**
+ * \brief   Represent the NMSIS minor version
+ * \details
+ * The NMSIS minor version can be used to
+ * query a NMSIS release update including new features.
+ *
+ **/
+#define __NMSIS_VERSION_MINOR            (0U)
+
+/**
+ * \brief   Represent the NMSIS patch version
+ * \details
+ * The NMSIS patch version can be used to
+ * show bug fixes in this package.
+ **/
+#define __NMSIS_VERSION_PATCH            (1U)
+/**
+ * \brief   Represent the NMSIS Version
+ * \details
+ * NMSIS Version format: **MAJOR.MINOR.PATCH**
+ * * MAJOR: \ref __NMSIS_VERSION_MAJOR, stored in `bits [31:16]` of \ref __NMSIS_VERSION
+ * * MINOR: \ref __NMSIS_VERSION_MINOR, stored in `bits [15:8]` of \ref __NMSIS_VERSION
+ * * PATCH: \ref __NMSIS_VERSION_PATCH, stored in `bits [7:0]` of \ref __NMSIS_VERSION
+ **/
+#define __NMSIS_VERSION                  ((__NMSIS_VERSION_MAJOR << 16U) | (__NMSIS_VERSION_MINOR << 8) | __NMSIS_VERSION_PATCH)
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_VersionControl */
+#endif
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_bits.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_bits.h
new file mode 100644
index 00000000..a18c1686
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_bits.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __RISCV_BITS_H__
+#define __RISCV_BITS_H__
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if __riscv_xlen == 64
+# define SLL32                  sllw
+# define STORE                  sd
+# define LOAD                   ld
+# define LWU                    lwu
+# define LOG_REGBYTES           3
+#else
+# define SLL32                  sll
+# define STORE                  sw
+# define LOAD                   lw
+# define LWU                    lw
+# define LOG_REGBYTES           2
+#endif /* __riscv_xlen */
+
+#define REGBYTES (1 << LOG_REGBYTES)
+
+#if __riscv_flen == 64
+# define FPSTORE                fsd
+# define FPLOAD                 fld
+# define LOG_FPREGBYTES         3
+#else
+# define FPSTORE                fsw
+# define FPLOAD                 flw
+# define LOG_FPREGBYTES         2
+#endif /* __riscv_flen */
+#define FPREGBYTES              (1 << LOG_FPREGBYTES)
+
+#define __rv_likely(x)          __builtin_expect((x), 1)
+#define __rv_unlikely(x)        __builtin_expect((x), 0)
+
+#define __RV_ROUNDUP(a, b)      ((((a)-1)/(b)+1)*(b))
+#define __RV_ROUNDDOWN(a, b)    ((a)/(b)*(b))
+
+#define __RV_MAX(a, b)          ((a) > (b) ? (a) : (b))
+#define __RV_MIN(a, b)          ((a) < (b) ? (a) : (b))
+#define __RV_CLAMP(a, lo, hi)   __RV_MIN(__RV_MAX(a, lo), hi) /* clamp a into [lo, hi]; arguments may be evaluated more than once */
+
+#define __RV_EXTRACT_FIELD(val, which)                  (((val) & (which)) / ((which) & ~((which)-1)))
+#define __RV_INSERT_FIELD(val, which, fieldval)         (((val) & ~(which)) | ((fieldval) * ((which) & ~((which)-1))))
+
+#ifdef __ASSEMBLY__
+#define _AC(X,Y)                X
+#define _AT(T,X)                X
+#else
+#define __AC(X,Y)               (X##Y)
+#define _AC(X,Y)                __AC(X,Y)
+#define _AT(T,X)                ((T)(X))
+#endif /* __ASSEMBLY__ */
+
+#define _UL(x)                  (_AC(x, UL))
+#define _ULL(x)                 (_AC(x, ULL))
+
+#define _BITUL(x)               (_UL(1) << (x))
+#define _BITULL(x)              (_ULL(1) << (x))
+
+#define UL(x)                   (_UL(x))
+#define ULL(x)                  (_ULL(x))
+
+#define STR(x)                  XSTR(x)
+#define XSTR(x)                 #x
+#define __STR(s)                #s
+#define STRINGIFY(s)            __STR(s)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /** __RISCV_BITS_H__  */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_encoding.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_encoding.h
new file mode 100644
index 00000000..8b218b0b
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/Core/Include/riscv_encoding.h
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __RISCV_ENCODING_H__
+#define __RISCV_ENCODING_H__
+
+#include "riscv_bits.h"
+#ifdef __cplusplus
+ extern "C" {
+#endif
+/**
+ * \defgroup NMSIS_Core_CSR_Encoding    Core CSR Encodings
+ * \ingroup  NMSIS_Core
+ * \brief    NMSIS Core CSR Encodings
+ * \details
+ *
+ * The following macros are used for CSR encodings
+ *   @{
+ */
+#define MSTATUS_UIE         0x00000001
+#define MSTATUS_SIE         0x00000002
+#define MSTATUS_HIE         0x00000004
+#define MSTATUS_MIE         0x00000008
+#define MSTATUS_UPIE        0x00000010
+#define MSTATUS_SPIE        0x00000020
+#define MSTATUS_HPIE        0x00000040
+#define MSTATUS_MPIE        0x00000080
+#define MSTATUS_SPP         0x00000100
+#define MSTATUS_MPP         0x00001800
+#define MSTATUS_FS          0x00006000
+#define MSTATUS_XS          0x00018000
+#define MSTATUS_MPRV        0x00020000
+#define MSTATUS_PUM         0x00040000
+#define MSTATUS_MXR         0x00080000
+#define MSTATUS_VM          0x1F000000
+#define MSTATUS32_SD        0x80000000
+#define MSTATUS64_SD        0x8000000000000000
+
+#define MSTATUS_FS_INITIAL  0x00002000
+#define MSTATUS_FS_CLEAN    0x00004000
+#define MSTATUS_FS_DIRTY    0x00006000
+
+#define SSTATUS_UIE         0x00000001
+#define SSTATUS_SIE         0x00000002
+#define SSTATUS_UPIE        0x00000010
+#define SSTATUS_SPIE        0x00000020
+#define SSTATUS_SPP         0x00000100
+#define SSTATUS_FS          0x00006000
+#define SSTATUS_XS          0x00018000
+#define SSTATUS_PUM         0x00040000
+#define SSTATUS32_SD        0x80000000
+#define SSTATUS64_SD        0x8000000000000000
+
+#define CSR_MCACHE_CTL_IE   0x00000001
+#define CSR_MCACHE_CTL_DE   0x00010000
+
+#define DCSR_XDEBUGVER      (3U<<30)
+#define DCSR_NDRESET        (1<<29)
+#define DCSR_FULLRESET      (1<<28)
+#define DCSR_EBREAKM        (1<<15)
+#define DCSR_EBREAKH        (1<<14)
+#define DCSR_EBREAKS        (1<<13)
+#define DCSR_EBREAKU        (1<<12)
+#define DCSR_STOPCYCLE      (1<<10)
+#define DCSR_STOPTIME       (1<<9)
+#define DCSR_CAUSE          (7<<6)
+#define DCSR_DEBUGINT       (1<<5)
+#define DCSR_HALT           (1<<3)
+#define DCSR_STEP           (1<<2)
+#define DCSR_PRV            (3<<0)
+
+#define DCSR_CAUSE_NONE     0
+#define DCSR_CAUSE_SWBP     1
+#define DCSR_CAUSE_HWBP     2
+#define DCSR_CAUSE_DEBUGINT 3
+#define DCSR_CAUSE_STEP     4
+#define DCSR_CAUSE_HALT     5
+
+#define MCONTROL_TYPE(xlen)    (0xfULL<<((xlen)-4))
+#define MCONTROL_DMODE(xlen)   (1ULL<<((xlen)-5))
+#define MCONTROL_MASKMAX(xlen) (0x3fULL<<((xlen)-11))
+
+#define MCONTROL_SELECT     (1<<19)
+#define MCONTROL_TIMING     (1<<18)
+#define MCONTROL_ACTION     (0x3f<<12)
+#define MCONTROL_CHAIN      (1<<11)
+#define MCONTROL_MATCH      (0xf<<7)
+#define MCONTROL_M          (1<<6)
+#define MCONTROL_H          (1<<5)
+#define MCONTROL_S          (1<<4)
+#define MCONTROL_U          (1<<3)
+#define MCONTROL_EXECUTE    (1<<2)
+#define MCONTROL_STORE      (1<<1)
+#define MCONTROL_LOAD       (1<<0)
+
+#define MCONTROL_TYPE_NONE      0
+#define MCONTROL_TYPE_MATCH     2
+
+#define MCONTROL_ACTION_DEBUG_EXCEPTION   0
+#define MCONTROL_ACTION_DEBUG_MODE        1
+#define MCONTROL_ACTION_TRACE_START       2
+#define MCONTROL_ACTION_TRACE_STOP        3
+#define MCONTROL_ACTION_TRACE_EMIT        4
+
+#define MCONTROL_MATCH_EQUAL     0
+#define MCONTROL_MATCH_NAPOT     1
+#define MCONTROL_MATCH_GE        2
+#define MCONTROL_MATCH_LT        3
+#define MCONTROL_MATCH_MASK_LOW  4
+#define MCONTROL_MATCH_MASK_HIGH 5
+
+#define MIP_SSIP            (1 << IRQ_S_SOFT)
+#define MIP_HSIP            (1 << IRQ_H_SOFT)
+#define MIP_MSIP            (1 << IRQ_M_SOFT)
+#define MIP_STIP            (1 << IRQ_S_TIMER)
+#define MIP_HTIP            (1 << IRQ_H_TIMER)
+#define MIP_MTIP            (1 << IRQ_M_TIMER)
+#define MIP_SEIP            (1 << IRQ_S_EXT)
+#define MIP_HEIP            (1 << IRQ_H_EXT)
+#define MIP_MEIP            (1 << IRQ_M_EXT)
+
+#define MIE_SSIE            MIP_SSIP
+#define MIE_HSIE            MIP_HSIP
+#define MIE_MSIE            MIP_MSIP
+#define MIE_STIE            MIP_STIP
+#define MIE_HTIE            MIP_HTIP
+#define MIE_MTIE            MIP_MTIP
+#define MIE_SEIE            MIP_SEIP
+#define MIE_HEIE            MIP_HEIP
+#define MIE_MEIE            MIP_MEIP
+
+/* === Nuclei custom CSR bit mask === */
+
+#define WFE_WFE                     (0x1)
+#define TXEVT_TXEVT                 (0x1)
+#define SLEEPVALUE_SLEEPVALUE       (0x1)
+
+#define MCOUNTINHIBIT_IR            (1<<2)
+#define MCOUNTINHIBIT_CY            (1<<0)
+
+#define MILM_CTL_ILM_BPA            (((1ULL<<((__riscv_xlen)-10))-1)<<10)
+#define MILM_CTL_ILM_EN             (1<<0)
+
+#define MDLM_CTL_DLM_BPA            (((1ULL<<((__riscv_xlen)-10))-1)<<10)
+#define MDLM_CTL_DLM_EN             (1<<0)
+
+#define MSUBM_PTYP                  (0x3<<8)
+#define MSUBM_TYP                   (0x3<<6)
+
+#define MDCAUSE_MDCAUSE             (0x3)
+
+#define MMISC_CTL_NMI_CAUSE_FFF     (1<<9)
+#define MMISC_CTL_MISALIGN          (1<<6)
+#define MMISC_CTL_BPU               (1<<3)
+
+#define MCACHE_CTL_IC_EN            (1<<0)
+#define MCACHE_CTL_IC_SCPD_MOD      (1<<1)
+#define MCACHE_CTL_DC_EN            (1<<16)
+
+#define MTVT2_MTVT2EN               (1<<0)
+#define MTVT2_COMMON_CODE_ENTRY     (((1ULL<<((__riscv_xlen)-2))-1)<<2)
+
+#define MCFG_INFO_TEE               (1<<0)
+#define MCFG_INFO_ECC               (1<<1)
+#define MCFG_INFO_CLIC              (1<<2)
+#define MCFG_INFO_PLIC              (1<<3)
+#define MCFG_INFO_FIO               (1<<4)
+#define MCFG_INFO_PPI               (1<<5)
+#define MCFG_INFO_NICE              (1<<6)
+#define MCFG_INFO_ILM               (1<<7)
+#define MCFG_INFO_DLM               (1<<8)
+#define MCFG_INFO_ICACHE            (1<<9)
+#define MCFG_INFO_DCACHE            (1<<10)
+
+#define MICFG_IC_SET                (0xF<<0)
+#define MICFG_IC_WAY                (0x7<<4)
+#define MICFG_IC_LSIZE              (0x7<<7)
+#define MICFG_ILM_SIZE              (0x1F<<16)
+#define MICFG_ILM_XONLY             (1<<21)
+
+#define MDCFG_DC_SET                (0xF<<0)
+#define MDCFG_DC_WAY                (0x7<<4)
+#define MDCFG_DC_LSIZE              (0x7<<7)
+#define MDCFG_DLM_SIZE              (0x1F<<16)
+
+#define MPPICFG_INFO_PPI_SIZE       (0x1F<<1)
+#define MPPICFG_INFO_PPI_BPA        (((1ULL<<((__riscv_xlen)-10))-1)<<10)
+
+#define MFIOCFG_INFO_FIO_SIZE       (0x1F<<1)
+#define MFIOCFG_INFO_FIO_BPA        (((1ULL<<((__riscv_xlen)-10))-1)<<10)
+
+#define SIP_SSIP MIP_SSIP
+#define SIP_STIP MIP_STIP
+
+#define PRV_U 0
+#define PRV_S 1
+#define PRV_H 2
+#define PRV_M 3
+
+#define VM_MBARE 0
+#define VM_MBB   1
+#define VM_MBBID 2
+#define VM_SV32  8
+#define VM_SV39  9
+#define VM_SV48  10
+
+#define IRQ_S_SOFT   1
+#define IRQ_H_SOFT   2
+#define IRQ_M_SOFT   3
+#define IRQ_S_TIMER  5
+#define IRQ_H_TIMER  6
+#define IRQ_M_TIMER  7
+#define IRQ_S_EXT    9
+#define IRQ_H_EXT    10
+#define IRQ_M_EXT    11
+#define IRQ_COP      12
+#define IRQ_HOST     13
+
+#define DEFAULT_RSTVEC     0x00001000
+#define DEFAULT_NMIVEC     0x00001004
+#define DEFAULT_MTVEC      0x00001010
+#define CONFIG_STRING_ADDR 0x0000100C
+#define EXT_IO_BASE        0x40000000
+#define DRAM_BASE          0x80000000
+
+/* === FPU FRM Rounding Mode === */
+/** FPU Round to Nearest, ties to Even*/
+#define FRM_RNDMODE_RNE     0x0
+/** FPU Round Towards Zero */
+#define FRM_RNDMODE_RTZ     0x1
+/** FPU Round Down (towards -inf) */
+#define FRM_RNDMODE_RDN     0x2
+/** FPU Round Up (towards +inf) */
+#define FRM_RNDMODE_RUP     0x3
+/** FPU Round to nearest, ties to Max Magnitude */
+#define FRM_RNDMODE_RMM     0x4
+/**
+ * In instruction's rm, selects dynamic rounding mode.
+ * In Rounding Mode register, Invalid */
+#define FRM_RNDMODE_DYN     0x7
+
+/* === FPU FFLAGS Accrued Exceptions === */
+/** FPU Inexact */
+#define FFLAGS_AE_NX        (1<<0)
+/** FPU Underflow */
+#define FFLAGS_AE_UF        (1<<1)
+/** FPU Overflow */
+#define FFLAGS_AE_OF        (1<<2)
+/** FPU Divide by Zero */
+#define FFLAGS_AE_DZ        (1<<3)
+/** FPU Invalid Operation */
+#define FFLAGS_AE_NV        (1<<4)
+
+/** Floating Point Register f0-f31, eg. f0 -> FREG(0) */
+#define FREG(idx)           f##idx
+
+
+/* === PMP CFG Bits === */
+#define PMP_R                0x01
+#define PMP_W                0x02
+#define PMP_X                0x04
+#define PMP_A                0x18
+#define PMP_A_TOR            0x08
+#define PMP_A_NA4            0x10
+#define PMP_A_NAPOT          0x18
+#define PMP_L                0x80
+
+#define PMP_SHIFT            2
+#define PMP_COUNT            16
+
+// page table entry (PTE) fields
+#define PTE_V     0x001 // Valid
+#define PTE_R     0x002 // Read
+#define PTE_W     0x004 // Write
+#define PTE_X     0x008 // Execute
+#define PTE_U     0x010 // User
+#define PTE_G     0x020 // Global
+#define PTE_A     0x040 // Accessed
+#define PTE_D     0x080 // Dirty
+#define PTE_SOFT  0x300 // Reserved for Software
+
+#define PTE_PPN_SHIFT 10
+
+#define PTE_TABLE(PTE) (((PTE) & (PTE_V | PTE_R | PTE_W | PTE_X)) == PTE_V)
+
+#ifdef __riscv
+/* Select the XLEN-specific SD masks and page-level width; test __riscv_xlen (as riscv_bits.h does) because the legacy __riscv64 macro is deprecated. */
+#if __riscv_xlen == 64
+# define MSTATUS_SD MSTATUS64_SD
+# define SSTATUS_SD SSTATUS64_SD
+# define RISCV_PGLEVEL_BITS 9
+#else
+# define MSTATUS_SD MSTATUS32_SD
+# define SSTATUS_SD SSTATUS32_SD
+# define RISCV_PGLEVEL_BITS 10
+#endif /* __riscv_xlen */
+
+#define RISCV_PGSHIFT 12
+#define RISCV_PGSIZE (1 << RISCV_PGSHIFT)
+
+#endif /* __riscv */
+
+#define DOWNLOAD_MODE_FLASHXIP  0
+#define DOWNLOAD_MODE_FLASH     1
+#define DOWNLOAD_MODE_ILM       2
+#define DOWNLOAD_MODE_DDR       3
+
+/**
+ * \defgroup NMSIS_Core_CSR_Registers    Core CSR Registers
+ * \ingroup  NMSIS_Core
+ * \brief    NMSIS Core CSR Register Definitions
+ * \details
+ *
+ * The following macros are used for CSR Register Definitions.
+ *   @{
+ */
+/* === Standard RISC-V CSR Registers === */
+#define CSR_USTATUS 0x0
+#define CSR_FFLAGS 0x1
+#define CSR_FRM 0x2
+#define CSR_FCSR 0x3
+#define CSR_CYCLE 0xc00
+#define CSR_TIME 0xc01
+#define CSR_INSTRET 0xc02
+#define CSR_HPMCOUNTER3 0xc03
+#define CSR_HPMCOUNTER4 0xc04
+#define CSR_HPMCOUNTER5 0xc05
+#define CSR_HPMCOUNTER6 0xc06
+#define CSR_HPMCOUNTER7 0xc07
+#define CSR_HPMCOUNTER8 0xc08
+#define CSR_HPMCOUNTER9 0xc09
+#define CSR_HPMCOUNTER10 0xc0a
+#define CSR_HPMCOUNTER11 0xc0b
+#define CSR_HPMCOUNTER12 0xc0c
+#define CSR_HPMCOUNTER13 0xc0d
+#define CSR_HPMCOUNTER14 0xc0e
+#define CSR_HPMCOUNTER15 0xc0f
+#define CSR_HPMCOUNTER16 0xc10
+#define CSR_HPMCOUNTER17 0xc11
+#define CSR_HPMCOUNTER18 0xc12
+#define CSR_HPMCOUNTER19 0xc13
+#define CSR_HPMCOUNTER20 0xc14
+#define CSR_HPMCOUNTER21 0xc15
+#define CSR_HPMCOUNTER22 0xc16
+#define CSR_HPMCOUNTER23 0xc17
+#define CSR_HPMCOUNTER24 0xc18
+#define CSR_HPMCOUNTER25 0xc19
+#define CSR_HPMCOUNTER26 0xc1a
+#define CSR_HPMCOUNTER27 0xc1b
+#define CSR_HPMCOUNTER28 0xc1c
+#define CSR_HPMCOUNTER29 0xc1d
+#define CSR_HPMCOUNTER30 0xc1e
+#define CSR_HPMCOUNTER31 0xc1f
+#define CSR_SSTATUS 0x100
+#define CSR_SIE 0x104
+#define CSR_STVEC 0x105
+#define CSR_SSCRATCH 0x140
+#define CSR_SEPC 0x141
+#define CSR_SCAUSE 0x142
+#define CSR_SBADADDR 0x143
+#define CSR_SIP 0x144
+#define CSR_SPTBR 0x180
+#define CSR_MSTATUS 0x300
+#define CSR_MISA 0x301
+#define CSR_MEDELEG 0x302
+#define CSR_MIDELEG 0x303
+#define CSR_MIE 0x304
+#define CSR_MTVEC 0x305
+#define CSR_MCOUNTEREN 0x306
+#define CSR_MSCRATCH 0x340
+#define CSR_MEPC 0x341
+#define CSR_MCAUSE 0x342
+#define CSR_MBADADDR 0x343
+#define CSR_MTVAL 0x343
+#define CSR_MIP 0x344
+#define CSR_PMPCFG0 0x3a0
+#define CSR_PMPCFG1 0x3a1
+#define CSR_PMPCFG2 0x3a2
+#define CSR_PMPCFG3 0x3a3
+#define CSR_PMPADDR0 0x3b0
+#define CSR_PMPADDR1 0x3b1
+#define CSR_PMPADDR2 0x3b2
+#define CSR_PMPADDR3 0x3b3
+#define CSR_PMPADDR4 0x3b4
+#define CSR_PMPADDR5 0x3b5
+#define CSR_PMPADDR6 0x3b6
+#define CSR_PMPADDR7 0x3b7
+#define CSR_PMPADDR8 0x3b8
+#define CSR_PMPADDR9 0x3b9
+#define CSR_PMPADDR10 0x3ba
+#define CSR_PMPADDR11 0x3bb
+#define CSR_PMPADDR12 0x3bc
+#define CSR_PMPADDR13 0x3bd
+#define CSR_PMPADDR14 0x3be
+#define CSR_PMPADDR15 0x3bf
+#define CSR_TSELECT 0x7a0
+#define CSR_TDATA1 0x7a1
+#define CSR_TDATA2 0x7a2
+#define CSR_TDATA3 0x7a3
+#define CSR_DCSR 0x7b0
+#define CSR_DPC 0x7b1
+#define CSR_DSCRATCH 0x7b2
+#define CSR_MCYCLE 0xb00
+#define CSR_MINSTRET 0xb02
+#define CSR_MHPMCOUNTER3 0xb03
+#define CSR_MHPMCOUNTER4 0xb04
+#define CSR_MHPMCOUNTER5 0xb05
+#define CSR_MHPMCOUNTER6 0xb06
+#define CSR_MHPMCOUNTER7 0xb07
+#define CSR_MHPMCOUNTER8 0xb08
+#define CSR_MHPMCOUNTER9 0xb09
+#define CSR_MHPMCOUNTER10 0xb0a
+#define CSR_MHPMCOUNTER11 0xb0b
+#define CSR_MHPMCOUNTER12 0xb0c
+#define CSR_MHPMCOUNTER13 0xb0d
+#define CSR_MHPMCOUNTER14 0xb0e
+#define CSR_MHPMCOUNTER15 0xb0f
+#define CSR_MHPMCOUNTER16 0xb10
+#define CSR_MHPMCOUNTER17 0xb11
+#define CSR_MHPMCOUNTER18 0xb12
+#define CSR_MHPMCOUNTER19 0xb13
+#define CSR_MHPMCOUNTER20 0xb14
+#define CSR_MHPMCOUNTER21 0xb15
+#define CSR_MHPMCOUNTER22 0xb16
+#define CSR_MHPMCOUNTER23 0xb17
+#define CSR_MHPMCOUNTER24 0xb18
+#define CSR_MHPMCOUNTER25 0xb19
+#define CSR_MHPMCOUNTER26 0xb1a
+#define CSR_MHPMCOUNTER27 0xb1b
+#define CSR_MHPMCOUNTER28 0xb1c
+#define CSR_MHPMCOUNTER29 0xb1d
+#define CSR_MHPMCOUNTER30 0xb1e
+#define CSR_MHPMCOUNTER31 0xb1f
+#define CSR_MUCOUNTEREN 0x320
+#define CSR_MSCOUNTEREN 0x321
+#define CSR_MHPMEVENT3 0x323
+#define CSR_MHPMEVENT4 0x324
+#define CSR_MHPMEVENT5 0x325
+#define CSR_MHPMEVENT6 0x326
+#define CSR_MHPMEVENT7 0x327
+#define CSR_MHPMEVENT8 0x328
+#define CSR_MHPMEVENT9 0x329
+#define CSR_MHPMEVENT10 0x32a
+#define CSR_MHPMEVENT11 0x32b
+#define CSR_MHPMEVENT12 0x32c
+#define CSR_MHPMEVENT13 0x32d
+#define CSR_MHPMEVENT14 0x32e
+#define CSR_MHPMEVENT15 0x32f
+#define CSR_MHPMEVENT16 0x330
+#define CSR_MHPMEVENT17 0x331
+#define CSR_MHPMEVENT18 0x332
+#define CSR_MHPMEVENT19 0x333
+#define CSR_MHPMEVENT20 0x334
+#define CSR_MHPMEVENT21 0x335
+#define CSR_MHPMEVENT22 0x336
+#define CSR_MHPMEVENT23 0x337
+#define CSR_MHPMEVENT24 0x338
+#define CSR_MHPMEVENT25 0x339
+#define CSR_MHPMEVENT26 0x33a
+#define CSR_MHPMEVENT27 0x33b
+#define CSR_MHPMEVENT28 0x33c
+#define CSR_MHPMEVENT29 0x33d
+#define CSR_MHPMEVENT30 0x33e
+#define CSR_MHPMEVENT31 0x33f
+#define CSR_MVENDORID 0xf11
+#define CSR_MARCHID 0xf12
+#define CSR_MIMPID 0xf13
+#define CSR_MHARTID 0xf14
+#define CSR_CYCLEH 0xc80
+#define CSR_TIMEH 0xc81
+#define CSR_INSTRETH 0xc82
+#define CSR_HPMCOUNTER3H 0xc83
+#define CSR_HPMCOUNTER4H 0xc84
+#define CSR_HPMCOUNTER5H 0xc85
+#define CSR_HPMCOUNTER6H 0xc86
+#define CSR_HPMCOUNTER7H 0xc87
+#define CSR_HPMCOUNTER8H 0xc88
+#define CSR_HPMCOUNTER9H 0xc89
+#define CSR_HPMCOUNTER10H 0xc8a
+#define CSR_HPMCOUNTER11H 0xc8b
+#define CSR_HPMCOUNTER12H 0xc8c
+#define CSR_HPMCOUNTER13H 0xc8d
+#define CSR_HPMCOUNTER14H 0xc8e
+#define CSR_HPMCOUNTER15H 0xc8f
+#define CSR_HPMCOUNTER16H 0xc90
+#define CSR_HPMCOUNTER17H 0xc91
+#define CSR_HPMCOUNTER18H 0xc92
+#define CSR_HPMCOUNTER19H 0xc93
+#define CSR_HPMCOUNTER20H 0xc94
+#define CSR_HPMCOUNTER21H 0xc95
+#define CSR_HPMCOUNTER22H 0xc96
+#define CSR_HPMCOUNTER23H 0xc97
+#define CSR_HPMCOUNTER24H 0xc98
+#define CSR_HPMCOUNTER25H 0xc99
+#define CSR_HPMCOUNTER26H 0xc9a
+#define CSR_HPMCOUNTER27H 0xc9b
+#define CSR_HPMCOUNTER28H 0xc9c
+#define CSR_HPMCOUNTER29H 0xc9d
+#define CSR_HPMCOUNTER30H 0xc9e
+#define CSR_HPMCOUNTER31H 0xc9f
+#define CSR_MCYCLEH 0xb80
+#define CSR_MINSTRETH 0xb82
+#define CSR_MHPMCOUNTER3H 0xb83
+#define CSR_MHPMCOUNTER4H 0xb84
+#define CSR_MHPMCOUNTER5H 0xb85
+#define CSR_MHPMCOUNTER6H 0xb86
+#define CSR_MHPMCOUNTER7H 0xb87
+#define CSR_MHPMCOUNTER8H 0xb88
+#define CSR_MHPMCOUNTER9H 0xb89
+#define CSR_MHPMCOUNTER10H 0xb8a
+#define CSR_MHPMCOUNTER11H 0xb8b
+#define CSR_MHPMCOUNTER12H 0xb8c
+#define CSR_MHPMCOUNTER13H 0xb8d
+#define CSR_MHPMCOUNTER14H 0xb8e
+#define CSR_MHPMCOUNTER15H 0xb8f
+#define CSR_MHPMCOUNTER16H 0xb90
+#define CSR_MHPMCOUNTER17H 0xb91
+#define CSR_MHPMCOUNTER18H 0xb92
+#define CSR_MHPMCOUNTER19H 0xb93
+#define CSR_MHPMCOUNTER20H 0xb94
+#define CSR_MHPMCOUNTER21H 0xb95
+#define CSR_MHPMCOUNTER22H 0xb96
+#define CSR_MHPMCOUNTER23H 0xb97
+#define CSR_MHPMCOUNTER24H 0xb98
+#define CSR_MHPMCOUNTER25H 0xb99
+#define CSR_MHPMCOUNTER26H 0xb9a
+#define CSR_MHPMCOUNTER27H 0xb9b
+#define CSR_MHPMCOUNTER28H 0xb9c
+#define CSR_MHPMCOUNTER29H 0xb9d
+#define CSR_MHPMCOUNTER30H 0xb9e
+#define CSR_MHPMCOUNTER31H 0xb9f
+
+/* === CLIC CSR Registers === */
+#define CSR_MTVT                0x307
+#define CSR_MNXTI               0x345
+#define CSR_MINTSTATUS          0x346
+#define CSR_MSCRATCHCSW         0x348
+#define CSR_MSCRATCHCSWL        0x349
+#define CSR_MCLICBASE           0x350
+
+/* === Nuclei custom CSR Registers === */
+#define CSR_MCOUNTINHIBIT       0x320 /* NOTE(review): same address as CSR_MUCOUNTEREN (0x320) defined earlier in this header - confirm which name applies to the target core */
+#define CSR_MILM_CTL            0x7C0
+#define CSR_MDLM_CTL            0x7C1
+#define CSR_MNVEC               0x7C3
+#define CSR_MSUBM               0x7C4
+#define CSR_MDCAUSE             0x7C9
+#define CSR_MCACHE_CTL          0x7CA
+#define CSR_MMISC_CTL           0x7D0
+#define CSR_MSAVESTATUS         0x7D6
+#define CSR_MSAVEEPC1           0x7D7
+#define CSR_MSAVECAUSE1         0x7D8
+#define CSR_MSAVEEPC2           0x7D9
+#define CSR_MSAVECAUSE2         0x7DA
+#define CSR_MSAVEDCAUSE1        0x7DB
+#define CSR_MSAVEDCAUSE2        0x7DC
+#define CSR_PUSHMSUBM           0x7EB
+#define CSR_MTVT2               0x7EC
+#define CSR_JALMNXTI            0x7ED
+#define CSR_PUSHMCAUSE          0x7EE
+#define CSR_PUSHMEPC            0x7EF
+#define CSR_MPPICFG_INFO        0x7F0
+#define CSR_MFIOCFG_INFO        0x7F1
+#define CSR_SLEEPVALUE          0x811
+#define CSR_TXEVT               0x812
+#define CSR_WFE                 0x810
+#define CSR_MICFG_INFO          0xFC0
+#define CSR_MDCFG_INFO          0xFC1
+#define CSR_MCFG_INFO           0xFC2
+
+/** @} */ /** End of Doxygen Group NMSIS_Core_CSR_Registers **/
+
+/* Exception Code in MCAUSE CSR */
+#define CAUSE_MISALIGNED_FETCH 0x0
+#define CAUSE_FAULT_FETCH 0x1
+#define CAUSE_ILLEGAL_INSTRUCTION 0x2
+#define CAUSE_BREAKPOINT 0x3
+#define CAUSE_MISALIGNED_LOAD 0x4
+#define CAUSE_FAULT_LOAD 0x5
+#define CAUSE_MISALIGNED_STORE 0x6
+#define CAUSE_FAULT_STORE 0x7
+#define CAUSE_USER_ECALL 0x8
+#define CAUSE_SUPERVISOR_ECALL 0x9
+#define CAUSE_HYPERVISOR_ECALL 0xa
+#define CAUSE_MACHINE_ECALL 0xb
+
+/* Exception Subcode in MDCAUSE CSR. The FETCH/LOAD/STORE groups reuse the same values, so presumably a subcode is read together with the matching CAUSE_FAULT_* code (TODO confirm). */
+#define DCAUSE_FAULT_FETCH_PMP      0x1
+#define DCAUSE_FAULT_FETCH_INST     0x2
+
+#define DCAUSE_FAULT_LOAD_PMP       0x1
+#define DCAUSE_FAULT_LOAD_INST      0x2
+#define DCAUSE_FAULT_LOAD_NICE      0x3
+
+#define DCAUSE_FAULT_STORE_PMP      0x1
+#define DCAUSE_FAULT_STORE_INST     0x2
+
+/** @} */ /** End of Doxygen Group NMSIS_Core_CSR_Encoding **/
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __RISCV_ENCODING_H__ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_common_tables.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_common_tables.h
new file mode 100644
index 00000000..c7bfb466
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_common_tables.h
@@ -0,0 +1,379 @@
+/* ----------------------------------------------------------------------
+ * Project:      NMSIS DSP Library
+ * Title:        riscv_common_tables.h
+ * Description:  Extern declaration for common tables
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: RISC-V Cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _RISCV_COMMON_TABLES_H
+#define _RISCV_COMMON_TABLES_H
+
+#include "riscv_math.h"
+
+#if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_FFT_ALLOW_TABLES) 
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREV_1024)
+    extern const uint16_t riscvBitRevTable[1024];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_16)
+    extern const float32_t twiddleCoef_16[32];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_32)
+    extern const float32_t twiddleCoef_32[64];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_64)
+    extern const float32_t twiddleCoef_64[128];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_128)
+    extern const float32_t twiddleCoef_128[256];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_256)
+    extern const float32_t twiddleCoef_256[512];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_512)
+    extern const float32_t twiddleCoef_512[1024];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_1024)
+    extern const float32_t twiddleCoef_1024[2048];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_2048)
+    extern const float32_t twiddleCoef_2048[4096];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_F32_4096)
+    extern const float32_t twiddleCoef_4096[8192];
+    #define twiddleCoef twiddleCoef_4096
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_16)
+    extern const q31_t twiddleCoef_16_q31[24];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_32)
+    extern const q31_t twiddleCoef_32_q31[48];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_64)
+    extern const q31_t twiddleCoef_64_q31[96];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_128)
+    extern const q31_t twiddleCoef_128_q31[192];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_256)
+    extern const q31_t twiddleCoef_256_q31[384];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_512)
+    extern const q31_t twiddleCoef_512_q31[768];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_1024)
+    extern const q31_t twiddleCoef_1024_q31[1536];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_2048)
+    extern const q31_t twiddleCoef_2048_q31[3072];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q31_4096)
+    extern const q31_t twiddleCoef_4096_q31[6144];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_16)
+    extern const q15_t twiddleCoef_16_q15[24];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_32)
+    extern const q15_t twiddleCoef_32_q15[48];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_64)
+    extern const q15_t twiddleCoef_64_q15[96];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_128)
+    extern const q15_t twiddleCoef_128_q15[192];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_256)
+    extern const q15_t twiddleCoef_256_q15[384];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_512)
+    extern const q15_t twiddleCoef_512_q15[768];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_1024)
+    extern const q15_t twiddleCoef_1024_q15[1536];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_2048)
+    extern const q15_t twiddleCoef_2048_q15[3072];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_Q15_4096)
+    extern const q15_t twiddleCoef_4096_q15[6144];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_32)
+    extern const float32_t twiddleCoef_rfft_32[32];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_64)
+    extern const float32_t twiddleCoef_rfft_64[64];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_128)
+    extern const float32_t twiddleCoef_rfft_128[128];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_256)
+    extern const float32_t twiddleCoef_rfft_256[256];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_512)
+    extern const float32_t twiddleCoef_rfft_512[512];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_1024)
+    extern const float32_t twiddleCoef_rfft_1024[1024];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_2048)
+    extern const float32_t twiddleCoef_rfft_2048[2048];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_TWIDDLECOEF_RFFT_F32_4096)
+    extern const float32_t twiddleCoef_rfft_4096[4096];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  /* floating-point bit reversal tables */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_16)
+    #define RISCVBITREVINDEXTABLE_16_TABLE_LENGTH ((uint16_t)20)
+    extern const uint16_t riscvBitRevIndexTable16[RISCVBITREVINDEXTABLE_16_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_32)
+    #define RISCVBITREVINDEXTABLE_32_TABLE_LENGTH ((uint16_t)48)
+    extern const uint16_t riscvBitRevIndexTable32[RISCVBITREVINDEXTABLE_32_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_64)
+    #define RISCVBITREVINDEXTABLE_64_TABLE_LENGTH ((uint16_t)56)
+    extern const uint16_t riscvBitRevIndexTable64[RISCVBITREVINDEXTABLE_64_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_128)
+    #define RISCVBITREVINDEXTABLE_128_TABLE_LENGTH ((uint16_t)208)
+    extern const uint16_t riscvBitRevIndexTable128[RISCVBITREVINDEXTABLE_128_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_256)
+    #define RISCVBITREVINDEXTABLE_256_TABLE_LENGTH ((uint16_t)440)
+    extern const uint16_t riscvBitRevIndexTable256[RISCVBITREVINDEXTABLE_256_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_512)
+    #define RISCVBITREVINDEXTABLE_512_TABLE_LENGTH ((uint16_t)448)
+    extern const uint16_t riscvBitRevIndexTable512[RISCVBITREVINDEXTABLE_512_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_1024)
+    #define RISCVBITREVINDEXTABLE_1024_TABLE_LENGTH ((uint16_t)1800)
+    extern const uint16_t riscvBitRevIndexTable1024[RISCVBITREVINDEXTABLE_1024_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_2048)
+    #define RISCVBITREVINDEXTABLE_2048_TABLE_LENGTH ((uint16_t)3808)
+    extern const uint16_t riscvBitRevIndexTable2048[RISCVBITREVINDEXTABLE_2048_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FLT_4096)
+    #define RISCVBITREVINDEXTABLE_4096_TABLE_LENGTH ((uint16_t)4032)
+    extern const uint16_t riscvBitRevIndexTable4096[RISCVBITREVINDEXTABLE_4096_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  
+  /* fixed-point bit reversal tables */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_16)
+    #define RISCVBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH ((uint16_t)12)
+    extern const uint16_t riscvBitRevIndexTable_fixed_16[RISCVBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_32)
+    #define RISCVBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH ((uint16_t)24)
+    extern const uint16_t riscvBitRevIndexTable_fixed_32[RISCVBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_64)
+    #define RISCVBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH ((uint16_t)56)
+    extern const uint16_t riscvBitRevIndexTable_fixed_64[RISCVBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_128)
+    #define RISCVBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH ((uint16_t)112)
+    extern const uint16_t riscvBitRevIndexTable_fixed_128[RISCVBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_256)
+    #define RISCVBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH ((uint16_t)240)
+    extern const uint16_t riscvBitRevIndexTable_fixed_256[RISCVBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_512)
+    #define RISCVBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH ((uint16_t)480)
+    extern const uint16_t riscvBitRevIndexTable_fixed_512[RISCVBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_1024)
+    #define RISCVBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH ((uint16_t)992)
+    extern const uint16_t riscvBitRevIndexTable_fixed_1024[RISCVBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_2048)
+    #define RISCVBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH ((uint16_t)1984)
+    extern const uint16_t riscvBitRevIndexTable_fixed_2048[RISCVBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_BITREVIDX_FXT_4096)
+    #define RISCVBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH ((uint16_t)4032)
+    extern const uint16_t riscvBitRevIndexTable_fixed_4096[RISCVBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) */
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_REALCOEF_F32)
+    extern const float32_t realCoefA[8192];
+    extern const float32_t realCoefB[8192];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_REALCOEF_Q31)
+    extern const q31_t realCoefAQ31[8192];
+    extern const q31_t realCoefBQ31[8192];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_REALCOEF_Q15)
+    extern const q15_t realCoefAQ15[8192];
+    extern const q15_t realCoefBQ15[8192];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_F32_128)
+    extern const float32_t Weights_128[256];
+    extern const float32_t cos_factors_128[128];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_F32_512)
+    extern const float32_t Weights_512[1024];
+    extern const float32_t cos_factors_512[512];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_F32_2048)
+    extern const float32_t Weights_2048[4096];
+    extern const float32_t cos_factors_2048[2048];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_F32_8192)
+    extern const float32_t Weights_8192[16384];
+    extern const float32_t cos_factors_8192[8192];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q15_128)
+    extern const q15_t WeightsQ15_128[256];
+    extern const q15_t cos_factorsQ15_128[128];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q15_512)
+    extern const q15_t WeightsQ15_512[1024];
+    extern const q15_t cos_factorsQ15_512[512];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q15_2048)
+    extern const q15_t WeightsQ15_2048[4096];
+    extern const q15_t cos_factorsQ15_2048[2048];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q15_8192)
+    extern const q15_t WeightsQ15_8192[16384];
+    extern const q15_t cos_factorsQ15_8192[8192];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q31_128)  
+    extern const q31_t WeightsQ31_128[256];
+    extern const q31_t cos_factorsQ31_128[128];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q31_512) 
+    extern const q31_t WeightsQ31_512[1024];
+    extern const q31_t cos_factorsQ31_512[512];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q31_2048) 
+    extern const q31_t WeightsQ31_2048[4096];
+    extern const q31_t cos_factorsQ31_2048[2048];
+  #endif
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FFT_TABLES) || defined(RISCV_TABLE_DCT4_Q31_8192) 
+    extern const q31_t WeightsQ31_8192[16384];
+    extern const q31_t cos_factorsQ31_8192[8192];
+  #endif
+    
+#endif /* if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_FFT_ALLOW_TABLES) */
+
+#if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_FAST_ALLOW_TABLES)
+
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_RECIP_Q15)
+    extern const q15_t riscvRecipTableQ15[64];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_RECIP_Q15) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_RECIP_Q31)
+    extern const q31_t riscvRecipTableQ31[64];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_RECIP_Q31) */
+  
+  /* Tables for Fast Math Sine and Cosine */
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_F32)
+    extern const float32_t sinTable_f32[FAST_MATH_TABLE_SIZE + 1];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_F32) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_Q31)
+    extern const q31_t sinTable_q31[FAST_MATH_TABLE_SIZE + 1];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_Q31) */
+  
+  #if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_Q15)
+    extern const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1];
+  #endif /* !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_ALL_FAST_TABLES) || defined(RISCV_TABLE_SIN_Q15) */
+
+#endif /* if !defined(RISCV_DSP_CONFIG_TABLES) || defined(RISCV_FAST_ALLOW_TABLES) */
+
+#endif /* _RISCV_COMMON_TABLES_H */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_const_structs.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_const_structs.h
new file mode 100644
index 00000000..471baef7
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_const_structs.h
@@ -0,0 +1,67 @@
+/* ----------------------------------------------------------------------
+ * Project:      NMSIS DSP Library
+ * Title:        riscv_const_structs.h
+ * Description:  Constant structs that are initialized for user convenience.
+ *               For example, some can be given as arguments to the riscv_cfft_f32() function.
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: RISC-V Cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _RISCV_CONST_STRUCTS_H
+#define _RISCV_CONST_STRUCTS_H
+
+#include "riscv_math.h"
+#include "riscv_common_tables.h"
+   /* Constant riscv_cfft_instance_f32 structures, one per length suffix (len16 ... len4096) */
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len16;
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len32;
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len64;
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len128;
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len256;
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len512;
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len1024;
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len2048;
+   extern const riscv_cfft_instance_f32 riscv_cfft_sR_f32_len4096;
+   /* Constant riscv_cfft_instance_q31 structures, one per length suffix (len16 ... len4096) */
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len16;
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len32;
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len64;
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len128;
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len256;
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len512;
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len1024;
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len2048;
+   extern const riscv_cfft_instance_q31 riscv_cfft_sR_q31_len4096;
+   /* Constant riscv_cfft_instance_q15 structures, one per length suffix (len16 ... len4096) */
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len16;
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len32;
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len64;
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len128;
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len256;
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len512;
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len1024;
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len2048;
+   extern const riscv_cfft_instance_q15 riscv_cfft_sR_q15_len4096;
+
+#endif /* _RISCV_CONST_STRUCTS_H */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_math.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_math.h
new file mode 100644
index 00000000..d561d102
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/DSP/Include/riscv_math.h
@@ -0,0 +1,7386 @@
+/******************************************************************************
+ * @file     riscv_math.h
+ * @brief    Public header file for NMSIS DSP Library
+ * @version  V1.6.0
+ * @date     18. March 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+   \mainpage NMSIS DSP Software Library
+   *
+   * Introduction
+   * ------------
+   *
+   * This user manual describes the NMSIS DSP software library,
+   * a suite of common signal processing functions for use on Nuclei N/NX processor based devices.
+   *
+   * The library is divided into a number of functions each covering a specific category:
+   * - Basic math functions
+   * - Fast math functions
+   * - Complex math functions
+   * - Filters
+   * - Matrix functions
+   * - Transform functions
+   * - Motor control functions
+   * - Statistical functions
+   * - Support functions
+   * - Interpolation functions
+   *
+   * The library has separate functions for operating on 8-bit integers, 16-bit integers,
+   * 32-bit integers and 32-bit floating-point values.
+   *
+   * The library functions are declared in the public file <code>riscv_math.h</code> which is placed in the <code>Include</code> folder.
+   * Simply include this file and link the appropriate library in the application and begin calling the library functions.
+   * The library provides a single public header file <code>riscv_math.h</code> for Nuclei N cores with little endian.
+   * Same header file will be used for floating point unit(FPU) variants.
+   *
+   * \note Please refer to [NMSIS-DSP](../../../dsp/index.html)
+   *
+   * Examples
+   * --------
+   *
+   * The library ships with a number of examples which demonstrate how to use the library functions.
+   *
+   * Toolchain Support
+   * -----------------
+   *
+   * The library has been developed and tested with nuclei riscv gcc toolchain.
+   *
+   * Building the Library
+   * --------------------
+   *
+   * The NMSIS repo contains a Makefile in the <code>NMSIS/</code> folder to rebuild the libraries with the nuclei riscv gcc toolchain.
+   * * In *NMSIS* folder, you can run `make gen_dsp_lib` to build and install DSP library into **NMSIS/Library/DSP/GCC** folder.
+   *
+   * Preprocessor Macros
+   * -------------------
+   *
+   * Each library project has different preprocessor macros.
+   *
+   * - RISCV_MATH_MATRIX_CHECK:
+   *
+   * Define macro RISCV_MATH_MATRIX_CHECK for checking on the input and output sizes of matrices
+   *
+   * - RISCV_MATH_ROUNDING:
+   *
+   * Define macro RISCV_MATH_ROUNDING for rounding on support functions
+   *
+   * - RISCV_MATH_LOOPUNROLL:
+   *
+   * Define macro RISCV_MATH_LOOPUNROLL to enable manual loop unrolling in DSP functions
+   *
+   */
+
+
+/**
+ * @defgroup groupMath Basic Math Functions
+ */
+
+/**
+ * @defgroup groupFastMath Fast Math Functions
+ * This set of functions provides a fast approximation to sine, cosine, and square root.
+ * As compared to most of the other functions in the NMSIS math library, the fast math functions
+ * operate on individual values and not arrays.
+ * There are separate functions for Q15, Q31, and floating-point data.
+ *
+ */
+
+/**
+ * @defgroup groupCmplxMath Complex Math Functions
+ * This set of functions operates on complex data vectors.
+ * The data in the complex arrays is stored in an interleaved fashion
+ * (real, imag, real, imag, ...).
+ * In the API functions, the number of samples in a complex array refers
+ * to the number of complex values; the array contains twice this number of
+ * real values.
+ */
+
+/**
+ * @defgroup groupFilters Filtering Functions
+ */
+
+/**
+ * @defgroup groupMatrix Matrix Functions
+ *
+ * This set of functions provides basic matrix math operations.
+ * The functions operate on matrix data structures.  For example,
+ * the type
+ * definition for the floating-point matrix structure is shown
+ * below:
+ * <pre>
+ *     typedef struct
+ *     {
+ *       uint16_t numRows;     // number of rows of the matrix.
+ *       uint16_t numCols;     // number of columns of the matrix.
+ *       float32_t *pData;     // points to the data of the matrix.
+ *     } riscv_matrix_instance_f32;
+ * </pre>
+ * There are similar definitions for Q15 and Q31 data types.
+ *
+ * The structure specifies the size of the matrix and then points to
+ * an array of data.  The array is of size <code>numRows X numCols</code>
+ * and the values are arranged in row order.  That is, the
+ * matrix element (i, j) is stored at:
+ * <pre>
+ *     pData[i*numCols + j]
+ * </pre>
+ *
+ * \par Init Functions
+ * There is an associated initialization function for each type of matrix
+ * data structure.
+ * The initialization function sets the values of the internal structure fields.
+ * Refer to \ref riscv_mat_init_f32(), \ref riscv_mat_init_q31() and \ref riscv_mat_init_q15()
+ * for floating-point, Q31 and Q15 types,  respectively.
+ *
+ * \par
+ * Use of the initialization function is optional. However, if initialization function is used
+ * then the instance structure cannot be placed into a const data section.
+ * To place the instance structure in a const data
+ * section, manually initialize the data structure.  For example:
+ * <pre>
+ * <code>riscv_matrix_instance_f32 S = {nRows, nColumns, pData};</code>
+ * <code>riscv_matrix_instance_q31 S = {nRows, nColumns, pData};</code>
+ * <code>riscv_matrix_instance_q15 S = {nRows, nColumns, pData};</code>
+ * </pre>
+ * where <code>nRows</code> specifies the number of rows, <code>nColumns</code>
+ * specifies the number of columns, and <code>pData</code> points to the
+ * data array.
+ *
+ * \par Size Checking
+ * By default all of the matrix functions perform size checking on the input and
+ * output matrices. For example, the matrix addition function verifies that the
+ * two input matrices and the output matrix all have the same number of rows and
+ * columns. If the size check fails the functions return:
+ * <pre>
+ *     RISCV_MATH_SIZE_MISMATCH
+ * </pre>
+ * Otherwise the functions return
+ * <pre>
+ *     RISCV_MATH_SUCCESS
+ * </pre>
+ * There is some overhead associated with this matrix size checking.
+ * The matrix size checking is enabled via the \#define
+ * <pre>
+ *     RISCV_MATH_MATRIX_CHECK
+ * </pre>
+ * within the library project settings.  By default this macro is defined
+ * and size checking is enabled. By changing the project settings and
+ * undefining this macro size checking is eliminated and the functions
+ * run a bit faster. With size checking disabled the functions always
+ * return <code>RISCV_MATH_SUCCESS</code>.
+ */
+
+/**
+ * @defgroup groupTransforms Transform Functions
+ */
+
+/**
+ * @defgroup groupController Controller Functions
+ */
+
+/**
+ * @defgroup groupStats Statistics Functions
+ */
+
+/**
+ * @defgroup groupSupport Support Functions
+ */
+
+/**
+ * @defgroup groupInterpolation Interpolation Functions
+ * These functions perform 1- and 2-dimensional interpolation of data.
+ * Linear interpolation is used for 1-dimensional data and
+ * bilinear interpolation is used for 2-dimensional data.
+ */
+
+/**
+ * @defgroup groupExamples Examples
+ */
+
+#ifndef _RISCV_MATH_H
+#define _RISCV_MATH_H
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/* Compiler specific diagnostic adjustment */
+/* Only the __GNUC__ branch does anything; the other recognised compilers get empty branches,
+   and an unrecognised compiler stops the build with #error. */
+#if   defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __GNUC__ )
+  /* Save the current diagnostic state, then silence three warnings for the code that follows.
+     NOTE(review): the matching "#pragma GCC diagnostic pop" is not visible in this part of the file. */
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wsign-conversion"
+  #pragma GCC diagnostic ignored "-Wconversion"
+  #pragma GCC diagnostic ignored "-Wunused-parameter"
+
+#elif defined ( __ICCRISCV__ )
+
+#elif defined ( __TI_RISCV__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+
+/* Included for intrinsics definitions */
+#if !defined ( _MSC_VER )
+
+/* __NMSIS_GENERIC (and __DSP_PRESENT when the DSP feature macro is 1) are defined before
+   nmsis_core.h is included. */
+#define __NMSIS_GENERIC
+#if (defined (__RISCV_FEATURE_DSP) && (__RISCV_FEATURE_DSP == 1))
+    #define __DSP_PRESENT   1
+#endif
+#include "nmsis_core.h"
+
+
+#else
+/* MSVC build: nmsis_core.h is not included, so stand-in definitions are provided here. */
+#include <stdint.h>
+#define __STATIC_FORCEINLINE static __forceinline
+#define __ALIGNED(x) __declspec(align(x))
+#define LOW_OPTIMIZATION_ENTER
+#define LOW_OPTIMIZATION_EXIT
+#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+#endif
+
+#include "string.h"
+#include <math.h>
+#include "float.h"
+
+/* evaluate RISCV DSP feature */
+/* RISCV_MATH_DSP is defined only when __RISCV_FEATURE_DSP is defined and equal to 1; further down,
+   "#if !defined (RISCV_MATH_DSP)" guards the plain-C versions of the DSP intrinsics. */
+#if (defined (__RISCV_FEATURE_DSP) && (__RISCV_FEATURE_DSP == 1))
+    #define RISCV_MATH_DSP
+#endif
+
+  /**
+   * @brief Macros required for reciprocal calculation in Normalized LMS
+   */
+
+/* NOTE(review): the uses of DELTA_Q31 / DELTA_Q15 are not visible in this part of the file. */
+#define DELTA_Q31          (0x100)
+#define DELTA_Q15          0x5
+/* Keeps the low 6 bits; riscv_recip_q31() and riscv_recip_q15() apply it to their table index. */
+#define INDEX_MASK         0x0000003F
+/* Single-precision pi; only defined when PI has not already been defined. */
+#ifndef PI
+  #define PI               3.14159265358979f
+#endif
+
+  /**
+   * @brief Macros required for SINE and COSINE Fast math approximations
+   *
+   * Numeric values: FAST_MATH_Q31_SHIFT = 22, FAST_MATH_Q15_SHIFT = 6, CONTROLLER_Q31_SHIFT = 23,
+   * TABLE_SPACING_Q31 = 1 << 22, TABLE_SPACING_Q15 = 1 << 7.
+   */
+
+#define FAST_MATH_TABLE_SIZE  512
+#define FAST_MATH_Q31_SHIFT   (32 - 10)
+#define FAST_MATH_Q15_SHIFT   (16 - 10)
+#define CONTROLLER_Q31_SHIFT  (32 - 9)
+#define TABLE_SPACING_Q31     0x400000
+#define TABLE_SPACING_Q15     0x80
+
+  /**
+   * @brief Macros required for SINE and COSINE Controller functions
+   */
+  /* 1.31(q31) Fixed value of 2/360 */
+  /* -1 to +1 is divided into 360 values so total spacing is (2/360) */
+  /* 0xB60B61 = 11930465, i.e. (2/360) * 2^31 rounded to the nearest integer. */
+#define INPUT_SPACING         0xB60B61
+
+
+  /**
+   * @brief Error status returned by some functions in the library.
+   *
+   * RISCV_MATH_SUCCESS is 0; every error code is a distinct negative value.
+   */
+
+  typedef enum
+  {
+    RISCV_MATH_SUCCESS        =  0,        /**< No error */
+    RISCV_MATH_ARGUMENT_ERROR = -1,        /**< One or more arguments are incorrect */
+    RISCV_MATH_LENGTH_ERROR   = -2,        /**< Length of data buffer is incorrect */
+    RISCV_MATH_SIZE_MISMATCH  = -3,        /**< Size of matrices is not compatible with the operation */
+    RISCV_MATH_NANINF         = -4,        /**< Not-a-number (NaN) or infinity is generated */
+    RISCV_MATH_SINGULAR       = -5,        /**< Input matrix is singular and cannot be inverted */
+    RISCV_MATH_TEST_FAILURE   = -6         /**< Test Failed */
+  } riscv_status;
+
+  /**
+   * @brief 8-bit fractional data type in 1.7 format (stored as int8_t).
+   */
+  typedef int8_t q7_t;
+
+  /**
+   * @brief 16-bit fractional data type in 1.15 format (stored as int16_t).
+   */
+  typedef int16_t q15_t;
+
+  /**
+   * @brief 32-bit fractional data type in 1.31 format (stored as int32_t).
+   */
+  typedef int32_t q31_t;
+
+  /**
+   * @brief 64-bit fractional data type in 1.63 format (stored as int64_t).
+   */
+  typedef int64_t q63_t;
+
+  /**
+   * @brief 32-bit floating-point type definition (alias of float).
+   */
+  typedef float float32_t;
+
+  /**
+   * @brief 64-bit floating-point type definition (alias of double).
+   */
+  typedef double float64_t;
+
+
+/**
+  @brief definition to read/write two 16 bit values.
+  @deprecated
+ */
+#define __SIMD32_TYPE int32_t
+
+/* __SIMD32(addr):       views the pointer variable addr itself as an lvalue of type __SIMD32_TYPE *. */
+/* __SIMD32_CONST(addr): converts the pointer value addr to __SIMD32_TYPE *. */
+/* _SIMD32_OFFSET(addr): dereferences addr as a __SIMD32_TYPE object. */
+/* __SIMD64(addr):       like __SIMD32, but the pointer variable is viewed as int64_t *. */
+#define __SIMD32(addr)        (*(__SIMD32_TYPE **) & (addr))
+#define __SIMD32_CONST(addr)  ( (__SIMD32_TYPE * )   (addr))
+#define _SIMD32_OFFSET(addr)  (*(__SIMD32_TYPE * )   (addr))
+#define __SIMD64(addr)        (*(      int64_t **) & (addr))
+
+/* SIMD replacement */
+
+/**
+  @brief         Load two adjacent Q31 elements as one 64-bit value, then advance the pointer.
+  @param[in,out] pQ31      address of the read pointer; moved forward by two Q31 elements
+  @return        the 8 bytes found at the original pointer position, as a Q63 value
+ */
+__STATIC_FORCEINLINE q63_t read_q31x2_ia (
+  q31_t ** pQ31)
+{
+  q31_t *src = *pQ31;
+  q63_t packed;
+
+#if defined(RISCV_ALIGN_ACCESS)
+  /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+  memcpy((void *)(&packed), (void *)(src), 8);
+#elif __RISCV_XLEN == 64
+  packed = __LD(src);
+#else
+  packed = *((q63_t *)src);
+#endif
+
+  *pQ31 = src + 2;
+  return (packed);
+}
+
+/**
+  @brief         Load two adjacent Q31 elements as one 64-bit value, then step the pointer back.
+  @param[in,out] pQ31      address of the read pointer; moved backward by two Q31 elements
+  @return        the 8 bytes found at the original pointer position, as a Q63 value
+ */
+__STATIC_FORCEINLINE q63_t read_q31x2_da (
+  q31_t ** pQ31)
+{
+  q31_t *src = *pQ31;
+  q63_t packed;
+
+#if defined(RISCV_ALIGN_ACCESS)
+  /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+  memcpy((void *)(&packed), (void *)(src), 8);
+#elif __RISCV_XLEN == 64
+  packed = __LD(src);
+#else
+  packed = *((q63_t *)src);
+#endif
+
+  *pQ31 = src - 2;
+  return (packed);
+}
+
+/**
+  @brief         Load two adjacent Q31 elements as one 64-bit value.
+  @param[in]     pQ31      points to the first of the two elements
+  @return        the 8 bytes at pQ31, as a Q63 value
+ */
+__STATIC_FORCEINLINE q63_t read_q31x2 (
+  q31_t * pQ31)
+{
+  q63_t packed;
+
+#if defined(RISCV_ALIGN_ACCESS)
+  /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+  memcpy((void *)(&packed), (void *)(pQ31), 8);
+#elif __RISCV_XLEN == 64
+  packed = __LD(pQ31);
+#else
+  packed = *((q63_t *)pQ31);
+#endif
+
+  return (packed);
+}
+
+/**
+  @brief         Store a 64-bit value over two adjacent Q31 elements, then advance the pointer.
+  @param[in,out] pQ31      address of the write pointer; moved forward by two Q31 elements
+  @param[in]     value     Q63 value whose 8 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q31x2_ia (
+        q31_t ** pQ31,
+        q63_t    value)
+{
+#if defined(RISCV_ALIGN_ACCESS)
+  /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+  memcpy((void *)(*pQ31), (void *)(&value), 8);
+#elif __RISCV_XLEN == 64
+  __SD(*pQ31, value);
+#else
+  *((q63_t *)*pQ31) = value;
+#endif
+
+  /* Step past the two elements just written. */
+  *pQ31 = *pQ31 + 2;
+}
+
+/**
+  @brief         Store a 64-bit value over two adjacent Q31 elements.
+  @param[in]     pQ31      points to the first destination element
+  @param[in]     value     Q63 value whose 8 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q31x2 (
+        q31_t * pQ31,
+        q63_t value)
+{
+#if defined(RISCV_ALIGN_ACCESS)
+  /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+  memcpy((void *)(pQ31), (void *)(&value), 8);
+#elif __RISCV_XLEN == 64
+  __SD(pQ31, value);
+#else
+  *((q63_t *)pQ31) = value;
+#endif
+}
+
+/**
+  @brief         Load two adjacent Q15 elements as one 32-bit value.
+  @param[in]     pQ15      points to the first of the two elements
+  @return        the 4 bytes at pQ15, as a Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2 (
+  q15_t * pQ15)
+{
+  q31_t word;
+
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 4 bytes with memcpy instead of a single word load. */
+  memcpy((void *)(&word), (void *)(pQ15), 4);
+#else
+  /* One explicit "lw" from the given address. */
+  __ASM volatile (
+    "lw %0, (%1)"
+    :"=r"(word)
+    :"r"(pQ15)
+  );
+#endif
+
+  return (word);
+}
+
+/**
+  @brief         Load two adjacent Q15 elements as one 32-bit value, then advance the pointer.
+  @param[in,out] pQ15      address of the read pointer; moved forward by two Q15 elements
+  @return        the 4 bytes found at the original pointer position, as a Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2_ia (
+  q15_t ** pQ15)
+{
+  q15_t *src = *pQ15;
+  q31_t word;
+
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 4 bytes with memcpy instead of a single word load. */
+  memcpy((void *)(&word), (void *)(src), 4);
+#else
+  /* One explicit "lw" from the current pointer position. */
+  __ASM volatile (
+    "lw %0, (%1)"
+    :"=r"(word)
+    :"r"(src)
+  );
+#endif
+
+  *pQ15 = src + 2;
+  return (word);
+}
+
+/**
+  @brief         Load four adjacent Q15 elements as one 64-bit value, then advance the pointer.
+  @param[in,out] pQ15      address of the read pointer; moved forward by four Q15 elements
+  @return        the 8 bytes found at the original pointer position, as a Q63 value
+ */
+__STATIC_FORCEINLINE q63_t read_q15x4_ia (
+        q15_t ** pQ15)
+{
+  q15_t *src = *pQ15;
+  q63_t packed;
+
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+  memcpy((void *)(&packed), (void *)(src), 8);
+#else
+  packed = *((q63_t *)src);
+#endif
+
+  *pQ15 = src + 4;
+  return (packed);
+}
+
+/**
+  @brief         Load four adjacent Q15 elements as one 64-bit value.
+  @param[in]     pQ15      points to the first of the four elements
+  @return        the 8 bytes at pQ15, as a Q63 value
+ */
+__STATIC_FORCEINLINE q63_t read_q15x4 (
+        q15_t * pQ15)
+{
+  q63_t packed;
+
+#if defined(RISCV_ALIGN_ACCESS)
+  /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+  memcpy((void *)(&packed), (void *)(pQ15), 8);
+#elif __RISCV_XLEN == 64
+  packed = __LD(pQ15);
+#else
+  packed = *((q63_t *)pQ15);
+#endif
+
+  return (packed);
+}
+
+/**
+  @brief         Load two adjacent Q15 elements as one 32-bit value, then step the pointer back.
+  @param[in,out] pQ15      address of the read pointer; moved backward by two Q15 elements
+  @return        the 4 bytes found at the original pointer position, as a Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2_da (
+  q15_t ** pQ15)
+{
+  q15_t *src = *pQ15;
+  q31_t word;
+
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 4 bytes with memcpy instead of a single word load. */
+  memcpy((void *)(&word), (void *)(src), 4);
+#else
+  /* One explicit "lw" from the current pointer position. */
+  __ASM volatile (
+    "lw %0, (%1)"
+    :"=r"(word)
+    :"r"(src)
+  );
+#endif
+
+  *pQ15 = src - 2;
+  return (word);
+}
+
+/**
+  @brief         Load four adjacent Q15 elements as one 64-bit value, then step the pointer back.
+  @param[in,out] pQ15      address of the read pointer; moved backward by four Q15 elements
+  @return        the 8 bytes found at the original pointer position, as a Q63 value
+ */
+__STATIC_FORCEINLINE q63_t read_q15x4_da (
+        q15_t ** pQ15)
+{
+    q15_t *src = *pQ15;
+    q63_t packed;
+
+#ifdef RISCV_ALIGN_ACCESS
+    /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+    memcpy((void *)(&packed), (void *)(src), 8);
+#else
+    packed = *((q63_t *)src);
+#endif
+
+    *pQ15 = src - 4;
+    return (packed);
+}
+
+/**
+  @brief         Store a 32-bit value over two adjacent Q15 elements, then advance the pointer.
+  @param[in,out] pQ15      address of the write pointer; moved forward by two Q15 elements
+  @param[in]     value     Q31 value whose 4 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x2_ia (
+  q15_t ** pQ15,
+  q31_t    value)
+{
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 4 bytes with memcpy instead of a single word store. */
+  memcpy((void *)(*pQ15), (void *)(&value), 4);
+#else
+  /* One explicit "sw" to the current pointer position. */
+  __ASM volatile (
+    "sw %0, (%1)"
+    :
+    :"r"(value), "r"(*pQ15)
+    :"memory"
+  );
+#endif
+
+  /* Step past the two elements just written. */
+  *pQ15 = *pQ15 + 2;
+}
+
+/**
+  @brief         Store a 64-bit value over four adjacent Q15 elements, then advance the pointer.
+  @param[in,out] pQ15      address of the write pointer; moved forward by four Q15 elements
+  @param[in]     value     Q63 value whose 8 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x4_ia (
+        q15_t ** pQ15,
+        q63_t    value)
+{
+#ifdef RISCV_ALIGN_ACCESS
+    /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+    memcpy((void *)(*pQ15), (void *)(&value), 8);
+#else
+    *((q63_t *)*pQ15) = value;
+#endif
+
+    /* Step past the four elements just written. */
+    *pQ15 = *pQ15 + 4;
+}
+
+/**
+  @brief         Store a 64-bit value over four adjacent Q15 elements, then step the pointer back.
+  @param[in,out] pQ15      address of the write pointer; moved backward by four Q15 elements
+  @param[in]     value     Q63 value whose 8 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x4_da (
+        q15_t ** pQ15,
+        q63_t    value)
+{
+#ifdef RISCV_ALIGN_ACCESS
+    /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+    memcpy((void *)(*pQ15), (void *)(&value), 8);
+#else
+    *((q63_t *)*pQ15) = value;
+#endif
+
+    /* Move to the group of four elements that precedes the one just written. */
+    *pQ15 = *pQ15 - 4;
+}
+
+/**
+  @brief         Store a 32-bit value over two adjacent Q15 elements.
+  @param[in]     pQ15      points to the first destination element
+  @param[in]     value     Q31 value whose 4 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x2 (
+  q15_t * pQ15,
+  q31_t   value)
+{
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 4 bytes with memcpy instead of a single word store. */
+  memcpy((void *)(pQ15), (void *)(&value), 4);
+#else
+  /* One explicit "sw" to the given address. */
+  __ASM volatile (
+    "sw %0, (%1)"
+    :
+    :"r"(value), "r"(pQ15)
+    :"memory"
+  );
+#endif
+}
+
+/**
+  @brief         Store a 64-bit value over four adjacent Q15 elements.
+  @param[in]     pQ15      points to the first destination element
+  @param[in]     value     Q63 value whose 8 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x4 (
+        q15_t * pQ15,
+        q63_t   value)
+{
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+  memcpy((void *)(pQ15), (void *)(&value), 8);
+#else
+  *((q63_t *)pQ15) = value;
+#endif
+}
+
+/**
+  @brief         Load eight adjacent Q7 elements as one 64-bit value, then advance the pointer.
+  @param[in,out] pQ7       address of the read pointer; moved forward by eight Q7 elements
+  @return        the 8 bytes found at the original pointer position, as a Q63 value
+ */
+__STATIC_FORCEINLINE q63_t read_q7x8_ia (
+        q7_t ** pQ7)
+{
+    q7_t *src = *pQ7;
+    q63_t packed;
+
+#ifdef RISCV_ALIGN_ACCESS
+    /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+    memcpy((void *)(&packed), (void *)(src), 8);
+#else
+    packed = *((q63_t *)src);
+#endif
+
+    *pQ7 = src + 8;
+    return packed;
+}
+
+/**
+  @brief         Load eight adjacent Q7 elements as one 64-bit value, then step the pointer back.
+  @param[in,out] pQ7       address of the read pointer; moved backward by eight Q7 elements
+  @return        the 8 bytes found at the original pointer position, as a Q63 value
+ */
+__STATIC_FORCEINLINE q63_t read_q7x8_da (
+        q7_t ** pQ7)
+{
+    q7_t *src = *pQ7;
+    q63_t packed;
+
+#ifdef RISCV_ALIGN_ACCESS
+    /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+    memcpy((void *)(&packed), (void *)(src), 8);
+#else
+    packed = *((q63_t *)src);
+#endif
+
+    *pQ7 = src - 8;
+    return packed;
+}
+
+/**
+  @brief         Load four adjacent Q7 elements as one 32-bit value, then advance the pointer.
+  @param[in,out] pQ7       address of the read pointer; moved forward by four Q7 elements
+  @return        the 4 bytes found at the original pointer position, as a Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q7x4_ia (
+  q7_t ** pQ7)
+{
+  q7_t *src = *pQ7;
+  q31_t word;
+
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 4 bytes with memcpy instead of a single word load. */
+  memcpy((void *)(&word), (void *)(src), 4);
+#else
+  /* One explicit "lw" from the current pointer position. */
+  __ASM volatile (
+    "lw %0, (%1)"
+    :"=r"(word)
+    :"r"(src)
+  );
+#endif
+
+  *pQ7 = src + 4;
+  return (word);
+}
+
+/**
+  @brief         Load four adjacent Q7 elements as one 32-bit value, then step the pointer back.
+  @param[in,out] pQ7       address of the read pointer; moved backward by four Q7 elements
+  @return        the 4 bytes found at the original pointer position, as a Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q7x4_da (
+  q7_t ** pQ7)
+{
+  q7_t *src = *pQ7;
+  q31_t word;
+
+#ifdef RISCV_ALIGN_ACCESS
+  /* Copy the 4 bytes with memcpy instead of a single word load. */
+  memcpy((void *)(&word), (void *)(src), 4);
+#else
+  /* One explicit "lw" from the current pointer position. */
+  __ASM volatile (
+    "lw %0, (%1)"
+    :"=r"(word)
+    :"r"(src)
+  );
+#endif
+
+  *pQ7 = src - 4;
+  return (word);
+}
+
+/**
+  @brief         Store a 64-bit value over eight adjacent Q7 elements, then advance the pointer.
+  @param[in,out] pQ7       address of the write pointer; moved forward by eight Q7 elements
+  @param[in]     value     Q63 value whose 8 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q7x8_ia (
+        q7_t ** pQ7,
+        q63_t   value)
+{
+#ifdef RISCV_ALIGN_ACCESS
+    /* Copy the 8 bytes with memcpy instead of a direct 64-bit access. */
+    memcpy((void *)(*pQ7), (void *)(&value), 8);
+#else
+    *((q63_t *)*pQ7) = value;
+#endif
+
+    /* Step past the eight elements just written. */
+    *pQ7 = *pQ7 + 8;
+}
+
+/**
+  @brief         Store a 32-bit value over four adjacent Q7 elements, then advance the pointer.
+  @param[in,out] pQ7       address of the write pointer; moved forward by four Q7 elements
+  @param[in]     value     Q31 value whose 4 bytes are stored
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q7x4_ia (
+  q7_t ** pQ7,
+  q31_t   value)
+{
+#ifndef RISCV_ALIGN_ACCESS
+  /* One explicit "sw" to the current pointer position. */
+  __ASM volatile (
+    "sw %0, (%1)"
+    :
+    :"r"(value), "r"(*pQ7)
+    :"memory"
+  );
+#else
+  /* Copy the 4 bytes with memcpy instead of a single word store. */
+  memcpy((void *)(*pQ7), (void *)(&value), 4);
+#endif
+  *pQ7 += 4;
+}
+
+/**
+ * @brief definition to pack four 8 bit values.
+ *
+ * Each argument is converted to int32_t, shifted into its byte position (v0 -> bits 7..0,
+ * v1 -> bits 15..8, v2 -> bits 23..16, v3 -> bits 31..24) and masked to that byte; the four
+ * masked values are OR-ed together. Every argument appears exactly once in the expansion.
+ */
+#define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) <<  0) & (int32_t)0x000000FF) | \
+                              (((int32_t)(v1) <<  8) & (int32_t)0x0000FF00) | \
+                              (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \
+                              (((int32_t)(v3) << 24) & (int32_t)0xFF000000)  )
+
+
+
+  /**
+   * @brief Clips Q63 to Q31 values.
+   *
+   * If x is representable in 32 signed bits (its upper word equals the sign
+   * extension of its lower word) the lower word is returned unchanged; otherwise
+   * the result is 0x7FFFFFFF for positive x and 0x80000000 for negative x.
+   */
+  __STATIC_FORCEINLINE q31_t clip_q63_to_q31(
+  q63_t x)
+  {
+    const q31_t upperWord = (q31_t) (x >> 32);
+    const q31_t lowerWord = (q31_t) x;
+
+    if (upperWord == (lowerWord >> 31))
+    {
+      return lowerWord;
+    }
+
+    /* x >> 63 is 0 or -1, so the XOR yields the largest or the smallest Q31 value. */
+    return (0x7FFFFFFF ^ ((q31_t) (x >> 63)));
+  }
+
+  /**
+   * @brief Clips Q63 to Q15 values.
+   *
+   * If x is representable in 32 signed bits, the result is (x >> 15) converted to
+   * q15_t; otherwise it is 0x7FFF for positive x and 0x8000 for negative x.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q63_to_q15(
+  q63_t x)
+  {
+    const q31_t upperWord = (q31_t) (x >> 32);
+    const q31_t lowerWord = (q31_t) x;
+
+    if (upperWord == (lowerWord >> 31))
+    {
+      return (q15_t) (x >> 15);
+    }
+
+    /* x >> 63 is 0 or -1, so the XOR yields the largest or the smallest Q15 value. */
+    return (0x7FFF ^ ((q15_t) (x >> 63)));
+  }
+
+  /**
+   * @brief Clips Q31 to Q7 values.
+   *
+   * Returns x unchanged when it lies in [-128, 127]; otherwise returns 0x7F for
+   * positive x and 0x80 (-128) for negative x.
+   *
+   * (x >> 8) equals (x >> 7) only when x >> 7 is 0 or -1, i.e. exactly when x fits
+   * in 8 signed bits. This mirrors the 16/15 shift pair used by clip_q31_to_q15().
+   */
+  __STATIC_FORCEINLINE q7_t clip_q31_to_q7(
+  q31_t x)
+  {
+    return ((q31_t) (x >> 8) != ((q31_t) x >> 7)) ?
+      ((0x7F ^ ((q7_t) (x >> 31)))) : (q7_t) x;
+  }
+
+  /**
+   * @brief Clips Q31 to Q15 values.
+   *
+   * Returns x unchanged when it is representable in 16 signed bits; otherwise
+   * returns 0x7FFF for positive x and 0x8000 for negative x.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q31_to_q15(
+  q31_t x)
+  {
+    /* The two shifts agree only when x >> 15 is 0 or -1, i.e. x fits in 16 bits. */
+    if ((q31_t) (x >> 16) == ((q31_t) x >> 15))
+    {
+      return (q15_t) x;
+    }
+
+    /* x >> 31 is 0 or -1, so the XOR yields the largest or the smallest Q15 value. */
+    return (0x7FFF ^ ((q15_t) (x >> 31)));
+  }
+
+  /**
+   * @brief Multiplies 32 X 64 and returns 32 bit result in 2.30 format.
+   *
+   * Computes ((low 32 bits of x) * y >> 32) + ((x >> 32) * y), with both products
+   * evaluated in 64-bit arithmetic. The value is returned as q63_t.
+   */
+  __STATIC_FORCEINLINE q63_t mult32x64(
+  q63_t x,
+  q31_t y)
+  {
+    const q63_t lowTerm  = ((q63_t) (x & 0x00000000FFFFFFFF) * y) >> 32;
+    const q63_t highTerm = ((q63_t) (x >> 32) * y);
+
+    return (lowTerm + highTerm);
+  }
+
+  /**
+   * @brief Function to calculate 1/in (reciprocal) value of Q31 Data type.
+   *
+   * @param[in]  in           input value
+   * @param[out] dst          receives the refined reciprocal estimate
+   * @param[in]  pRecipTable  table of initial estimates, indexed with a value masked by INDEX_MASK
+   * @return     signBits + 1, where signBits is (__CLZ of |in|) - 1
+   *
+   * NOTE(review): for in == 0 the result depends on what __CLZ(0) returns, and for the most
+   * negative input "-in" is not representable; neither case is handled here - confirm callers
+   * never pass them.
+   */
+  __STATIC_FORCEINLINE uint32_t riscv_recip_q31(
+        q31_t in,
+        q31_t * dst,
+  const q31_t * pRecipTable)
+  {
+    q31_t out;
+    uint32_t tempVal;
+    uint32_t index, i;
+    uint32_t signBits;
+
+    /* Count the leading zero bits of the magnitude, excluding the sign bit position. */
+    if (in > 0)
+    {
+      signBits = ((uint32_t) (__CLZ( in) - 1));
+    }
+    else
+    {
+      signBits = ((uint32_t) (__CLZ(-in) - 1));
+    }
+
+    /* Convert input sample to 1.31 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated Val */
+    /* The top byte of the shifted input, reduced to 6 bits by INDEX_MASK. */
+    index = (uint32_t)(in >> 24);
+    index = (index & INDEX_MASK);
+
+    /* 1.31 with exp 1 */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    /* Each pass computes out = out * (0x7FFFFFFF - in * out), with the shifts shown
+       (presumably a Newton-Raphson style refinement - confirm against the algorithm notes). */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q63_t) in * out) >> 31);
+      tempVal = 0x7FFFFFFFu - tempVal;
+      /*      1.31 with exp 1 */
+      /* out = (q31_t) (((q63_t) out * tempVal) >> 30); */
+      out = clip_q63_to_q31(((q63_t) out * tempVal) >> 30);
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1U);
+  }
+
+
+  /**
+   * @brief Function to calculate 1/in (reciprocal) value of Q15 Data type.
+   *
+   * @param[in]  in           input value
+   * @param[out] dst          receives the refined reciprocal estimate
+   * @param[in]  pRecipTable  table of initial estimates, indexed with a value masked by INDEX_MASK
+   * @return     signBits + 1, where signBits is (__CLZ of |in|) - 17
+   *
+   * NOTE(review): for in == 0 the result depends on what __CLZ(0) returns; that case is not
+   * handled here - confirm callers never pass it.
+   */
+  __STATIC_FORCEINLINE uint32_t riscv_recip_q15(
+        q15_t in,
+        q15_t * dst,
+  const q15_t * pRecipTable)
+  {
+    q15_t out = 0;
+    uint32_t tempVal = 0;
+    uint32_t index = 0, i = 0;
+    uint32_t signBits = 0;
+
+    /* __CLZ is applied to the int-promoted value, so 17 is subtracted: the 16 extra
+       upper bits plus the sign bit position. */
+    if (in > 0)
+    {
+      signBits = ((uint32_t)(__CLZ( in) - 17));
+    }
+    else
+    {
+      signBits = ((uint32_t)(__CLZ(-in) - 17));
+    }
+
+    /* Convert input sample to 1.15 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated Val */
+    /* The top byte of the shifted input, reduced to 6 bits by INDEX_MASK. */
+    index = (uint32_t)(in >>  8);
+    index = (index & INDEX_MASK);
+
+    /*      1.15 with exp 1  */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    /* Unlike the Q31 version, the result of each pass is truncated to q15_t, not clipped. */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q31_t) in * out) >> 15);
+      tempVal = 0x7FFFu - tempVal;
+      /*      1.15 with exp 1 */
+      out = (q15_t) (((q31_t) out * tempVal) >> 14);
+      /* out = clip_q31_to_q15(((q31_t) out * tempVal) >> 14); */
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1);
+  }
+
+
+/*
+ * @brief C custom defined intrinsic functions
+ */
+#if !defined (RISCV_MATH_DSP)
+
+  /*
+   * @brief C custom defined QADD8
+   *
+   * Treats x and y as four signed 8-bit lanes. Each pair of lanes is added, passed
+   * through __SSAT(..., 8), masked to one byte and packed back into its position.
+   */
+  __STATIC_FORCEINLINE uint32_t __QADD8(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xs = (q31_t)x;
+    const q31_t ys = (q31_t)y;
+
+    /* Lane extraction: shift the byte to the top, then shift back down by 24. */
+    const q31_t lane0 = __SSAT((((xs << 24) >> 24) + ((ys << 24) >> 24)), 8) & (int32_t)0x000000FF;
+    const q31_t lane1 = __SSAT((((xs << 16) >> 24) + ((ys << 16) >> 24)), 8) & (int32_t)0x000000FF;
+    const q31_t lane2 = __SSAT((((xs <<  8) >> 24) + ((ys <<  8) >> 24)), 8) & (int32_t)0x000000FF;
+    const q31_t lane3 = __SSAT((((xs      ) >> 24) + ((ys      ) >> 24)), 8) & (int32_t)0x000000FF;
+
+    return ((uint32_t)((lane3 << 24) | (lane2 << 16) | (lane1 <<  8) | (lane0      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB8
+   *
+   * Treats x and y as four signed 8-bit lanes. Each y lane is subtracted from the
+   * matching x lane, passed through __SSAT(..., 8), masked to one byte and packed
+   * back into its position.
+   */
+  __STATIC_FORCEINLINE uint32_t __QSUB8(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xs = (q31_t)x;
+    const q31_t ys = (q31_t)y;
+
+    /* Lane extraction: shift the byte to the top, then shift back down by 24. */
+    const q31_t lane0 = __SSAT((((xs << 24) >> 24) - ((ys << 24) >> 24)), 8) & (int32_t)0x000000FF;
+    const q31_t lane1 = __SSAT((((xs << 16) >> 24) - ((ys << 16) >> 24)), 8) & (int32_t)0x000000FF;
+    const q31_t lane2 = __SSAT((((xs <<  8) >> 24) - ((ys <<  8) >> 24)), 8) & (int32_t)0x000000FF;
+    const q31_t lane3 = __SSAT((((xs      ) >> 24) - ((ys      ) >> 24)), 8) & (int32_t)0x000000FF;
+
+    return ((uint32_t)((lane3 << 24) | (lane2 << 16) | (lane1 <<  8) | (lane0      )));
+  }
+
+
+  /*
+   * @brief C custom defined QADD16
+   *
+   * Treats x and y as two signed 16-bit lanes. Matching lanes are added, passed
+   * through __SSAT(..., 16), masked to 16 bits and packed back together.
+   */
+  __STATIC_FORCEINLINE uint32_t __QADD16(
+  uint32_t x,
+  uint32_t y)
+  {
+    /* Upstream note: with armCC, leaving the lane results uninitialised made the
+       'riscv_offset_q15' test fail; here every local is initialised at its declaration. */
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    const q31_t low  = __SSAT((xLow  + yLow ), 16) & (int32_t)0x0000FFFF;
+    const q31_t high = __SSAT((xHigh + yHigh), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((high << 16) | (low      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHADD16
+   *
+   * Treats x and y as two signed 16-bit lanes. Matching lanes are added, the sum
+   * is shifted right by one, masked to 16 bits and packed back together.
+   */
+  __STATIC_FORCEINLINE uint32_t __SHADD16(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    const q31_t low  = ((xLow  + yLow ) >> 1) & (int32_t)0x0000FFFF;
+    const q31_t high = ((xHigh + yHigh) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((high << 16) | (low      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB16
+   *
+   * Treats x and y as two signed 16-bit lanes. Each y lane is subtracted from the
+   * matching x lane, passed through __SSAT(..., 16), masked to 16 bits and packed
+   * back together.
+   */
+  __STATIC_FORCEINLINE uint32_t __QSUB16(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    const q31_t low  = __SSAT((xLow  - yLow ), 16) & (int32_t)0x0000FFFF;
+    const q31_t high = __SSAT((xHigh - yHigh), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((high << 16) | (low      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHSUB16
+   *
+   * Treats x and y as two signed 16-bit lanes. Each y lane is subtracted from the
+   * matching x lane, the difference is shifted right by one, masked to 16 bits and
+   * packed back together.
+   */
+  __STATIC_FORCEINLINE uint32_t __SHSUB16(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    const q31_t low  = ((xLow  - yLow ) >> 1) & (int32_t)0x0000FFFF;
+    const q31_t high = ((xHigh - yHigh) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((high << 16) | (low      )));
+  }
+
+
+  /*
+   * @brief C custom defined QASX
+   *
+   * Cross-lane operation on two signed 16-bit lanes:
+   *   low  result = __SSAT(x.low  - y.high, 16)
+   *   high result = __SSAT(x.high + y.low,  16)
+   * Both are masked to 16 bits and packed back together.
+   */
+  __STATIC_FORCEINLINE uint32_t __QASX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    const q31_t low  = __SSAT((xLow  - yHigh), 16) & (int32_t)0x0000FFFF;
+    const q31_t high = __SSAT((xHigh + yLow ), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((high << 16) | (low      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHASX
+   *
+   * Cross-lane operation on two signed 16-bit lanes:
+   *   low  result = (x.low  - y.high) >> 1
+   *   high result = (x.high + y.low ) >> 1
+   * Both are masked to 16 bits and packed back together.
+   */
+  __STATIC_FORCEINLINE uint32_t __SHASX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    const q31_t low  = ((xLow  - yHigh) >> 1) & (int32_t)0x0000FFFF;
+    const q31_t high = ((xHigh + yLow ) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((high << 16) | (low      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSAX
+   *
+   * Cross-lane operation on two signed 16-bit lanes:
+   *   low  result = __SSAT(x.low  + y.high, 16)
+   *   high result = __SSAT(x.high - y.low,  16)
+   * Both are masked to 16 bits and packed back together.
+   */
+  __STATIC_FORCEINLINE uint32_t __QSAX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    const q31_t low  = __SSAT((xLow  + yHigh), 16) & (int32_t)0x0000FFFF;
+    const q31_t high = __SSAT((xHigh - yLow ), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((high << 16) | (low      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHSAX
+   *
+   * Cross-lane operation on two signed 16-bit lanes:
+   *   low  result = (x.low  + y.high) >> 1
+   *   high result = (x.high - y.low ) >> 1
+   * Both are masked to 16 bits and packed back together.
+   */
+  __STATIC_FORCEINLINE uint32_t __SHSAX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    const q31_t low  = ((xLow  + yHigh) >> 1) & (int32_t)0x0000FFFF;
+    const q31_t high = ((xHigh - yLow ) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((high << 16) | (low      )));
+  }
+
+
+  /*
+   * @brief C custom defined SMUSDX
+   *
+   * Treats x and y as two signed 16-bit lanes and returns
+   * (x.low * y.high) - (x.high * y.low) as a 32-bit value.
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSDX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    return ((uint32_t)((xLow * yHigh) - (xHigh * yLow)));
+  }
+
+  /*
+   * @brief C custom defined SMUADX
+   *
+   * Treats x and y as two signed 16-bit lanes and returns
+   * (x.low * y.high) + (x.high * y.low) as a 32-bit value.
+   *
+   * Each product fits in 32 signed bits, but their sum does not when all four lanes
+   * are 0x8000 (2^30 + 2^30). The sum is therefore formed in uint32_t, where it
+   * wraps instead of overflowing a signed int.
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUADX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    return ((uint32_t)(xLow * yHigh) + (uint32_t)(xHigh * yLow));
+  }
+
+
+  /*
+   * @brief C custom defined QADD
+   *
+   * Adds x and y in 64-bit arithmetic and clips the sum to the Q31 range
+   * with clip_q63_to_q31().
+   */
+  __STATIC_FORCEINLINE int32_t __QADD(
+  int32_t x,
+  int32_t y)
+  {
+    const q63_t wideSum = (q63_t)x + (q31_t)y;
+
+    return ((int32_t)(clip_q63_to_q31(wideSum)));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB
+   *
+   * Subtracts y from x in 64-bit arithmetic and clips the difference to the Q31
+   * range with clip_q63_to_q31().
+   */
+  __STATIC_FORCEINLINE int32_t __QSUB(
+  int32_t x,
+  int32_t y)
+  {
+    const q63_t wideDiff = (q63_t)x - (q31_t)y;
+
+    return ((int32_t)(clip_q63_to_q31(wideDiff)));
+  }
+
+
+  /*
+   * @brief C custom defined SMLAD
+   *
+   * Treats x and y as two signed 16-bit lanes and returns
+   * (x.low * y.low) + (x.high * y.high) + sum as a 32-bit value.
+   *
+   * The accumulation is done in uint32_t so that a result outside the signed 32-bit
+   * range wraps modulo 2^32 instead of overflowing a signed int; for every input
+   * that does not overflow, the returned bits are the same as with signed addition.
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLAD(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    return ((uint32_t)(xLow * yLow) + (uint32_t)(xHigh * yHigh) + sum);
+  }
+
+
+  /*
+   * @brief C custom defined SMLADX
+   *
+   * Treats x and y as two signed 16-bit lanes and returns
+   * (x.low * y.high) + (x.high * y.low) + sum as a 32-bit value.
+   *
+   * The accumulation is done in uint32_t so that a result outside the signed 32-bit
+   * range wraps modulo 2^32 instead of overflowing a signed int; for every input
+   * that does not overflow, the returned bits are the same as with signed addition.
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLADX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    const q31_t xLow  = ((q31_t)x << 16) >> 16;
+    const q31_t xHigh = ((q31_t)x      ) >> 16;
+    const q31_t yLow  = ((q31_t)y << 16) >> 16;
+    const q31_t yHigh = ((q31_t)y      ) >> 16;
+
+    return ((uint32_t)(xLow * yHigh) + (uint32_t)(xHigh * yLow) + sum);
+  }
+
+
+  /*
+   * @brief C custom defined SMLSDX: sum + x.lo * y.hi - x.hi * y.lo (signed halfwords).
+   *        The accumulation wraps modulo 2^32.
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLSDX(uint32_t x, uint32_t y, uint32_t sum)
+  {
+    const q31_t loTimesHi = (((q31_t)x << 16) >> 16) * (((q31_t)y) >> 16);
+    const q31_t hiTimesLo = (((q31_t)x) >> 16) * (((q31_t)y << 16) >> 16);
+
+    /* Accumulate as unsigned: adding sum in signed q31_t can overflow, which is undefined in C. */
+    return ((uint32_t)loTimesHi - (uint32_t)hiTimesLo + sum);
+  }
+
+
+  /*
+   * @brief C custom defined SMLALD: 64-bit sum + x.lo * y.lo + x.hi * y.hi (signed halfwords).
+   *        The accumulation wraps modulo 2^64.
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALD(uint32_t x, uint32_t y, uint64_t sum)
+  {
+    const q31_t loProd = (((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16);
+    const q31_t hiProd = (((q31_t)x) >> 16) * (((q31_t)y) >> 16);
+
+    /* Widen each product to 64 bits BEFORE adding: loProd + hiProd can reach 2^31,
+     * which overflows a 32-bit q31_t and would corrupt the accumulated value. */
+    return (sum + (uint64_t)(q63_t)loProd + (uint64_t)(q63_t)hiProd);
+  }
+
+
+  /*
+   * @brief C custom defined SMLALDX: 64-bit sum + x.lo * y.hi + x.hi * y.lo (signed halfwords).
+   *        The accumulation wraps modulo 2^64.
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALDX(uint32_t x, uint32_t y, uint64_t sum)
+  {
+    const q31_t loTimesHi = (((q31_t)x << 16) >> 16) * (((q31_t)y) >> 16);
+    const q31_t hiTimesLo = (((q31_t)x) >> 16) * (((q31_t)y << 16) >> 16);
+
+    /* Widen each product to 64 bits BEFORE adding: their sum can reach 2^31,
+     * which overflows a 32-bit q31_t and would corrupt the accumulated value. */
+    return (sum + (uint64_t)(q63_t)loTimesHi + (uint64_t)(q63_t)hiTimesLo);
+  }
+
+
+  /*
+   * @brief C custom defined SMUAD: x.lo * y.lo + x.hi * y.hi (signed halfwords), wrapping mod 2^32.
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUAD(uint32_t x, uint32_t y)
+  {
+    const q31_t loProd = (((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16);
+    const q31_t hiProd = (((q31_t)x) >> 16) * (((q31_t)y) >> 16);
+    /* Add as unsigned: the signed sum overflows (undefined in C) when all halfwords are 0x8000. */
+    return ((uint32_t)loProd + (uint32_t)hiProd);
+  }
+
+
+  /*
+   * @brief C custom defined SMUSD: x.lo * y.lo minus x.hi * y.hi (signed halfwords).
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSD(uint32_t x, uint32_t y)
+  {
+    const q31_t loProd = (((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16);
+    const q31_t hiProd = (((q31_t)x) >> 16) * (((q31_t)y) >> 16);
+
+    return ((uint32_t)(loProd - hiProd));
+  }
+
+
+  /*
+   * @brief C custom defined SXTB16: extends bytes x[7:0] and x[23:16] into the low and high halfwords.
+   */
+  __STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t x)
+  {
+    const q31_t lowHalf  = (((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF;
+    const q31_t highHalf = (((q31_t)x <<  8) >>  8) & (q31_t)0xFFFF0000;
+    return ((uint32_t)(lowHalf | highHalf));
+  }
+
+  /*
+   * @brief C custom defined SMMLA: sum + the upper 32 bits of the 64-bit product x * y.
+   *        The addition wraps modulo 2^32.
+   */
+  __STATIC_FORCEINLINE int32_t __SMMLA(int32_t x, int32_t y, int32_t sum)
+  {
+    const int32_t highWord = (int32_t)(((int64_t)x * y) >> 32);
+    /* Add as unsigned so an out-of-range result wraps instead of being signed-overflow UB. */
+    return ((int32_t)((uint32_t)sum + (uint32_t)highWord));
+  }
+
+#endif /* !defined (RISCV_MATH_DSP) */
+
+
+  /**
+   * @brief Instance structure for the Q7 FIR filter (tap count plus state and coefficient buffer pointers).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;        /**< Number of filter coefficients (taps) in the filter. */
+          q7_t *pState;            /**< Points to the state variable array, of length numTaps+blockSize-1. */
+    const q7_t *pCoeffs;           /**< Points to the coefficient array, of length numTaps; read-only through this pointer. */
+  } riscv_fir_instance_q7;
+
+  /**
+   * @brief Instance structure for the Q15 FIR filter (tap count plus state and coefficient buffer pointers).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;         /**< Number of filter coefficients (taps) in the filter. */
+          q15_t *pState;            /**< Points to the state variable array, of length numTaps+blockSize-1. */
+    const q15_t *pCoeffs;           /**< Points to the coefficient array, of length numTaps; read-only through this pointer. */
+  } riscv_fir_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR filter (tap count plus state and coefficient buffer pointers).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;         /**< Number of filter coefficients (taps) in the filter. */
+          q31_t *pState;            /**< Points to the state variable array, of length numTaps+blockSize-1. */
+    const q31_t *pCoeffs;           /**< Points to the coefficient array, of length numTaps; read-only through this pointer. */
+  } riscv_fir_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point (float32_t) FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< Number of filter coefficients (taps) in the filter. */
+          float32_t *pState;    /**< Points to the state variable array, of length numTaps+blockSize-1. */
+    const float32_t *pCoeffs;   /**< Points to the coefficient array, of length numTaps; read-only through this pointer. */
+  } riscv_fir_instance_f32;
+
+  /**
+   * @brief Processing function for the Q7 FIR filter.
+   * @param[in]  S          points to an instance of the Q7 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_q7(
+  const riscv_fir_instance_q7 * S,
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q7 FIR filter.
+   * @param[in,out] S          points to an instance of the Q7 FIR structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed.
+   */
+  void riscv_fir_init_q7(
+        riscv_fir_instance_q7 * S,
+        uint16_t numTaps,
+  const q7_t * pCoeffs,
+        q7_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q15 FIR filter.
+   * @param[in]  S          points to an instance of the Q15 FIR structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_q15(
+  const riscv_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q15 FIR filter (fast version).
+   * @param[in]  S          points to an instance of the Q15 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_fast_q15(
+  const riscv_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q15 FIR filter.
+   * @param[in,out] S          points to an instance of the Q15 FIR filter structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter. Must be even and greater than or equal to 4.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   * @return     The function returns either
+   * <code>RISCV_MATH_SUCCESS</code> if initialization was successful or
+   * <code>RISCV_MATH_ARGUMENT_ERROR</code> if <code>numTaps</code> is not a supported value.
+   */
+  riscv_status riscv_fir_init_q15(
+        riscv_fir_instance_q15 * S,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 FIR filter.
+   * @param[in]  S          points to an instance of the Q31 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_q31(
+  const riscv_fir_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 FIR filter (fast version).
+   * @param[in]  S          points to an instance of the Q31 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_fast_q31(
+  const riscv_fir_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q31 FIR filter.
+   * @param[in,out] S          points to an instance of the Q31 FIR structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   */
+  void riscv_fir_init_q31(
+        riscv_fir_instance_q31 * S,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the floating-point FIR filter.
+   * @param[in]  S          points to an instance of the floating-point FIR structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_f32(
+  const riscv_fir_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the floating-point FIR filter.
+   * @param[in,out] S          points to an instance of the floating-point FIR filter structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   */
+  void riscv_fir_init_f32(
+        riscv_fir_instance_f32 * S,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Instance structure for the Q15 Biquad cascade filter (direct form 1, per the "df1" type name).
+   */
+  typedef struct
+  {
+          int8_t numStages;        /**< Number of 2nd order stages in the filter; overall order is 2*numStages. NOTE(review): signed here, but riscv_biquad_cascade_df1_init_q15() takes uint8_t. */
+          q15_t *pState;           /**< Points to the state array, of length 4*numStages. */
+    const q15_t *pCoeffs;          /**< Points to the coefficient array, of length 5*numStages; read-only through this pointer. */
+          int8_t postShift;        /**< Additional shift, in bits, applied to each output sample. */
+  } riscv_biquad_casd_df1_inst_q15;
+
+  /**
+   * @brief Instance structure for the Q31 Biquad cascade filter (direct form 1, per the "df1" type name).
+   */
+  typedef struct
+  {
+          uint32_t numStages;      /**< Number of 2nd order stages in the filter; overall order is 2*numStages. */
+          q31_t *pState;           /**< Points to the state array, of length 4*numStages. */
+    const q31_t *pCoeffs;          /**< Points to the coefficient array, of length 5*numStages; read-only through this pointer. */
+          uint8_t postShift;       /**< Additional shift, in bits, applied to each output sample. NOTE(review): unsigned here, but riscv_biquad_cascade_df1_init_q31() takes int8_t. */
+  } riscv_biquad_casd_df1_inst_q31;
+
+  /**
+   * @brief Instance structure for the floating-point Biquad cascade filter (no postShift field, unlike the Q15/Q31 variants).
+   */
+  typedef struct
+  {
+          uint32_t numStages;      /**< Number of 2nd order stages in the filter; overall order is 2*numStages. */
+          float32_t *pState;       /**< Points to the state array, of length 4*numStages. */
+    const float32_t *pCoeffs;      /**< Points to the coefficient array, of length 5*numStages; read-only through this pointer. */
+  } riscv_biquad_casd_df1_inst_f32;
+
+  /**
+   * @brief Processing function for the Q15 Biquad cascade filter.
+   * @param[in]  S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cascade_df1_q15(
+  const riscv_biquad_casd_df1_inst_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q15 Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     postShift  Shift to be applied to the output. Varies according to the coefficients format
+   */
+  void riscv_biquad_cascade_df1_init_q15(
+        riscv_biquad_casd_df1_inst_q15 * S,
+        uint8_t numStages,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        int8_t postShift);
+
+  /**
+   * @brief Fast but less precise processing function for the Q15 Biquad cascade filter for RISC-V Core with DSP enabled.
+   * @param[in]  S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cascade_df1_fast_q15(
+  const riscv_biquad_casd_df1_inst_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 Biquad cascade filter
+   * @param[in]  S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cascade_df1_q31(
+  const riscv_biquad_casd_df1_inst_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Fast but less precise processing function for the Q31 Biquad cascade filter for RISC-V Core with DSP enabled.
+   * @param[in]  S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cascade_df1_fast_q31(
+  const riscv_biquad_casd_df1_inst_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q31 Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     postShift  Shift to be applied to the output. Varies according to the coefficients format
+   */
+  void riscv_biquad_cascade_df1_init_q31(
+        riscv_biquad_casd_df1_inst_q31 * S,
+        uint8_t numStages,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        int8_t postShift);
+
+  /**
+   * @brief Processing function for the floating-point Biquad cascade filter.
+   * @param[in]  S          points to an instance of the floating-point Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cascade_df1_f32(
+  const riscv_biquad_casd_df1_inst_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the floating-point Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the floating-point Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void riscv_biquad_cascade_df1_init_f32(
+        riscv_biquad_casd_df1_inst_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+  /**
+   * @brief Instance structure for a single-precision floating-point (float32_t) matrix.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< Number of rows of the matrix.     */
+    uint16_t numCols;     /**< Number of columns of the matrix.  */
+    float32_t *pData;     /**< Points to the data of the matrix. */
+  } riscv_matrix_instance_f32;
+
+
+  /**
+   * @brief Instance structure for a double-precision floating-point (float64_t) matrix.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< Number of rows of the matrix.     */
+    uint16_t numCols;     /**< Number of columns of the matrix.  */
+    float64_t *pData;     /**< Points to the data of the matrix. */
+  } riscv_matrix_instance_f64;
+
+  /**
+   * @brief Instance structure for a Q15 matrix (dimensions plus a pointer to the element data).
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< Number of rows of the matrix.     */
+    uint16_t numCols;     /**< Number of columns of the matrix.  */
+    q15_t *pData;         /**< Points to the data of the matrix. */
+  } riscv_matrix_instance_q15;
+
+  /**
+   * @brief Instance structure for a Q31 matrix (dimensions plus a pointer to the element data).
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< Number of rows of the matrix.     */
+    uint16_t numCols;     /**< Number of columns of the matrix.  */
+    q31_t *pData;         /**< Points to the data of the matrix. */
+  } riscv_matrix_instance_q31;
+
+  /**
+   * @brief Floating-point matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_add_f32(
+  const riscv_matrix_instance_f32 * pSrcA,
+  const riscv_matrix_instance_f32 * pSrcB,
+        riscv_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix addition.
+   * @param[in]   pSrcA  points to the first input matrix structure
+   * @param[in]   pSrcB  points to the second input matrix structure
+   * @param[out]  pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_add_q15(
+  const riscv_matrix_instance_q15 * pSrcA,
+  const riscv_matrix_instance_q15 * pSrcB,
+        riscv_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_add_q31(
+  const riscv_matrix_instance_q31 * pSrcA,
+  const riscv_matrix_instance_q31 * pSrcB,
+        riscv_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_cmplx_mult_f32(
+  const riscv_matrix_instance_f32 * pSrcA,
+  const riscv_matrix_instance_f32 * pSrcB,
+        riscv_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15, complex, matrix multiplication.
+   * @param[in]  pSrcA     points to the first input matrix structure
+   * @param[in]  pSrcB     points to the second input matrix structure
+   * @param[out] pDst      points to output matrix structure
+   * @param[in]  pScratch  points to a q15_t scratch buffer (presumably for intermediate results; required size not stated here)
+   * @return     <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_cmplx_mult_q15(
+  const riscv_matrix_instance_q15 * pSrcA,
+  const riscv_matrix_instance_q15 * pSrcB,
+        riscv_matrix_instance_q15 * pDst,
+        q15_t * pScratch);
+
+  /**
+   * @brief Q31, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_cmplx_mult_q31(
+  const riscv_matrix_instance_q31 * pSrcA,
+  const riscv_matrix_instance_q31 * pSrcB,
+        riscv_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>RISCV_MATH_SIZE_MISMATCH</code>
+   * or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_trans_f32(
+  const riscv_matrix_instance_f32 * pSrc,
+        riscv_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>RISCV_MATH_SIZE_MISMATCH</code>
+   * or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_trans_q15(
+  const riscv_matrix_instance_q15 * pSrc,
+        riscv_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>RISCV_MATH_SIZE_MISMATCH</code>
+   * or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_trans_q31(
+  const riscv_matrix_instance_q31 * pSrc,
+        riscv_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_mult_f32(
+  const riscv_matrix_instance_f32 * pSrcA,
+  const riscv_matrix_instance_f32 * pSrcB,
+        riscv_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix multiplication
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_mult_q15(
+  const riscv_matrix_instance_q15 * pSrcA,
+  const riscv_matrix_instance_q15 * pSrcB,
+        riscv_matrix_instance_q15 * pDst,
+        q15_t * pState);
+
+  /**
+   * @brief Q15 matrix multiplication (fast variant) for RISC-V Core with DSP enabled
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_mult_fast_q15(
+  const riscv_matrix_instance_q15 * pSrcA,
+  const riscv_matrix_instance_q15 * pSrcB,
+        riscv_matrix_instance_q15 * pDst,
+        q15_t * pState);
+
+  /**
+   * @brief Q31 matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_mult_q31(
+  const riscv_matrix_instance_q31 * pSrcA,
+  const riscv_matrix_instance_q31 * pSrcB,
+        riscv_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Q31 matrix multiplication (fast variant) for RISC-V Core with DSP enabled
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_mult_fast_q31(
+  const riscv_matrix_instance_q31 * pSrcA,
+  const riscv_matrix_instance_q31 * pSrcB,
+        riscv_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_sub_f32(
+  const riscv_matrix_instance_f32 * pSrcA,
+  const riscv_matrix_instance_f32 * pSrcB,
+        riscv_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_sub_q15(
+  const riscv_matrix_instance_q15 * pSrcA,
+  const riscv_matrix_instance_q15 * pSrcB,
+        riscv_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_sub_q31(
+  const riscv_matrix_instance_q31 * pSrcA,
+  const riscv_matrix_instance_q31 * pSrcB,
+        riscv_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix scaling.
+   * @param[in]  pSrc   points to the input matrix
+   * @param[in]  scale  scale factor
+   * @param[out] pDst   points to the output matrix
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_scale_f32(
+  const riscv_matrix_instance_f32 * pSrc,
+        float32_t scale,
+        riscv_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix scaling.
+   * @param[in]  pSrc        points to input matrix
+   * @param[in]  scaleFract  fractional portion of the scale factor
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to output matrix
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_scale_q15(
+  const riscv_matrix_instance_q15 * pSrc,
+        q15_t scaleFract,
+        int32_t shift,
+        riscv_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix scaling.
+   * @param[in]  pSrc        points to input matrix
+   * @param[in]  scaleFract  fractional portion of the scale factor
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to output matrix structure
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+riscv_status riscv_mat_scale_q31(
+  const riscv_matrix_instance_q31 * pSrc,
+        q31_t scaleFract,
+        int32_t shift,
+        riscv_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief  Q31 matrix initialization.
+   * @param[in,out] S         points to an instance of the Q31 matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void riscv_mat_init_q31(
+        riscv_matrix_instance_q31 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        q31_t * pData);
+
+  /**
+   * @brief  Q15 matrix initialization.
+   * @param[in,out] S         points to an instance of the Q15 matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void riscv_mat_init_q15(
+        riscv_matrix_instance_q15 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        q15_t * pData);
+
+  /**
+   * @brief  Floating-point matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void riscv_mat_init_f32(
+        riscv_matrix_instance_f32 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        float32_t * pData);
+
+
+  /**
+   * @brief Instance structure for the Q15 PID Control. The layout of the A1/A2 gains depends on RISCV_MATH_DSP.
+   */
+  typedef struct
+  {
+          q15_t A0;           /**< The derived gain, A0 = Kp + Ki + Kd. */
+#if !defined (RISCV_MATH_DSP)
+          q15_t A1;           /**< Derived gain; presumably A1 = -Kp - 2Kd as in the Q31/f32 instances - TODO confirm. */
+          q15_t A2;           /**< Derived gain; presumably A2 = Kd as in the Q31/f32 instances - TODO confirm. */
+#else
+          q31_t A1;           /**< The derived gains -Kp - 2Kd | Kd held in one 32-bit word (presumably two packed Q15 halves - TODO confirm). */
+#endif
+          q15_t state[3];     /**< The state array of length 3. */
+          q15_t Kp;           /**< The proportional gain. */
+          q15_t Ki;           /**< The integral gain. */
+          q15_t Kd;           /**< The derivative gain. */
+  } riscv_pid_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 PID Control (derived gains, state, and the three user gains).
+   */
+  typedef struct
+  {
+          q31_t A0;            /**< The derived gain, A0 = Kp + Ki + Kd. */
+          q31_t A1;            /**< The derived gain, A1 = -Kp - 2Kd. */
+          q31_t A2;            /**< The derived gain, A2 = Kd. */
+          q31_t state[3];      /**< The state array of length 3. */
+          q31_t Kp;            /**< The proportional gain. */
+          q31_t Ki;            /**< The integral gain. */
+          q31_t Kd;            /**< The derivative gain. */
+  } riscv_pid_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point PID Control.
+   */
+  typedef struct
+  {
+          float32_t A0;          /**< The derived gain, A0 = Kp + Ki + Kd . */
+          float32_t A1;          /**< The derived gain, A1 = -Kp - 2Kd. */
+          float32_t A2;          /**< The derived gain, A2 = Kd . */
+          float32_t state[3];    /**< The state array of length 3. */
+          float32_t Kp;          /**< The proportional gain. */
+          float32_t Ki;          /**< The integral gain. */
+          float32_t Kd;          /**< The derivative gain. */
+  } riscv_pid_instance_f32;
+
+
+
+  /**
+   * @brief  Initialization function for the floating-point PID Control.
+   * @param[in,out] S               points to an instance of the PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
+   */
+  void riscv_pid_init_f32(
+        riscv_pid_instance_f32 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the floating-point PID Control.
+   * @param[in,out] S  points to an instance of the floating-point PID Control structure.
+   */
+  void riscv_pid_reset_f32(
+        riscv_pid_instance_f32 * S);
+
+
+  /**
+   * @brief  Initialization function for the Q31 PID Control.
+   * @param[in,out] S               points to an instance of the Q31 PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state, 1 = reset the state.
+   */
+  void riscv_pid_init_q31(
+        riscv_pid_instance_q31 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the Q31 PID Control.
+   * @param[in,out] S   points to an instance of the Q31 PID Control structure
+   */
+
+  void riscv_pid_reset_q31(
+        riscv_pid_instance_q31 * S);
+
+
+  /**
+   * @brief  Initialization function for the Q15 PID Control.
+   * @param[in,out] S               points to an instance of the Q15 PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
+   */
+  void riscv_pid_init_q15(
+        riscv_pid_instance_q15 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the Q15 PID Control.
+   * @param[in,out] S  points to an instance of the Q15 PID Control structure.
+   */
+  void riscv_pid_reset_q15(
+        riscv_pid_instance_q15 * S);
+
+
+  /**
+   * @brief Instance structure for the floating-point Linear Interpolate function.
+   */
+  typedef struct
+  {
+          uint32_t nValues;           /**< nValues */
+          float32_t x1;               /**< x1 */
+          float32_t xSpacing;         /**< xSpacing */
+          float32_t *pYData;          /**< pointer to the table of Y values */
+  } riscv_linear_interp_instance_f32;
+
+  /**
+   * @brief Instance structure for the floating-point bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          float32_t *pData;   /**< points to the data table. */
+  } riscv_bilinear_interp_instance_f32;
+
+   /**
+   * @brief Instance structure for the Q31 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q31_t *pData;       /**< points to the data table. */
+  } riscv_bilinear_interp_instance_q31;
+
+   /**
+   * @brief Instance structure for the Q15 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q15_t *pData;       /**< points to the data table. */
+  } riscv_bilinear_interp_instance_q15;
+
+   /**
+   * @brief Instance structure for the Q7 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q7_t *pData;        /**< points to the data table. */
+  } riscv_bilinear_interp_instance_q7;
+
+
+  /**
+   * @brief Q7 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_mult_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_mult_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_mult_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_mult_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Radix-2 Q15 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q15_t *pTwiddle;                 /**< points to the Sin twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } riscv_cfft_radix2_instance_q15;
+
+/* Deprecated */
+  riscv_status riscv_cfft_radix2_init_q15(
+        riscv_cfft_radix2_instance_q15 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void riscv_cfft_radix2_q15(
+  const riscv_cfft_radix2_instance_q15 * S,
+        q15_t * pSrc);
+
+
+  /**
+   * @brief Instance structure for the Radix-4 Q15 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q15_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } riscv_cfft_radix4_instance_q15;
+
+/* Deprecated */
+  riscv_status riscv_cfft_radix4_init_q15(
+        riscv_cfft_radix4_instance_q15 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void riscv_cfft_radix4_q15(
+  const riscv_cfft_radix4_instance_q15 * S,
+        q15_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Radix-2 Q31 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q31_t *pTwiddle;                 /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } riscv_cfft_radix2_instance_q31;
+
+/* Deprecated */
+  riscv_status riscv_cfft_radix2_init_q31(
+        riscv_cfft_radix2_instance_q31 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void riscv_cfft_radix2_q31(
+  const riscv_cfft_radix2_instance_q31 * S,
+        q31_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Radix-4 Q31 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q31_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } riscv_cfft_radix4_instance_q31;
+
+/* Deprecated */
+  void riscv_cfft_radix4_q31(
+  const riscv_cfft_radix4_instance_q31 * S,
+        q31_t * pSrc);
+
+/* Deprecated */
+  riscv_status riscv_cfft_radix4_init_q31(
+        riscv_cfft_radix4_instance_q31 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the Radix-2 floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const float32_t *pTwiddle;               /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+          float32_t onebyfftLen;             /**< value of 1/fftLen. */
+  } riscv_cfft_radix2_instance_f32;
+
+/* Deprecated */
+  riscv_status riscv_cfft_radix2_init_f32(
+        riscv_cfft_radix2_instance_f32 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void riscv_cfft_radix2_f32(
+  const riscv_cfft_radix2_instance_f32 * S,
+        float32_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Radix-4 floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const float32_t *pTwiddle;               /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+          float32_t onebyfftLen;             /**< value of 1/fftLen. */
+  } riscv_cfft_radix4_instance_f32;
+
+/* Deprecated */
+  riscv_status riscv_cfft_radix4_init_f32(
+        riscv_cfft_radix4_instance_f32 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void riscv_cfft_radix4_f32(
+  const riscv_cfft_radix4_instance_f32 * S,
+        float32_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Q15 CFFT/CIFFT function (the instance type taken by riscv_cfft_q15).
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const q15_t *pTwiddle;             /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+  } riscv_cfft_instance_q15;
+
+void riscv_cfft_q15(
+    const riscv_cfft_instance_q15 * S,
+          q15_t * p1,
+          uint8_t ifftFlag,
+          uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the Q31 CFFT/CIFFT function (the instance type taken by riscv_cfft_q31).
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const q31_t *pTwiddle;             /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+  } riscv_cfft_instance_q31;
+
+void riscv_cfft_q31(
+    const riscv_cfft_instance_q31 * S,
+          q31_t * p1,
+          uint8_t ifftFlag,
+          uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const float32_t *pTwiddle;         /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+  } riscv_cfft_instance_f32;
+
+  void riscv_cfft_f32(
+  const riscv_cfft_instance_f32 * S,
+        float32_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the Q15 RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                      /**< length of the real FFT. */
+          uint8_t ifftFlagR;                        /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                  /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;               /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const q15_t *pTwiddleAReal;                     /**< points to the real twiddle factor table. */
+    const q15_t *pTwiddleBReal;                     /**< points to the imag twiddle factor table. */
+    const riscv_cfft_instance_q15 *pCfft;       /**< points to the complex FFT instance. */
+  } riscv_rfft_instance_q15;
+
+  riscv_status riscv_rfft_init_q15(
+        riscv_rfft_instance_q15 * S,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void riscv_rfft_q15(
+  const riscv_rfft_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst);
+
+  /**
+   * @brief Instance structure for the Q31 RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                        /**< length of the real FFT. */
+          uint8_t ifftFlagR;                          /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                    /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;                 /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const q31_t *pTwiddleAReal;                       /**< points to the real twiddle factor table. */
+    const q31_t *pTwiddleBReal;                       /**< points to the imag twiddle factor table. */
+    const riscv_cfft_instance_q31 *pCfft;         /**< points to the complex FFT instance. */
+  } riscv_rfft_instance_q31;
+
+  riscv_status riscv_rfft_init_q31(
+        riscv_rfft_instance_q31 * S,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void riscv_rfft_q31(
+  const riscv_rfft_instance_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst);
+
+  /**
+   * @brief Instance structure for the floating-point RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                        /**< length of the real FFT. */
+          uint16_t fftLenBy2;                         /**< length of the complex FFT. */
+          uint8_t ifftFlagR;                          /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                    /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;                     /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const float32_t *pTwiddleAReal;                   /**< points to the real twiddle factor table. */
+    const float32_t *pTwiddleBReal;                   /**< points to the imag twiddle factor table. */
+          riscv_cfft_radix4_instance_f32 *pCfft;        /**< points to the complex FFT instance. */
+  } riscv_rfft_instance_f32;
+
+  riscv_status riscv_rfft_init_f32(
+        riscv_rfft_instance_f32 * S,
+        riscv_cfft_radix4_instance_f32 * S_CFFT,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void riscv_rfft_f32(
+  const riscv_rfft_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst);
+
+  /**
+   * @brief Instance structure for the fast floating-point RFFT/RIFFT function (the instance type taken by riscv_rfft_fast_f32).
+   */
+typedef struct
+  {
+          riscv_cfft_instance_f32 Sint;      /**< Internal CFFT structure. */
+          uint16_t fftLenRFFT;             /**< length of the real sequence */
+    const float32_t * pTwiddleRFFT;        /**< Twiddle factors real stage  */
+  } riscv_rfft_fast_instance_f32 ;
+
+riscv_status riscv_rfft_fast_init_f32 (
+         riscv_rfft_fast_instance_f32 * S,
+         uint16_t fftLen);
+
+riscv_status riscv_rfft_32_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S );
+
+riscv_status riscv_rfft_64_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S );
+
+riscv_status riscv_rfft_128_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S );
+
+riscv_status riscv_rfft_256_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S );
+
+riscv_status riscv_rfft_512_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S );
+
+riscv_status riscv_rfft_1024_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S );
+
+riscv_status riscv_rfft_2048_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S );
+
+riscv_status riscv_rfft_4096_fast_init_f32 ( riscv_rfft_fast_instance_f32 * S );
+
+
+  void riscv_rfft_fast_f32(
+        riscv_rfft_fast_instance_f32 * S,
+        float32_t * p, float32_t * pOut,
+        uint8_t ifftFlag);
+
+  /**
+   * @brief Instance structure for the floating-point DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          float32_t normalize;                 /**< normalizing factor. */
+    const float32_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const float32_t *pCosFactor;               /**< points to the cosFactor table. */
+          riscv_rfft_instance_f32 *pRfft;        /**< points to the real FFT instance. */
+          riscv_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */
+  } riscv_dct4_instance_f32;
+
+
+  /**
+   * @brief  Initialization function for the floating-point DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of floating-point DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of floating-point RFFT/RIFFT structure.
+   * @param[in]     S_CFFT     points to an instance of floating-point CFFT/CIFFT structure.
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      riscv_status function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
+   */
+  riscv_status riscv_dct4_init_f32(
+        riscv_dct4_instance_f32 * S,
+        riscv_rfft_instance_f32 * S_RFFT,
+        riscv_cfft_radix4_instance_f32 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        float32_t normalize);
+
+
+  /**
+   * @brief Processing function for the floating-point DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the floating-point DCT4/IDCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void riscv_dct4_f32(
+  const riscv_dct4_instance_f32 * S,
+        float32_t * pState,
+        float32_t * pInlineBuffer);
+
+
+  /**
+   * @brief Instance structure for the Q31 DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          q31_t normalize;                     /**< normalizing factor. */
+    const q31_t *pTwiddle;                     /**< points to the twiddle factor table. */
+    const q31_t *pCosFactor;                   /**< points to the cosFactor table. */
+          riscv_rfft_instance_q31 *pRfft;        /**< points to the real FFT instance. */
+          riscv_cfft_radix4_instance_q31 *pCfft; /**< points to the complex FFT instance. */
+  } riscv_dct4_instance_q31;
+
+
+  /**
+   * @brief  Initialization function for the Q31 DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of Q31 DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of Q31 RFFT/RIFFT structure
+   * @param[in]     S_CFFT     points to an instance of Q31 CFFT/CIFFT structure
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      riscv_status function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
+   */
+  riscv_status riscv_dct4_init_q31(
+        riscv_dct4_instance_q31 * S,
+        riscv_rfft_instance_q31 * S_RFFT,
+        riscv_cfft_radix4_instance_q31 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        q31_t normalize);
+
+
+  /**
+   * @brief Processing function for the Q31 DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the Q31 DCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void riscv_dct4_q31(
+  const riscv_dct4_instance_q31 * S,
+        q31_t * pState,
+        q31_t * pInlineBuffer);
+
+
+  /**
+   * @brief Instance structure for the Q15 DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          q15_t normalize;                     /**< normalizing factor. */
+    const q15_t *pTwiddle;                     /**< points to the twiddle factor table. */
+    const q15_t *pCosFactor;                   /**< points to the cosFactor table. */
+          riscv_rfft_instance_q15 *pRfft;        /**< points to the real FFT instance. */
+          riscv_cfft_radix4_instance_q15 *pCfft; /**< points to the complex FFT instance. */
+  } riscv_dct4_instance_q15;
+
+
+  /**
+   * @brief  Initialization function for the Q15 DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of Q15 DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of Q15 RFFT/RIFFT structure.
+   * @param[in]     S_CFFT     points to an instance of Q15 CFFT/CIFFT structure.
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      riscv_status function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
+   */
+  riscv_status riscv_dct4_init_q15(
+        riscv_dct4_instance_q15 * S,
+        riscv_rfft_instance_q15 * S_RFFT,
+        riscv_cfft_radix4_instance_q15 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        q15_t normalize);
+
+
+  /**
+   * @brief Processing function for the Q15 DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the Q15 DCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void riscv_dct4_q15(
+  const riscv_dct4_instance_q15 * S,
+        q15_t * pState,
+        q15_t * pInlineBuffer);
+
+
+  /**
+   * @brief Floating-point vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_add_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q7 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_add_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_add_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_add_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_sub_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q7 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_sub_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_sub_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_sub_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a floating-point vector by a scalar.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  scale      scale factor to be applied
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_scale_f32(
+  const float32_t * pSrc,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a Q7 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void riscv_scale_q7(
+  const q7_t * pSrc,
+        q7_t scaleFract,
+        int8_t shift,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a Q15 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void riscv_scale_q15(
+  const q15_t * pSrc,
+        q15_t scaleFract,
+        int8_t shift,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a Q31 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void riscv_scale_q31(
+  const q31_t * pSrc,
+        q31_t scaleFract,
+        int8_t shift,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q7 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_abs_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_abs_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_abs_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void riscv_abs_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Dot product of floating-point vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     points to the float32_t location where the dot product is written
+   */
+  void riscv_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t blockSize,
+        float32_t * result);
+
+
+  /**
+   * @brief Dot product of Q7 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     points to the q31_t location where the dot product is written
+   */
+  void riscv_dot_prod_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        uint32_t blockSize,
+        q31_t * result);
+
+
+  /**
+   * @brief Dot product of Q15 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     points to the q63_t location where the dot product is written
+   */
+  void riscv_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result);
+
+
+  /**
+   * @brief Dot product of Q31 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     points to the q63_t location where the dot product is written
+   */
+  void riscv_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result);
+
+
+  /**
+   * @brief  Shifts the elements of a Q7 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_shift_q7(
+  const q7_t * pSrc,
+        int8_t shiftBits,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Shifts the elements of a Q15 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_shift_q15(
+  const q15_t * pSrc,
+        int8_t shiftBits,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Shifts the elements of a Q31 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_shift_q31(
+  const q31_t * pSrc,
+        int8_t shiftBits,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_offset_f32(
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_offset_q7(
+  const q7_t * pSrc,
+        q7_t offset,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_offset_q15(
+  const q15_t * pSrc,
+        q15_t offset,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_offset_q31(
+  const q31_t * pSrc,
+        q31_t offset,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_negate_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_negate_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_negate_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void riscv_negate_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+  void riscv_copy_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+  void riscv_copy_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+  void riscv_copy_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+  void riscv_copy_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a constant value into a floating-point vector.
+   * @param[in]  value      constant value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+  void riscv_fill_f32(
+        float32_t value,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a constant value into a Q7 vector.
+   * @param[in]  value      constant value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+  void riscv_fill_q7(
+        q7_t value,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a constant value into a Q15 vector.
+   * @param[in]  value      constant value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+  void riscv_fill_q15(
+        q15_t value,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a constant value into a Q31 vector.
+   * @param[in]  value      constant value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+  void riscv_fill_q31(
+        q31_t value,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+ * @brief Convolution of floating-point sequences.
+ * @param[in]  pSrcA    points to the first input sequence.
+ * @param[in]  srcALen  length of the first input sequence.
+ * @param[in]  pSrcB    points to the second input sequence.
+ * @param[in]  srcBLen  length of the second input sequence.
+ * @param[out] pDst     points to the location where the output result is written.  Length srcALen+srcBLen-1.
+ */
+  void riscv_conv_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences using caller-supplied scratch buffers.
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data.  Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to a scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to a scratch buffer of size min(srcALen, srcBLen).
+   */
+  void riscv_conv_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+/**
+ * @brief Convolution of Q15 sequences.
+ * @param[in]  pSrcA    points to the first input sequence.
+ * @param[in]  srcALen  length of the first input sequence.
+ * @param[in]  pSrcB    points to the second input sequence.
+ * @param[in]  srcBLen  length of the second input sequence.
+ * @param[out] pDst     points to the location where the output result is written.  Length srcALen+srcBLen-1.
+ */
+  void riscv_conv_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences (fast version) for RISC-V Core with DSP enabled.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data.  Length srcALen+srcBLen-1.
+   */
+  void riscv_conv_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences (fast version, caller-supplied scratch buffers) for RISC-V Core with DSP enabled.
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data.  Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to a scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to a scratch buffer of size min(srcALen, srcBLen).
+   */
+  void riscv_conv_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Convolution of Q31 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data.  Length srcALen+srcBLen-1.
+   */
+  void riscv_conv_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q31 sequences (fast version) for RISC-V Core with DSP enabled.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data.  Length srcALen+srcBLen-1.
+   */
+  void riscv_conv_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+    /**
+   * @brief Convolution of Q7 sequences using caller-supplied scratch buffers.
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data.  Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to a scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to a scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   */
+  void riscv_conv_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Convolution of Q7 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data.  Length srcALen+srcBLen-1.
+   */
+  void riscv_conv_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst);
+
+
+  /**
+   * @brief Partial convolution of floating-point sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences using caller-supplied scratch buffers.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to a scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to a scratch buffer of size min(srcALen, srcBLen).
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences (fast version) for RISC-V Core with DSP enabled.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences (fast version, caller-supplied scratch buffers) for RISC-V Core with DSP enabled.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to a scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to a scratch buffer of size min(srcALen, srcBLen).
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Partial convolution of Q31 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q31 sequences (fast version) for RISC-V Core with DSP enabled.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q7 sequences using caller-supplied scratch buffers.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to a scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to a scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+/**
+   * @brief Partial convolution of Q7 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  RISCV_MATH_SUCCESS if the function completed correctly, or RISCV_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  riscv_status riscv_conv_partial_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR decimator.
+   */
+  typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const q15_t *pCoeffs;             /**< points to the coefficient array. The array is of length numTaps. */
+          q15_t *pState;              /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } riscv_fir_decimate_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR decimator.
+   */
+  typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const q31_t *pCoeffs;             /**< points to the coefficient array. The array is of length numTaps. */
+          q31_t *pState;              /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } riscv_fir_decimate_instance_q31;
+
+/**
+  @brief Instance structure for the floating-point FIR decimator.
+ */
+typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const float32_t *pCoeffs;         /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t *pState;          /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } riscv_fir_decimate_instance_f32;
+
+
+/**
+  @brief         Processing function for the floating-point FIR decimator.
+  @param[in]     S         points to an instance of the floating-point FIR decimator structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of input samples to process per call
+ */
+void riscv_fir_decimate_f32(
+  const riscv_fir_decimate_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+  @brief         Initialization function for the floating-point FIR decimator.
+  @param[in,out] S          points to an instance of the floating-point FIR decimator structure
+  @param[in]     numTaps    number of coefficients in the filter
+  @param[in]     M          decimation factor
+  @param[in]     pCoeffs    points to the filter coefficients
+  @param[in]     pState     points to the state buffer
+  @param[in]     blockSize  number of input samples to process per call
+  @return        execution status
+                   - \ref RISCV_MATH_SUCCESS      : Operation successful
+                   - \ref RISCV_MATH_LENGTH_ERROR : <code>blockSize</code> is not a multiple of <code>M</code>
+ */
+riscv_status riscv_fir_decimate_init_f32(
+        riscv_fir_decimate_instance_f32 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR decimator.
+   * @param[in]  S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void riscv_fir_decimate_q15(
+  const riscv_fir_decimate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR decimator (fast variant) for RISC-V Core with DSP enabled.
+   * @param[in]  S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void riscv_fir_decimate_fast_q15(
+  const riscv_fir_decimate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q15 FIR decimator.
+   * @param[in,out] S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]     numTaps    number of coefficients in the filter.
+   * @param[in]     M          decimation factor.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return    The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if
+   * <code>blockSize</code> is not a multiple of <code>M</code>.
+   */
+  riscv_status riscv_fir_decimate_init_q15(
+        riscv_fir_decimate_instance_q15 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR decimator.
+   * @param[in]  S          points to an instance of the Q31 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void riscv_fir_decimate_q31(
+  const riscv_fir_decimate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 FIR decimator (fast variant) for RISC-V Core with DSP enabled.
+   * @param[in]  S          points to an instance of the Q31 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void riscv_fir_decimate_fast_q31(
+  const riscv_fir_decimate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q31 FIR decimator.
+   * @param[in,out] S          points to an instance of the Q31 FIR decimator structure.
+   * @param[in]     numTaps    number of coefficients in the filter.
+   * @param[in]     M          decimation factor.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return    The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if
+   * <code>blockSize</code> is not a multiple of <code>M</code>.
+   */
+  riscv_status riscv_fir_decimate_init_q31(
+        riscv_fir_decimate_instance_q31 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR interpolator.
+   */
+  typedef struct
+  {
+        uint8_t L;                      /**< upsample factor. */
+        uint16_t phaseLength;           /**< length of each polyphase filter component. */
+  const q15_t *pCoeffs;                 /**< points to the coefficient array. The array is of length L*phaseLength. */
+        q15_t *pState;                  /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+  } riscv_fir_interpolate_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR interpolator.
+   */
+  typedef struct
+  {
+        uint8_t L;                      /**< upsample factor. */
+        uint16_t phaseLength;           /**< length of each polyphase filter component. */
+  const q31_t *pCoeffs;                 /**< points to the coefficient array. The array is of length L*phaseLength. */
+        q31_t *pState;                  /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+  } riscv_fir_interpolate_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point FIR interpolator.
+   */
+  typedef struct
+  {
+        uint8_t L;                     /**< upsample factor. */
+        uint16_t phaseLength;          /**< length of each polyphase filter component. */
+  const float32_t *pCoeffs;            /**< points to the coefficient array. The array is of length L*phaseLength. */
+        float32_t *pState;             /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+  } riscv_fir_interpolate_instance_f32;
+
+
+  /**
+   * @brief Processing function for the Q15 FIR interpolator.
+   * @param[in]  S          points to an instance of the Q15 FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void riscv_fir_interpolate_q15(
+  const riscv_fir_interpolate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q15 FIR interpolator.
+   * @param[in,out] S          points to an instance of the Q15 FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if
+   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
+   */
+  riscv_status riscv_fir_interpolate_init_q15(
+        riscv_fir_interpolate_instance_q15 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR interpolator.
+   * @param[in]  S          points to an instance of the Q31 FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void riscv_fir_interpolate_q31(
+  const riscv_fir_interpolate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q31 FIR interpolator.
+   * @param[in,out] S          points to an instance of the Q31 FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if
+   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
+   */
+  riscv_status riscv_fir_interpolate_init_q31(
+        riscv_fir_interpolate_instance_q31 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the floating-point FIR interpolator.
+   * @param[in]  S          points to an instance of the floating-point FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void riscv_fir_interpolate_f32(
+  const riscv_fir_interpolate_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the floating-point FIR interpolator.
+   * @param[in,out] S          points to an instance of the floating-point FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        The function returns RISCV_MATH_SUCCESS if initialization is successful or RISCV_MATH_LENGTH_ERROR if
+   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
+   */
+  riscv_status riscv_fir_interpolate_init_f32(
+        riscv_fir_interpolate_instance_f32 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the high precision Q31 Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;       /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          q63_t *pState;           /**< points to the array of state coefficients.  The array is of length 4*numStages. */
+    const q31_t *pCoeffs;          /**< points to the array of coefficients.  The array is of length 5*numStages. */
+          uint8_t postShift;       /**< additional shift, in bits, applied to each output sample. */
+  } riscv_biquad_cas_df1_32x64_ins_q31;
+
+
+  /** @brief Processing function for the high precision Q31 Biquad cascade filter.
+   * @param[in]  S          points to an instance of the high precision Q31 Biquad cascade filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cas_df1_32x64_q31(
+  const riscv_biquad_cas_df1_32x64_ins_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /** @brief Initialization function for the high precision Q31 Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the high precision Q31 Biquad cascade filter structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     postShift  shift to be applied to the output. Varies according to the coefficients format.
+   */
+  void riscv_biquad_cas_df1_32x64_init_q31(
+        riscv_biquad_cas_df1_32x64_ins_q31 * S,
+        uint8_t numStages,
+  const q31_t * pCoeffs,
+        q63_t * pState,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float32_t *pState;         /**< points to the array of state coefficients.  The array is of length 2*numStages. */
+    const float32_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } riscv_biquad_cascade_df2T_instance_f32;
+
+  /**
+   * @brief Instance structure for the stereo floating-point transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float32_t *pState;         /**< points to the array of state coefficients.  The array is of length 4*numStages. */
+    const float32_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } riscv_biquad_cascade_stereo_df2T_instance_f32;
+
+  /**
+   * @brief Instance structure for the 64-bit floating-point (float64_t) transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float64_t *pState;         /**< points to the array of state coefficients.  The array is of length 2*numStages. */
+          float64_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } riscv_biquad_cascade_df2T_instance_f64;
+
+
+  /**
+   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cascade_df2T_f32(
+  const riscv_biquad_cascade_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the stereo (2-channel) floating-point transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cascade_stereo_df2T_f32(
+  const riscv_biquad_cascade_stereo_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the 64-bit floating-point (float64_t) transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_biquad_cascade_df2T_f64(
+  const riscv_biquad_cascade_df2T_instance_f64 * S,
+        float64_t * pSrc,
+        float64_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void riscv_biquad_cascade_df2T_init_f32(
+        riscv_biquad_cascade_df2T_instance_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief  Initialization function for the stereo floating-point transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void riscv_biquad_cascade_stereo_df2T_init_f32(
+        riscv_biquad_cascade_stereo_df2T_instance_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief  Initialization function for the 64-bit floating-point (float64_t) transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void riscv_biquad_cascade_df2T_init_f64(
+        riscv_biquad_cascade_df2T_instance_f64 * S,
+        uint8_t numStages,
+        float64_t * pCoeffs,
+        float64_t * pState);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          q15_t *pState;                       /**< points to the state variable array. The array is of length numStages. */
+    const q15_t *pCoeffs;                      /**< points to the coefficient array. The array is of length numStages. */
+  } riscv_fir_lattice_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          q31_t *pState;                       /**< points to the state variable array. The array is of length numStages. */
+    const q31_t *pCoeffs;                      /**< points to the coefficient array. The array is of length numStages. */
+  } riscv_fir_lattice_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point FIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          float32_t *pState;                   /**< points to the state variable array. The array is of length numStages. */
+    const float32_t *pCoeffs;                  /**< points to the coefficient array. The array is of length numStages. */
+  } riscv_fir_lattice_instance_f32;
+
+
+  /**
+   * @brief Initialization function for the Q15 FIR lattice filter.
+   * @param[in,out] S          points to an instance of the Q15 FIR lattice structure.
+   * @param[in]     numStages  number of filter stages.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages.
+   */
+  void riscv_fir_lattice_init_q15(
+        riscv_fir_lattice_instance_q15 * S,
+        uint16_t numStages,
+  const q15_t * pCoeffs,
+        q15_t * pState);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR lattice filter.
+   * @param[in]  S          points to an instance of the Q15 FIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_lattice_q15(
+  const riscv_fir_lattice_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 FIR lattice filter.
+   * @param[in,out] S          points to an instance of the Q31 FIR lattice structure.
+   * @param[in]     numStages  number of filter stages.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages.
+   */
+  void riscv_fir_lattice_init_q31(
+        riscv_fir_lattice_instance_q31 * S,
+        uint16_t numStages,
+  const q31_t * pCoeffs,
+        q31_t * pState);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR lattice filter.
+   * @param[in]  S          points to an instance of the Q31 FIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_lattice_q31(
+  const riscv_fir_lattice_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+ * @brief Initialization function for the floating-point FIR lattice filter.
+ * @param[in,out] S          points to an instance of the floating-point FIR lattice structure.
+ * @param[in]     numStages  number of filter stages.
+ * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+ * @param[in]     pState     points to the state buffer.  The array is of length numStages.
+ */
+  void riscv_fir_lattice_init_f32(
+        riscv_fir_lattice_instance_f32 * S,
+        uint16_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief Processing function for the floating-point FIR lattice filter.
+   * @param[in]  S          points to an instance of the floating-point FIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_fir_lattice_f32(
+  const riscv_fir_lattice_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 IIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          q15_t *pState;                       /**< points to the state variable array. The array is of length numStages+blockSize. */
+          q15_t *pkCoeffs;                     /**< points to the reflection coefficient array. The array is of length numStages. */
+          q15_t *pvCoeffs;                     /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } riscv_iir_lattice_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 IIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          q31_t *pState;                       /**< points to the state variable array. The array is of length numStages+blockSize. */
+          q31_t *pkCoeffs;                     /**< points to the reflection coefficient array. The array is of length numStages. */
+          q31_t *pvCoeffs;                     /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } riscv_iir_lattice_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point IIR lattice filter.
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          float32_t *pState;                   /**< points to the state variable array. The array is of length numStages+blockSize. */
+          float32_t *pkCoeffs;                 /**< points to the reflection coefficient array. The array is of length numStages. */
+          float32_t *pvCoeffs;                 /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } riscv_iir_lattice_instance_f32;
+
+
+  /**
+   * @brief Processing function for the floating-point IIR lattice filter.
+   * @param[in]  S          points to an instance of the floating-point IIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_iir_lattice_f32(
+  const riscv_iir_lattice_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the floating-point IIR lattice filter.
+   * @param[in,out] S          points to an instance of the floating-point IIR lattice structure.
+   * @param[in]     numStages  number of stages in the filter.
+   * @param[in]     pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
+   * @param[in]     pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages+blockSize.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void riscv_iir_lattice_init_f32(
+        riscv_iir_lattice_instance_f32 * S,
+        uint16_t numStages,
+        float32_t * pkCoeffs,
+        float32_t * pvCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 IIR lattice filter.
+   * @param[in]  S          points to an instance of the Q31 IIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_iir_lattice_q31(
+  const riscv_iir_lattice_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 IIR lattice filter.
+   * @param[in,out] S          points to an instance of the Q31 IIR lattice structure.
+   * @param[in]     numStages  number of stages in the filter.
+   * @param[in]     pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
+   * @param[in]     pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages+blockSize.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void riscv_iir_lattice_init_q31(
+        riscv_iir_lattice_instance_q31 * S,
+        uint16_t numStages,
+        q31_t * pkCoeffs,
+        q31_t * pvCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 IIR lattice filter.
+   * @param[in]  S          points to an instance of the Q15 IIR lattice structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_iir_lattice_q15(
+  const riscv_iir_lattice_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+ * @brief Initialization function for the Q15 IIR lattice filter.
+ * @param[in,out] S          points to an instance of the fixed-point Q15 IIR lattice structure.
+ * @param[in]     numStages  number of stages in the filter.
+ * @param[in]     pkCoeffs   points to reflection coefficient buffer.  The array is of length numStages.
+ * @param[in]     pvCoeffs   points to ladder coefficient buffer.  The array is of length numStages+1.
+ * @param[in]     pState     points to state buffer.  The array is of length numStages+blockSize.
+ * @param[in]     blockSize  number of samples to process per call.
+ */
+  void riscv_iir_lattice_init_q15(
+        riscv_iir_lattice_instance_q15 * S,
+        uint16_t numStages,
+        q15_t * pkCoeffs,
+        q15_t * pvCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the floating-point LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          float32_t *pState;   /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          float32_t *pCoeffs;  /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t mu;        /**< step size that controls filter coefficient updates. */
+  } riscv_lms_instance_f32;
+
+
+  /**
+   * @brief Processing function for floating-point LMS filter.
+   * @param[in]  S          points to an instance of the floating-point LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_lms_f32(
+  const riscv_lms_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pRef,
+        float32_t * pOut,
+        float32_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for floating-point LMS filter.
+   * @param[in,out] S          points to an instance of the floating-point LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to the coefficient buffer.
+   * @param[in]     pState     points to state buffer.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void riscv_lms_init_f32(
+        riscv_lms_instance_f32 * S,
+        uint16_t numTaps,
+        float32_t * pCoeffs,
+        float32_t * pState,
+        float32_t mu,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          q15_t *pState;       /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q15_t *pCoeffs;      /**< points to the coefficient array. The array is of length numTaps. */
+          q15_t mu;            /**< step size that controls filter coefficient updates. */
+          uint32_t postShift;  /**< bit shift applied to coefficients. */
+  } riscv_lms_instance_q15;
+
+
+  /**
+   * @brief Initialization function for the Q15 LMS filter.
+   * @param[in,out] S          points to an instance of the Q15 LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to the coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   * @param[in]     postShift  bit shift applied to coefficients.
+   */
+  void riscv_lms_init_q15(
+        riscv_lms_instance_q15 * S,
+        uint16_t numTaps,
+        q15_t * pCoeffs,
+        q15_t * pState,
+        q15_t mu,
+        uint32_t blockSize,
+        uint32_t postShift);
+
+
+  /**
+   * @brief Processing function for Q15 LMS filter.
+   * @param[in]  S          points to an instance of the Q15 LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_lms_q15(
+  const riscv_lms_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pRef,
+        q15_t * pOut,
+        q15_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q31 LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          q31_t *pState;       /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q31_t *pCoeffs;      /**< points to the coefficient array. The array is of length numTaps. */
+          q31_t mu;            /**< step size that controls filter coefficient updates. */
+          uint32_t postShift;  /**< bit shift applied to coefficients. */
+  } riscv_lms_instance_q31;
+
+
+  /**
+   * @brief Processing function for Q31 LMS filter.
+   * @param[in]  S          points to an instance of the Q31 LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_lms_q31(
+  const riscv_lms_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pRef,
+        q31_t * pOut,
+        q31_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for Q31 LMS filter.
+   * @param[in,out] S          points to an instance of the Q31 LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to coefficient buffer.
+   * @param[in]     pState     points to state buffer.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   * @param[in]     postShift  bit shift applied to coefficients.
+   */
+  void riscv_lms_init_q31(
+        riscv_lms_instance_q31 * S,
+        uint16_t numTaps,
+        q31_t * pCoeffs,
+        q31_t * pState,
+        q31_t mu,
+        uint32_t blockSize,
+        uint32_t postShift);
+
+
+  /**
+   * @brief Instance structure for the floating-point normalized LMS filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of coefficients in the filter. */
+          float32_t *pState;    /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          float32_t *pCoeffs;   /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t mu;         /**< step size that controls filter coefficient updates. */
+          float32_t energy;     /**< saves previous frame energy. */
+          float32_t x0;         /**< saves previous input sample. */
+  } riscv_lms_norm_instance_f32;
+
+
+  /**
+   * @brief Processing function for floating-point normalized LMS filter.
+   * @param[in]  S          points to an instance of the floating-point normalized LMS filter structure
+   *                        (passed non-const; the structure holds the saved energy and previous input sample).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data (documented as input, although the pointer is not const-qualified).
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_lms_norm_f32(
+        riscv_lms_norm_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pRef,
+        float32_t * pOut,
+        float32_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for floating-point normalized LMS filter.
+   * @param[in] S          points to an instance of the floating-point normalized LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to coefficient buffer. Per the instance structure, the array is of length numTaps.
+   * @param[in] pState     points to state buffer. Per the instance structure, the array is of length numTaps+blockSize-1.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   */
+  void riscv_lms_norm_init_f32(
+        riscv_lms_norm_instance_f32 * S,
+        uint16_t numTaps,
+        float32_t * pCoeffs,
+        float32_t * pState,
+        float32_t mu,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q31 normalized LMS filter.
+   *
+   * Groups the filter configuration (tap count, buffers, step size, post-shift,
+   * reciprocal table) with the energy and previous-sample values that are saved
+   * between calls.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of coefficients in the filter. */
+          q31_t *pState;        /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q31_t *pCoeffs;       /**< points to the coefficient array. The array is of length numTaps. */
+          q31_t mu;             /**< step size that controls filter coefficient updates. */
+          uint8_t postShift;    /**< bit shift applied to coefficients. */
+    const q31_t *recipTable;    /**< points to the reciprocal initial value table (read-only). */
+          q31_t energy;         /**< saves previous frame energy. */
+          q31_t x0;             /**< saves previous input sample. */
+  } riscv_lms_norm_instance_q31;
+
+
+  /**
+   * @brief Processing function for Q31 normalized LMS filter.
+   * @param[in]  S          points to an instance of the Q31 normalized LMS filter structure
+   *                        (passed non-const; the structure holds the saved energy and previous input sample).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data (documented as input, although the pointer is not const-qualified).
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_lms_norm_q31(
+        riscv_lms_norm_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pRef,
+        q31_t * pOut,
+        q31_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for Q31 normalized LMS filter.
+   * @param[in] S          points to an instance of the Q31 normalized LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to coefficient buffer. Per the instance structure, the array is of length numTaps.
+   * @param[in] pState     points to state buffer. Per the instance structure, the array is of length numTaps+blockSize-1.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   * @param[in] postShift  bit shift applied to coefficients.
+   */
+  void riscv_lms_norm_init_q31(
+        riscv_lms_norm_instance_q31 * S,
+        uint16_t numTaps,
+        q31_t * pCoeffs,
+        q31_t * pState,
+        q31_t mu,
+        uint32_t blockSize,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Instance structure for the Q15 normalized LMS filter.
+   *
+   * Groups the filter configuration (tap count, buffers, step size, post-shift,
+   * reciprocal table) with the energy and previous-sample values that are saved
+   * between calls.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of coefficients in the filter. */
+          q15_t *pState;        /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q15_t *pCoeffs;       /**< points to the coefficient array. The array is of length numTaps. */
+          q15_t mu;             /**< step size that controls filter coefficient updates. */
+          uint8_t postShift;    /**< bit shift applied to coefficients. */
+    const q15_t *recipTable;    /**< points to the reciprocal initial value table (read-only). */
+          q15_t energy;         /**< saves previous frame energy. */
+          q15_t x0;             /**< saves previous input sample. */
+  } riscv_lms_norm_instance_q15;
+
+
+  /**
+   * @brief Processing function for Q15 normalized LMS filter.
+   * @param[in]  S          points to an instance of the Q15 normalized LMS filter structure
+   *                        (passed non-const; the structure holds the saved energy and previous input sample).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data (documented as input, although the pointer is not const-qualified).
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void riscv_lms_norm_q15(
+        riscv_lms_norm_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pRef,
+        q15_t * pOut,
+        q15_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for Q15 normalized LMS filter.
+   * @param[in] S          points to an instance of the Q15 normalized LMS filter structure.
+   * @param[in] numTaps    number of filter coefficients.
+   * @param[in] pCoeffs    points to coefficient buffer. Per the instance structure, the array is of length numTaps.
+   * @param[in] pState     points to state buffer. Per the instance structure, the array is of length numTaps+blockSize-1.
+   * @param[in] mu         step size that controls filter coefficient updates.
+   * @param[in] blockSize  number of samples to process.
+   * @param[in] postShift  bit shift applied to coefficients.
+   */
+  void riscv_lms_norm_init_q15(
+        riscv_lms_norm_instance_q15 * S,
+        uint16_t numTaps,
+        q15_t * pCoeffs,
+        q15_t * pState,
+        q15_t mu,
+        uint32_t blockSize,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Correlation of floating-point sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data. Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void riscv_correlate_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst);
+
+
+/**
+ @brief Correlation of Q15 sequences (variant that uses a caller-supplied scratch buffer).
+ @param[in]  pSrcA     points to the first input sequence.
+ @param[in]  srcALen   length of the first input sequence.
+ @param[in]  pSrcB     points to the second input sequence.
+ @param[in]  srcBLen   length of the second input sequence.
+ @param[out] pDst      points to the block of output data. Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in]  pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+*/
+void riscv_correlate_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch);
+
+
+/**
+  @brief Correlation of Q15 sequences.
+  @param[in]  pSrcA    points to the first input sequence.
+  @param[in]  srcALen  length of the first input sequence.
+  @param[in]  pSrcB    points to the second input sequence.
+  @param[in]  srcBLen  length of the second input sequence.
+  @param[out] pDst     points to the block of output data. Length 2 * max(srcALen, srcBLen) - 1.
+ */
+  void riscv_correlate_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+/**
+  @brief         Correlation of Q15 sequences (fast version).
+  @param[in]     pSrcA      points to the first input sequence.
+  @param[in]     srcALen    length of the first input sequence.
+  @param[in]     pSrcB      points to the second input sequence.
+  @param[in]     srcBLen    length of the second input sequence.
+  @param[out]    pDst       points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+  @return        none
+ */
+void riscv_correlate_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+/**
+  @brief Correlation of Q15 sequences (fast version that uses a caller-supplied scratch buffer).
+  @param[in]  pSrcA     points to the first input sequence.
+  @param[in]  srcALen   length of the first input sequence.
+  @param[in]  pSrcB     points to the second input sequence.
+  @param[in]  srcBLen   length of the second input sequence.
+  @param[out] pDst      points to the block of output data. Length 2 * max(srcALen, srcBLen) - 1.
+  @param[in]  pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ */
+void riscv_correlate_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch);
+
+
+  /**
+   * @brief Correlation of Q31 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data. Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void riscv_correlate_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+/**
+  @brief Correlation of Q31 sequences (fast version).
+  @param[in]  pSrcA    points to the first input sequence.
+  @param[in]  srcALen  length of the first input sequence.
+  @param[in]  pSrcB    points to the second input sequence.
+  @param[in]  srcBLen  length of the second input sequence.
+  @param[out] pDst     points to the block of output data. Length 2 * max(srcALen, srcBLen) - 1.
+ */
+void riscv_correlate_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+ /**
+   * @brief Correlation of Q7 sequences (variant that uses caller-supplied scratch buffers).
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data. Length 2 * max(srcALen, srcBLen) - 1.
+   * @param[in]  pScratch1  points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   */
+  void riscv_correlate_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Correlation of Q7 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data. Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void riscv_correlate_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst);
+
+
+  /**
+   * @brief Instance structure for the floating-point sparse FIR filter.
+   *
+   * Holds the coefficient and tap-delay arrays (both of length numTaps) together
+   * with the state buffer and the index of its oldest sample.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          float32_t *pState;            /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const float32_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps. */
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } riscv_fir_sparse_instance_f32;
+
+  /**
+   * @brief Instance structure for the Q31 sparse FIR filter.
+   *
+   * Holds the coefficient and tap-delay arrays (both of length numTaps) together
+   * with the state buffer and the index of its oldest sample.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q31_t *pState;                /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q31_t *pCoeffs;               /**< points to the coefficient array. The array is of length numTaps. */
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } riscv_fir_sparse_instance_q31;
+
+  /**
+   * @brief Instance structure for the Q15 sparse FIR filter.
+   *
+   * Holds the coefficient and tap-delay arrays (both of length numTaps) together
+   * with the state buffer and the index of its oldest sample.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q15_t *pState;                /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q15_t *pCoeffs;               /**< points to the coefficient array. The array is of length numTaps. */
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } riscv_fir_sparse_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q7 sparse FIR filter.
+   *
+   * Holds the coefficient and tap-delay arrays (both of length numTaps) together
+   * with the state buffer and the index of its oldest sample.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q7_t *pState;                 /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q7_t *pCoeffs;                /**< points to the coefficient array. The array is of length numTaps. */
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } riscv_fir_sparse_instance_q7;
+
+
+  /**
+   * @brief Processing function for the floating-point sparse FIR filter.
+   * @param[in]  S           points to an instance of the floating-point sparse FIR structure (passed non-const).
+   * @param[in]  pSrc        points to the block of input data.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  pScratchIn  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize   number of input samples to process per call.
+   */
+  void riscv_fir_sparse_f32(
+        riscv_fir_sparse_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        float32_t * pScratchIn,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the floating-point sparse FIR filter.
+   * @param[in,out] S          points to an instance of the floating-point sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients. Per the instance structure, the array is of length numTaps.
+   * @param[in]     pState     points to the state buffer. Per the instance structure, the array is of length maxDelay+blockSize-1.
+   * @param[in]     pTapDelay  points to the array of offset times. Per the instance structure, the array is of length numTaps.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void riscv_fir_sparse_init_f32(
+        riscv_fir_sparse_instance_f32 * S,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 sparse FIR filter.
+   * @param[in]  S           points to an instance of the Q31 sparse FIR structure (passed non-const).
+   * @param[in]  pSrc        points to the block of input data.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  pScratchIn  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize   number of input samples to process per call.
+   */
+  void riscv_fir_sparse_q31(
+        riscv_fir_sparse_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        q31_t * pScratchIn,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q31 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q31 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients. Per the instance structure, the array is of length numTaps.
+   * @param[in]     pState     points to the state buffer. Per the instance structure, the array is of length maxDelay+blockSize-1.
+   * @param[in]     pTapDelay  points to the array of offset times. Per the instance structure, the array is of length numTaps.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void riscv_fir_sparse_init_q31(
+        riscv_fir_sparse_instance_q31 * S,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 sparse FIR filter.
+   * @param[in]  S            points to an instance of the Q15 sparse FIR structure (passed non-const).
+   * @param[in]  pSrc         points to the block of input data.
+   * @param[out] pDst         points to the block of output data.
+   * @param[in]  pScratchIn   points to a temporary buffer of size blockSize.
+   * @param[in]  pScratchOut  points to a temporary buffer of size blockSize (q31_t elements, unlike the q15_t data buffers).
+   * @param[in]  blockSize    number of input samples to process per call.
+   */
+  void riscv_fir_sparse_q15(
+        riscv_fir_sparse_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        q15_t * pScratchIn,
+        q31_t * pScratchOut,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q15 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q15 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients. Per the instance structure, the array is of length numTaps.
+   * @param[in]     pState     points to the state buffer. Per the instance structure, the array is of length maxDelay+blockSize-1.
+   * @param[in]     pTapDelay  points to the array of offset times. Per the instance structure, the array is of length numTaps.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void riscv_fir_sparse_init_q15(
+        riscv_fir_sparse_instance_q15 * S,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q7 sparse FIR filter.
+   * @param[in]  S            points to an instance of the Q7 sparse FIR structure (passed non-const).
+   * @param[in]  pSrc         points to the block of input data.
+   * @param[out] pDst         points to the block of output data.
+   * @param[in]  pScratchIn   points to a temporary buffer of size blockSize.
+   * @param[in]  pScratchOut  points to a temporary buffer of size blockSize (q31_t elements, unlike the q7_t data buffers).
+   * @param[in]  blockSize    number of input samples to process per call.
+   */
+  void riscv_fir_sparse_q7(
+        riscv_fir_sparse_instance_q7 * S,
+  const q7_t * pSrc,
+        q7_t * pDst,
+        q7_t * pScratchIn,
+        q31_t * pScratchOut,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q7 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q7 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients. Per the instance structure, the array is of length numTaps.
+   * @param[in]     pState     points to the state buffer. Per the instance structure, the array is of length maxDelay+blockSize-1.
+   * @param[in]     pTapDelay  points to the array of offset times. Per the instance structure, the array is of length numTaps.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void riscv_fir_sparse_init_q7(
+        riscv_fir_sparse_instance_q7 * S,
+        uint16_t numTaps,
+  const q7_t * pCoeffs,
+        q7_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Floating-point sin_cos function.
+   * @param[in]  theta    input value in degrees.
+   * @param[out] pSinVal  points to the processed sine output.
+   * @param[out] pCosVal  points to the processed cosine output.
+   */
+  void riscv_sin_cos_f32(
+        float32_t theta,
+        float32_t * pSinVal,
+        float32_t * pCosVal);
+
+
+  /**
+   * @brief  Q31 sin_cos function.
+   * @param[in]  theta    scaled input value in degrees (the scaling is not specified in this header).
+   * @param[out] pSinVal  points to the processed sine output.
+   * @param[out] pCosVal  points to the processed cosine output.
+   */
+  void riscv_sin_cos_q31(
+        q31_t theta,
+        q31_t * pSinVal,
+        q31_t * pCosVal);
+
+
+  /**
+   * @brief  Floating-point complex conjugate.
+   * @param[in]  pSrc        points to the input vector.
+   * @param[out] pDst        points to the output vector.
+   * @param[in]  numSamples  number of complex samples in each vector.
+   */
+  void riscv_cmplx_conj_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+  /**
+   * @brief  Q31 complex conjugate.
+   * @param[in]  pSrc        points to the input vector.
+   * @param[out] pDst        points to the output vector.
+   * @param[in]  numSamples  number of complex samples in each vector.
+   */
+  void riscv_cmplx_conj_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex conjugate.
+   * @param[in]  pSrc        points to the input vector.
+   * @param[out] pDst        points to the output vector.
+   * @param[in]  numSamples  number of complex samples in each vector.
+   */
+  void riscv_cmplx_conj_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex magnitude squared.
+   * @param[in]  pSrc        points to the complex input vector.
+   * @param[out] pDst        points to the real output vector.
+   * @param[in]  numSamples  number of complex samples in the input vector.
+   */
+  void riscv_cmplx_mag_squared_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex magnitude squared.
+   * @param[in]  pSrc        points to the complex input vector.
+   * @param[out] pDst        points to the real output vector.
+   * @param[in]  numSamples  number of complex samples in the input vector.
+   */
+  void riscv_cmplx_mag_squared_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex magnitude squared.
+   * @param[in]  pSrc        points to the complex input vector.
+   * @param[out] pDst        points to the real output vector.
+   * @param[in]  numSamples  number of complex samples in the input vector.
+   */
+  void riscv_cmplx_mag_squared_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+ /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup PID PID Motor Control
+   *
+   * A Proportional Integral Derivative (PID) controller is a generic feedback control
+   * loop mechanism widely used in industrial control systems.
+   * A PID controller is the most commonly used type of feedback controller.
+   *
+   * This set of functions implements (PID) controllers
+   * for Q15, Q31, and floating-point data types.  The functions operate on a single sample
+   * of data and each call to the function returns a single processed value.
+   * <code>S</code> points to an instance of the PID control data structure.  <code>in</code>
+   * is the input sample value. The functions return the output value.
+   *
+   * \par Algorithm:
+   * <pre>
+   *    y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]
+   *    A0 = Kp + Ki + Kd
+   *    A1 = (-Kp ) - (2 * Kd )
+   *    A2 = Kd
+   * </pre>
+   *
+   * \par
+   * where \c Kp is proportional constant, \c Ki is Integral constant and \c Kd is Derivative constant
+   *
+   * \par
+   * \image html PID.png "Proportional Integral Derivative Controller"
+   *
+   * \par
+   * The PID controller calculates an "error" value as the difference between
+   * the measured output and the reference input.
+   * The controller attempts to minimize the error by adjusting the process control inputs.
+   * The proportional value determines the reaction to the current error,
+   * the integral value determines the reaction based on the sum of recent errors,
+   * and the derivative value determines the reaction based on the rate at which the error has been changing.
+   *
+   * \par Instance Structure
+   * The Gains A0, A1, A2 and state variables for a PID controller are stored together in an instance data structure.
+   * A separate instance structure must be defined for each PID Controller.
+   * There are separate instance structure declarations for each of the 3 supported data types.
+   *
+   * \par Reset Functions
+   * There is also an associated reset function for each data type which clears the state array.
+   *
+   * \par Initialization Functions
+   * There is also an associated initialization function for each data type.
+   * The initialization function performs the following operations:
+   * - Initializes the Gains A0, A1, A2 from the Kp, Ki, Kd gains.
+   * - Zeros out the values in the state buffer.
+   *
+   * \par
+   * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
+   *
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the fixed-point versions of the PID Controller functions.
+   * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup PID
+   * @{
+   */
+
+  /**
+   * @brief         Run one step of the floating-point PID controller.
+   * @param[in,out] S   instance of the floating-point PID Control structure; the gains A0, A1, A2 are read
+   *                    and the three state entries are rewritten on every call
+   * @param[in]     in  input sample to process
+   * @return        processed output sample (the same value is stored in S->state[2]).
+   */
+  __STATIC_FORCEINLINE float32_t riscv_pid_f32(
+  riscv_pid_instance_f32 * S,
+  float32_t in)
+  {
+    /* History kept in the instance: state[0] = x[n-1], state[1] = x[n-2], state[2] = y[n-1] */
+    const float32_t prevIn     = S->state[0];
+    const float32_t prevPrevIn = S->state[1];
+    const float32_t prevOut    = S->state[2];
+
+    /* y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]  */
+    const float32_t result =
+      (S->A0 * in) + (S->A1 * prevIn) + (S->A2 * prevPrevIn) + prevOut;
+
+    /* Age the history by one sample and remember the new output */
+    S->state[0] = in;
+    S->state[1] = prevIn;
+    S->state[2] = result;
+
+    return result;
+  }
+
+/**
+  @brief         Run one step of the Q31 PID controller.
+  @param[in,out] S   points to an instance of the Q31 PID Control structure; the gains A0, A1, A2 are read
+                     and the three state entries are rewritten on every call
+  @param[in]     in  input sample to process
+  @return        processed output sample (the same value is stored in S->state[2]).
+
+  \par Scaling and Overflow Behavior
+         The three gain/sample products are summed in a 64-bit accumulator.
+         The accumulator is then shifted right by 31 bits and narrowed to q31_t with a plain cast,
+         after which y[n-1] is added using ordinary q31_t addition.
+         No saturating operation is applied anywhere in this function, so a result outside the
+         q31_t range is not clipped; scale the input down if that can occur.
+ */
+__STATIC_FORCEINLINE q31_t riscv_pid_q31(
+  riscv_pid_instance_q31 * S,
+  q31_t in)
+  {
+    /* History kept in the instance: state[0] = x[n-1], state[1] = x[n-2] */
+    const q31_t prevIn     = S->state[0];
+    const q31_t prevPrevIn = S->state[1];
+    q63_t sum;
+    q31_t result;
+
+    /* sum = A0 * x[n] + A1 * x[n-1] + A2 * x[n-2], accumulated term by term */
+    sum  = (q63_t) S->A0 * in;
+    sum += (q63_t) S->A1 * prevIn;
+    sum += (q63_t) S->A2 * prevPrevIn;
+
+    /* bring the accumulator back to 1.31 format, then add y[n-1] */
+    result = (q31_t) (sum >> 31U);
+    result += S->state[2];
+
+    /* Age the history by one sample and remember the new output */
+    S->state[0] = in;
+    S->state[1] = prevIn;
+    S->state[2] = result;
+
+    return result;
+  }
+
+
+/**
+  @brief         Process function for the Q15 PID Control.
+  @param[in,out] S   points to an instance of the Q15 PID Control structure; the gains are read
+                     and the three state entries are rewritten on every call
+  @param[in]     in  input sample to process
+  @return        processed output sample (the same value is stored in S->state[2]).
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using a 64-bit internal accumulator.
+         Both Gains and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+         The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+         There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+         After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+         Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+__STATIC_FORCEINLINE q15_t riscv_pid_q15(
+  riscv_pid_instance_q15 * S,
+  q15_t in)
+  {
+    q63_t acc;
+    q15_t out;
+
+#if defined (RISCV_MATH_DSP)
+    /* Implementation of PID controller */
+
+    /* acc = A0 * x[n]
+     * KMDA multiplies both 16-bit halves of its operands and adds the two products.
+     * 'in' is zero-extended so that its upper half is 0: a bare (uint32_t) cast
+     * sign-extends a negative sample to 0xFFFF in the upper half, which would add
+     * a stray upper-half product and make this path differ from the scalar one. */
+    acc = (q31_t) __RV_KMDA((uint32_t)S->A0, (uint32_t)(uint16_t)in);
+
+    /* acc += A1 * x[n-1] + A2 * x[n-2]  */
+    acc = (q63_t)__RV_SMALDA((uint64_t)acc, (uint32_t)S->A1, (uint32_t)read_q15x2 (S->state));
+#else
+    /* acc = A0 * x[n]  */
+    acc = ((q31_t) S->A0) * in;
+
+    /* acc += A1 * x[n-1] + A2 * x[n-2]  */
+    acc += (q31_t) S->A1 * S->state[0];
+    acc += (q31_t) S->A2 * S->state[1];
+#endif
+
+    /* acc += y[n-1], rescaled by 2^15 to line up with the 2.30 products.
+     * A multiplication is used because left-shifting a negative value is undefined behaviour in C. */
+    acc += (q63_t) S->state[2] * 32768;
+
+    /* saturate the output */
+    out = (q15_t) (__SSAT((acc >> 15), 16));
+
+    /* Update state */
+    S->state[1] = S->state[0];
+    S->state[0] = in;
+    S->state[2] = out;
+
+    /* return to application */
+    return (out);
+  }
+
+  /**
+   * @} end of PID group
+   */
+
+
+  /**
+   * @brief Floating-point (f32) matrix inverse.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns RISCV_MATH_SIZE_MISMATCH if the dimensions do not match.
+   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status RISCV_MATH_SINGULAR.
+   */
+  riscv_status riscv_mat_inverse_f32(
+  const riscv_matrix_instance_f32 * src,
+  riscv_matrix_instance_f32 * dst);
+
+
+  /**
+   * @brief Floating-point (f64) matrix inverse. Declaration only; the definition is not in this header section.
+   * @param[in]  src   points to the instance of the input f64 matrix structure.
+   * @param[out] dst   points to the instance of the output f64 matrix structure.
+   * @return RISCV_MATH_SIZE_MISMATCH if the dimensions do not match;
+   *         RISCV_MATH_SINGULAR if the input matrix is singular (does not have an inverse),
+   *         in which case the algorithm terminates.
+   */
+  riscv_status riscv_mat_inverse_f64(const riscv_matrix_instance_f64 * src,
+                                     riscv_matrix_instance_f64 * dst);
+
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup clarke Vector Clarke Transform
+   * Forward Clarke transform converts the instantaneous stator phases into a two-coordinate time invariant vector.
+   * Generally the Clarke transform uses three-phase currents <code>Ia, Ib and Ic</code> to calculate currents
+   * in the two-phase orthogonal stator axis <code>Ialpha</code> and <code>Ibeta</code>.
+   * When <code>Ialpha</code> is superposed with <code>Ia</code> as shown in the figure below
+   * \image html clarke.png Stator current space vector and its components in (a,b).
+   * and <code>Ia + Ib + Ic = 0</code>, in this condition <code>Ialpha</code> and <code>Ibeta</code>
+   * can be calculated using only <code>Ia</code> and <code>Ib</code>.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html clarkeFormula.png
+   * where <code>Ia</code> and <code>Ib</code> are the instantaneous stator phases and
+   * <code>pIalpha</code> and <code>pIbeta</code> are the two coordinates of time invariant vector.
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Clarke transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup clarke
+   * @{
+   */
+
+  /**
+   *
+   * @brief  Floating-point Clarke transform
+   * @param[in]  Ia       input three-phase coordinate <code>a</code>
+   * @param[in]  Ib       input three-phase coordinate <code>b</code>
+   * @param[out] pIalpha  receives the alpha component, which is simply <code>Ia</code>
+   * @param[out] pIbeta   receives the beta component, (1/sqrt(3)) * Ia + (2/sqrt(3)) * Ib
+   * @return        none
+   */
+  __STATIC_FORCEINLINE void riscv_clarke_f32(
+  float32_t Ia,
+  float32_t Ib,
+  float32_t * pIalpha,
+  float32_t * pIbeta)
+  {
+    const float32_t invSqrt3 = (float32_t) 0.57735026919;   /* 1/sqrt(3) */
+    const float32_t twoInvSqrt3 = (float32_t) 1.15470053838; /* 2/sqrt(3) */
+
+    /* alpha axis coincides with phase a */
+    *pIalpha = Ia;
+
+    /* beta axis is a weighted sum of phases a and b */
+    *pIbeta = (invSqrt3 * Ia + twoInvSqrt3 * Ib);
+  }
+
+
+/**
+  @brief  Clarke transform for Q31 version
+  @param[in]  Ia       input three-phase coordinate <code>a</code>
+  @param[in]  Ib       input three-phase coordinate <code>b</code>
+  @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+  @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The coefficients 1/sqrt(3) and 2/sqrt(3) are stored with 30 fractional bits.
+         Each product is formed in 64 bits and shifted right by 30, which brings it back to the
+         format of the inputs before it is narrowed to 32 bits.
+         The two narrowed products are combined with a saturating addition.
+ */
+__STATIC_FORCEINLINE void riscv_clarke_q31(
+  q31_t Ia,
+  q31_t Ib,
+  q31_t * pIalpha,
+  q31_t * pIbeta)
+  {
+    /* (1/sqrt(3)) * Ia, coefficient 0x24F34E8B has 30 fractional bits */
+    const q31_t scaledIa = (q31_t) (((q63_t) Ia * 0x24F34E8B) >> 30);
+
+    /* (2/sqrt(3)) * Ib, coefficient 0x49E69D16 has 30 fractional bits */
+    const q31_t scaledIb = (q31_t) (((q63_t) Ib * 0x49E69D16) >> 30);
+
+    /* pIalpha = Ia */
+    *pIalpha = Ia;
+
+    /* pIbeta = saturating sum of the two scaled phases */
+    *pIbeta = __QADD(scaledIa, scaledIb);
+  }
+
+  /**
+   * @} end of clarke group
+   */
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup inv_clarke Vector Inverse Clarke Transform
+   * Inverse Clarke transform converts the two-coordinate time invariant vector into instantaneous stator phases.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html clarkeInvFormula.png
+   * where <code>pIa</code> and <code>pIb</code> are the instantaneous stator phases and
+   * <code>Ialpha</code> and <code>Ibeta</code> are the two coordinates of time invariant vector.
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Clarke transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup inv_clarke
+   * @{
+   */
+
+   /**
+   * @brief  Floating-point Inverse Clarke transform
+   * @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
+   * @param[in]  Ibeta   input two-phase orthogonal vector axis beta
+   * @param[out] pIa     receives phase <code>a</code>, which is simply <code>Ialpha</code>
+   * @param[out] pIb     receives phase <code>b</code>, -(1/2) * Ialpha + (sqrt(3)/2) * Ibeta
+   * @return     none
+   */
+  __STATIC_FORCEINLINE void riscv_inv_clarke_f32(
+  float32_t Ialpha,
+  float32_t Ibeta,
+  float32_t * pIa,
+  float32_t * pIb)
+  {
+    const float32_t minusHalf = -0.5f;            /* -(1/2) */
+    const float32_t halfSqrt3 = 0.8660254039f;    /* sqrt(3)/2 */
+
+    /* phase a coincides with the alpha axis */
+    *pIa = Ialpha;
+
+    /* phase b is a weighted sum of the alpha and beta components */
+    *pIb = minusHalf * Ialpha + halfSqrt3 * Ibeta;
+  }
+
+
+/**
+  @brief  Inverse Clarke transform for Q31 version
+  @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
+  @param[in]  Ibeta   input two-phase orthogonal vector axis beta
+  @param[out] pIa     points to output three-phase coordinate <code>a</code>
+  @param[out] pIb     points to output three-phase coordinate <code>b</code>
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         Each 1.31 x 1.31 product is formed in 64 bits (2.62 format) and returned to 1.31 format
+         by discarding its lower 31 bits.
+         There is saturation on the subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void riscv_inv_clarke_q31(
+  q31_t Ialpha,
+  q31_t Ibeta,
+  q31_t * pIa,
+  q31_t * pIb)
+  {
+    /* (1/2) * Ialpha, coefficient 0x40000000 is 0.5 in 1.31 format */
+    const q31_t halfAlpha = (q31_t) (((q63_t) (Ialpha) * (0x40000000)) >> 31);
+
+    /* (sqrt(3)/2) * Ibeta, coefficient 0x6ED9EBA1 is sqrt(3)/2 in 1.31 format */
+    const q31_t scaledBeta = (q31_t) (((q63_t) (Ibeta) * (0x6ED9EBA1)) >> 31);
+
+    /* pIa = Ialpha */
+    *pIa = Ialpha;
+
+    /* pIb = (sqrt(3)/2) * Ibeta - (1/2) * Ialpha, with saturation */
+    *pIb = __QSUB(scaledBeta, halfAlpha);
+  }
+
+  /**
+   * @} end of inv_clarke group
+   */
+
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup park Vector Park Transform
+   *
+   * Forward Park transform converts the input two-coordinate vector to flux and torque components.
+   * The Park transform can be used to realize the transformation of the <code>Ialpha</code> and the <code>Ibeta</code> currents
+   * from the stationary to the moving reference frame and control the spatial relationship between
+   * the stator vector current and rotor flux vector.
+   * If we consider the d axis aligned with the rotor flux, the diagram below shows the
+   * current vector and the relationship from the two reference frames:
+   * \image html park.png "Stator current space vector and its component in (a,b) and in the d,q rotating reference frame"
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html parkFormula.png
+   * where <code>Ialpha</code> and <code>Ibeta</code> are the stator vector components,
+   * <code>pId</code> and <code>pIq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
+   * cosine and sine values of theta (rotor flux position).
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Park transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup park
+   * @{
+   */
+
+  /**
+   * @brief Floating-point Park transform
+   * @param[in]  Ialpha  input two-phase vector coordinate alpha
+   * @param[in]  Ibeta   input two-phase vector coordinate beta
+   * @param[out] pId     receives the d component,  Ialpha * cosVal + Ibeta * sinVal
+   * @param[out] pIq     receives the q component, -Ialpha * sinVal + Ibeta * cosVal
+   * @param[in]  sinVal  sine value of rotation angle theta
+   * @param[in]  cosVal  cosine value of rotation angle theta
+   * @return     none
+   *
+   * The function implements the forward Park transform.
+   *
+   */
+  __STATIC_FORCEINLINE void riscv_park_f32(
+  float32_t Ialpha,
+  float32_t Ibeta,
+  float32_t * pId,
+  float32_t * pIq,
+  float32_t sinVal,
+  float32_t cosVal)
+  {
+    /* Rotate (Ialpha, Ibeta) into the d,q frame */
+    const float32_t dAxis = Ialpha * cosVal + Ibeta * sinVal;
+    const float32_t qAxis = -Ialpha * sinVal + Ibeta * cosVal;
+
+    *pId = dAxis;
+    *pIq = qAxis;
+  }
+
+
+/**
+  @brief  Park transform for Q31 version
+  @param[in]  Ialpha  input two-phase vector coordinate alpha
+  @param[in]  Ibeta   input two-phase vector coordinate beta
+  @param[out] pId     points to output rotor reference frame d
+  @param[out] pIq     points to output rotor reference frame q
+  @param[in]  sinVal  sine value of rotation angle theta
+  @param[in]  cosVal  cosine value of rotation angle theta
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         Each 1.31 x 1.31 product is formed in 64 bits (2.62 format) and returned to 1.31 format
+         by discarding its lower 31 bits.
+         There is saturation on the addition and subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void riscv_park_q31(
+  q31_t Ialpha,
+  q31_t Ibeta,
+  q31_t * pId,
+  q31_t * pIq,
+  q31_t sinVal,
+  q31_t cosVal)
+  {
+    /* The four cross products, each brought back to 1.31 format */
+    const q31_t alphaCos = (q31_t) (((q63_t) (Ialpha) * (cosVal)) >> 31);
+    const q31_t betaSin  = (q31_t) (((q63_t) (Ibeta) * (sinVal)) >> 31);
+    const q31_t alphaSin = (q31_t) (((q63_t) (Ialpha) * (sinVal)) >> 31);
+    const q31_t betaCos  = (q31_t) (((q63_t) (Ibeta) * (cosVal)) >> 31);
+
+    /* pId = Ialpha * cosVal + Ibeta * sinVal, with saturation */
+    *pId = __QADD(alphaCos, betaSin);
+
+    /* pIq = Ibeta * cosVal - Ialpha * sinVal, with saturation */
+    *pIq = __QSUB(betaCos, alphaSin);
+  }
+
+  /**
+   * @} end of park group
+   */
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup inv_park Vector Inverse Park transform
+   * Inverse Park transform converts the input flux and torque components to two-coordinate vector.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html parkInvFormula.png
+   * where <code>pIalpha</code> and <code>pIbeta</code> are the stator vector components,
+   * <code>Id</code> and <code>Iq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
+   * cosine and sine values of theta (rotor flux position).
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Park transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup inv_park
+   * @{
+   */
+
+   /**
+   * @brief  Floating-point Inverse Park transform
+   * @param[in]  Id       input coordinate of rotor reference frame d
+   * @param[in]  Iq       input coordinate of rotor reference frame q
+   * @param[out] pIalpha  receives the alpha component, Id * cosVal - Iq * sinVal
+   * @param[out] pIbeta   receives the beta component,  Id * sinVal + Iq * cosVal
+   * @param[in]  sinVal   sine value of rotation angle theta
+   * @param[in]  cosVal   cosine value of rotation angle theta
+   * @return     none
+   */
+  __STATIC_FORCEINLINE void riscv_inv_park_f32(
+  float32_t Id,
+  float32_t Iq,
+  float32_t * pIalpha,
+  float32_t * pIbeta,
+  float32_t sinVal,
+  float32_t cosVal)
+  {
+    /* Rotate (Id, Iq) back into the stationary alpha,beta frame */
+    const float32_t alphaAxis = Id * cosVal - Iq * sinVal;
+    const float32_t betaAxis = Id * sinVal + Iq * cosVal;
+
+    *pIalpha = alphaAxis;
+    *pIbeta = betaAxis;
+  }
+
+
+/**
+  @brief  Inverse Park transform for Q31 version
+  @param[in]  Id       input coordinate of rotor reference frame d
+  @param[in]  Iq       input coordinate of rotor reference frame q
+  @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+  @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+  @param[in]  sinVal   sine value of rotation angle theta
+  @param[in]  cosVal   cosine value of rotation angle theta
+  @return     none
+
+  @par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         Each 1.31 x 1.31 product is formed in 64 bits (2.62 format) and returned to 1.31 format
+         by discarding its lower 31 bits.
+         There is saturation on the addition and subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void riscv_inv_park_q31(
+  q31_t Id,
+  q31_t Iq,
+  q31_t * pIalpha,
+  q31_t * pIbeta,
+  q31_t sinVal,
+  q31_t cosVal)
+  {
+    /* The four cross products, each brought back to 1.31 format */
+    const q31_t dCos = (q31_t) (((q63_t) (Id) * (cosVal)) >> 31);
+    const q31_t qSin = (q31_t) (((q63_t) (Iq) * (sinVal)) >> 31);
+    const q31_t dSin = (q31_t) (((q63_t) (Id) * (sinVal)) >> 31);
+    const q31_t qCos = (q31_t) (((q63_t) (Iq) * (cosVal)) >> 31);
+
+    /* pIalpha = Id * cosVal - Iq * sinVal, with saturation */
+    *pIalpha = __QSUB(dCos, qSin);
+
+    /* pIbeta = Iq * cosVal + Id * sinVal, with saturation */
+    *pIbeta = __QADD(qCos, dSin);
+  }
+
+  /**
+   * @} end of Inverse park group
+   */
+
+
+  /**
+   * @ingroup groupInterpolation
+   */
+
+  /**
+   * @defgroup LinearInterpolate Linear Interpolation
+   *
+   * Linear interpolation is a method of curve fitting using linear polynomials.
+   * Linear interpolation works by effectively drawing a straight line between two neighboring samples and returning the appropriate point along that line.
+   *
+   * \par
+   * \image html LinearInterp.png "Linear interpolation"
+   *
+   * \par
+   * A  Linear Interpolate function calculates an output value(y), for the input(x)
+   * using linear interpolation of the input values x0, x1( nearest input values) and the output values y0 and y1(nearest output values)
+   *
+   * \par Algorithm:
+   * <pre>
+   *       y = y0 + (x - x0) * ((y1 - y0)/(x1-x0))
+   *       where x0, x1 are nearest values of input x
+   *             y0, y1 are nearest values to output y
+   * </pre>
+   *
+   * \par
+   * This set of functions implements Linear interpolation process
+   * for Q7, Q15, Q31, and floating-point data types.  The functions operate on a single
+   * sample of data and each call to the function returns a single processed value.
+   * <code>S</code> points to an instance of the Linear Interpolate function data structure.
+   * <code>x</code> is the input sample value. The functions return the output value.
+   *
+   * \par
+   * If x is outside of the table boundary, linear interpolation returns the first value of the table
+   * if x is below the input range, and the last value of the table if x is above the range.
+   */
+
+  /**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+  /**
+   * @brief  Process function for the floating-point Linear Interpolation Function.
+   * @param[in,out] S  is an instance of the floating-point Linear Interpolation structure
+   * @param[in]     x  input sample to process
+   * @return y processed output sample.
+   *
+   * \par
+   * The table <code>S->pYData</code> holds <code>S->nValues</code> outputs for inputs that start at
+   * <code>S->x1</code> and are <code>S->xSpacing</code> apart.
+   * An input that lies before the first table point returns the first table value, an input at or
+   * beyond the last table point returns the last table value, and anything in between is
+   * interpolated linearly between its two neighbours.
+   */
+  __STATIC_FORCEINLINE float32_t riscv_linear_interp_f32(
+  riscv_linear_interp_instance_f32 * S,
+  float32_t x)
+  {
+    float32_t y;
+    float32_t x0, x1;                            /* Nearest input values */
+    float32_t y0, y1;                            /* Nearest output values */
+    float32_t xSpacing = S->xSpacing;            /* spacing between input values */
+    float32_t pos;                               /* Position of x in units of table steps */
+    int32_t i;                                   /* Index variable */
+    float32_t *pYData = S->pYData;               /* pointer to output table */
+
+    /* Position of x relative to the first table point */
+    pos = (x - S->x1) / xSpacing;
+
+    if (pos < 0.0f)
+    {
+      /* Below the table: return the first table value.
+       * The test is made on the fractional position, not on the truncated index:
+       * the float-to-int conversion rounds toward zero, so a position in (-1, 0)
+       * would become index 0 and be extrapolated instead of clamped. */
+      y = pYData[0];
+    }
+    else
+    {
+      /* Calculation of index */
+      i = (int32_t) pos;
+
+      if ((uint32_t)i >= (S->nValues - 1))
+      {
+        /* At or above the last table point: return the last table value */
+        y = pYData[S->nValues - 1];
+      }
+      else
+      {
+        /* Calculation of nearest input values */
+        x0 = S->x1 +  i      * xSpacing;
+        x1 = S->x1 + (i + 1) * xSpacing;
+
+        /* Read of nearest output values */
+        y0 = pYData[i];
+        y1 = pYData[i + 1];
+
+        /* Calculation of output */
+        y = y0 + (x - x0) * ((y1 - y0) / (x1 - x0));
+      }
+    }
+
+    /* returns output value */
+    return (y);
+  }
+
+
+   /**
+   *
+   * @brief  Process function for the Q31 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q31 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+   * This function can support maximum of table size 2^12.
+   * A negative index returns the first table value; an index of <code>nValues - 1</code> or more
+   * returns the last table value.
+   *
+   */
+  __STATIC_FORCEINLINE q31_t riscv_linear_interp_q31(
+  q31_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    q31_t y;                                     /* output */
+    q31_t y0, y1;                                /* Nearest output values */
+    q31_t fract;                                 /* fractional part */
+    int32_t index;                               /* Index to read nearest output values */
+
+    /* Input is in 12.20 format: the upper 12 bits are the (signed) table index */
+    index = ((x & (q31_t)0xFFF00000) >> 20);
+
+    /* At or beyond the last table point: return the last value */
+    if (index >= (int32_t)(nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+
+    /* Before the first table point: return the first value */
+    if (index < 0)
+    {
+      return (pYData[0]);
+    }
+
+    /* 20 bits for the fractional part */
+    /* shift left by 11 to keep fract in 1.31 format */
+    fract = (x & 0x000FFFFF) << 11;
+
+    /* Read two nearest output values from the index in 1.31(q31) format */
+    y0 = pYData[index];
+    y1 = pYData[index + 1];
+
+    /* Calculation of y0 * (1-fract) and y is in 2.30 format */
+    y = ((q31_t) ((q63_t) y0 * (0x7FFFFFFF - fract) >> 32));
+
+    /* Calculation of y0 * (1-fract) + y1 * fract and y is in 2.30 format */
+    y += ((q31_t) (((q63_t) y1 * fract) >> 32));
+
+    /* Convert y to 1.31 format.
+     * A multiplication by 2 is used rather than "<< 1": left-shifting a negative
+     * signed value is undefined behaviour in C. The two weights sum to 0x7FFFFFFF,
+     * so |y| never exceeds 2^30 and the doubling cannot overflow. */
+    return (y * 2);
+  }
+
+
+  /**
+   *
+   * @brief  Process function for the Q15 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q15 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+   * This function can support maximum of table size 2^12.
+   * A negative index returns the first table value; an index of <code>nValues - 1</code> or more
+   * returns the last table value.
+   *
+   */
+  __STATIC_FORCEINLINE q15_t riscv_linear_interp_q15(
+  q15_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    q63_t weighted;                              /* y0 * (1-fract) + y1 * fract, 20 extra fractional bits */
+    q15_t lower, upper;                          /* Table values on either side of x */
+    q31_t frac20;                                /* Fractional part of x, 20 bits */
+    int32_t slot;                                /* Table index taken from the upper 12 bits of x */
+
+    /* Input is in 12.20 format: the upper 12 bits are the (signed) table index */
+    slot = ((x & (int32_t)0xFFF00000) >> 20);
+
+    /* At or beyond the last table point: return the last value */
+    if (slot >= (int32_t)(nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+
+    /* Before the first table point: return the first value */
+    if (slot < 0)
+    {
+      return (pYData[0]);
+    }
+
+    /* Lower 20 bits of x are the fractional part */
+    frac20 = (x & 0x000FFFFF);
+
+    /* Table values on either side of x */
+    lower = pYData[slot];
+    upper = pYData[slot + 1];
+
+    /* Blend the two neighbours in 64-bit arithmetic */
+    weighted = ((q63_t) lower * (0xFFFFF - frac20)) + ((q63_t) upper * (frac20));
+
+    /* Drop the 20 fractional bits of the weight to get back to 1.15 format */
+    return (q15_t) (weighted >> 20);
+  }
+
+
+  /**
+   *
+   * @brief  Process function for the Q7 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q7 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+   * This function can support maximum of table size 2^12.
+   * A negative <code>x</code> returns the first table value; an index of <code>nValues - 1</code> or more
+   * returns the last table value.
+   */
+  __STATIC_FORCEINLINE q7_t riscv_linear_interp_q7(
+  q7_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    q31_t weighted;                              /* y0 * (1-fract) + y1 * fract, 20 extra fractional bits */
+    q7_t lower, upper;                           /* Table values on either side of x */
+    q31_t frac20;                                /* Fractional part of x, 20 bits */
+    uint32_t slot;                               /* Table index taken from the upper 12 bits of x */
+
+    /* Negative input lies before the table: return the first value */
+    if (x < 0)
+    {
+      return (pYData[0]);
+    }
+
+    /* Input is in 12.20 format: the upper 12 bits are the table index */
+    slot = (x >> 20) & 0xfff;
+
+    /* At or beyond the last table point: return the last value */
+    if (slot >= (nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+
+    /* Lower 20 bits of x are the fractional part */
+    frac20 = (x & 0x000FFFFF);
+
+    /* Table values on either side of x, in 1.7(q7) format */
+    lower = pYData[slot];
+    upper = pYData[slot + 1];
+
+    /* Blend the two neighbours */
+    weighted = (lower * (0xFFFFF - frac20)) + (upper * frac20);
+
+    /* Drop the 20 fractional bits of the weight to get back to 1.7(q7) format */
+    return (q7_t) (weighted >> 20);
+  }
+
+  /**
+   * @} end of LinearInterpolate group
+   */
+
+  /**
+   * @brief  Fast approximation to the trigonometric sine function for floating-point data.
+   *         Declaration only; the definition is not in this header section.
+   * @param[in] x  input value in radians.
+   * @return  sin(x).
+   */
+  float32_t riscv_sin_f32(float32_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric sine function for Q31 data.
+   *         Declaration only; the definition is not in this header section.
+   * @param[in] x  Scaled input value in radians.
+   * @return  sin(x).
+   */
+  q31_t riscv_sin_q31(q31_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric sine function for Q15 data.
+   *         Declaration only; the definition is not in this header section.
+   * @param[in] x  Scaled input value in radians.
+   * @return  sin(x).
+   */
+  q15_t riscv_sin_q15(q15_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric cosine function for floating-point data.
+   *         Declaration only; the definition is not in this header section.
+   * @param[in] x  input value in radians.
+   * @return  cos(x).
+   */
+  float32_t riscv_cos_f32(float32_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric cosine function for Q31 data.
+   *         Declaration only; the definition is not in this header section.
+   * @param[in] x  Scaled input value in radians.
+   * @return  cos(x).
+   */
+  q31_t riscv_cos_q31(q31_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric cosine function for Q15 data.
+   *         Declaration only; the definition is not in this header section.
+   * @param[in] x  Scaled input value in radians.
+   * @return  cos(x).
+   */
+  q15_t riscv_cos_q15(q15_t x);
+
+
+  /**
+   * @ingroup groupFastMath
+   */
+
+
+  /**
+   * @defgroup SQRT Square Root
+   *
+   * Computes the square root of a number.
+   * There are separate functions for Q15, Q31, and floating-point data types.
+   * The square root function is computed using the Newton-Raphson algorithm.
+   * This is an iterative algorithm of the form:
+   * <pre>
+   *      x1 = x0 - f(x0)/f'(x0)
+   * </pre>
+   * where <code>x1</code> is the current estimate,
+   * <code>x0</code> is the previous estimate, and
+   * <code>f'(x0)</code> is the derivative of <code>f()</code> evaluated at <code>x0</code>.
+   * For the square root function, the algorithm reduces to:
+   * <pre>
+   *     x0 = in/2                         [initial guess]
+   *     x1 = 1/2 * ( x0 + in / x0)        [each iteration]
+   * </pre>
+   */
+
+
+  /**
+   * @addtogroup SQRT
+   * @{
+   */
+
+/**
+  @brief         Floating-point square root function.
+  @param[in]     in    input value
+  @param[out]    pOut  square root of input value
+  @return        execution status
+                   - \ref RISCV_MATH_SUCCESS        : input value is zero or positive
+                   - \ref RISCV_MATH_ARGUMENT_ERROR : otherwise; *pOut is set to 0
+
+  When <code>__riscv_flen</code> is defined the root is computed with the <code>fsqrt.s</code>
+  instruction, otherwise with <code>sqrtf()</code>.
+ */
+__STATIC_FORCEINLINE riscv_status riscv_sqrt_f32(
+  float32_t in,
+  float32_t * pOut)
+  {
+    riscv_status status = RISCV_MATH_ARGUMENT_ERROR;
+
+    if (in >= 0.0f)
+    {
+      /* Valid operand: take the root in hardware when an FPU is present */
+#if defined ( __riscv_flen )
+      __ASM volatile("fsqrt.s %0, %1" : "=f"(*pOut) : "f"(in));
+#else
+      *pOut = sqrtf(in);
+#endif /*__riscv_flen*/
+
+      status = RISCV_MATH_SUCCESS;
+    }
+    else
+    {
+      /* Rejected operand: report the error and hand back zero */
+      *pOut = 0.0f;
+    }
+
+    return (status);
+  }
+
+
+/**
+  @brief         Q31 square root function. Declaration only; the definition is not in this header section.
+  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF
+  @param[out]    pOut  points to square root of input value
+  @return        execution status
+                   - \ref RISCV_MATH_SUCCESS        : input value is positive
+                   - \ref RISCV_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
+riscv_status riscv_sqrt_q31(q31_t in, q31_t * pOut);
+
+
+/**
+  @brief         Q15 square root function. Declaration only; the definition is not in this header section.
+  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x0000 to 0x7FFF
+  @param[out]    pOut  points to square root of input value
+  @return        execution status
+                   - \ref RISCV_MATH_SUCCESS        : input value is positive
+                   - \ref RISCV_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
+riscv_status riscv_sqrt_q15(q15_t in, q15_t * pOut);
+
+  /**
+   * @brief  Vector Floating-point square root function. Declaration only; the definition is not in this header section.
+   * @param[in]  pIn   input vector.
+   * @param[out] pOut  vector of square roots of input elements.
+   * @param[in]  len   length of input vector.
+   * @return none. No status is reported to the caller.
+   *
+   * NOTE(review): negative elements presumably produce a zero output, as in riscv_sqrt_f32 - confirm against the implementation.
+   */
+  void riscv_vsqrt_f32(float32_t * pIn, float32_t * pOut, uint16_t len);
+
+  /**
+   * @brief  Q31 counterpart of riscv_vsqrt_f32. Declaration only; the definition is not in this header section.
+   * @param  pIn   input vector (presumably [in] - confirm against the implementation).
+   * @param  pOut  output vector (presumably [out] - confirm against the implementation).
+   * @param  len   vector length (presumably in elements - confirm).
+   * @return none
+   */
+  void riscv_vsqrt_q31(q31_t * pIn, q31_t * pOut, uint16_t len);
+
+  /**
+   * @brief  Q15 counterpart of riscv_vsqrt_f32. Declaration only; the definition is not in this header section.
+   * @param  pIn   input vector (presumably [in] - confirm against the implementation).
+   * @param  pOut  output vector (presumably [out] - confirm against the implementation).
+   * @param  len   vector length (presumably in elements - confirm).
+   * @return none
+   */
+  void riscv_vsqrt_q15(q15_t * pIn, q15_t * pOut, uint16_t len);
+
+  /**
+   * @} end of SQRT group
+   */
+
+
+  /**
+   * @brief floating-point Circular write function.
+   *
+   * Copies <code>blockSize</code> samples from <code>src</code> into <code>circBuffer</code>, starting at
+   * <code>*writeOffset</code>. After each sample the source pointer advances by <code>srcInc</code> and the
+   * buffer offset by <code>bufferInc</code>; an offset that reaches <code>L</code> or more is reduced by
+   * <code>L</code> once. The final offset is stored back to <code>*writeOffset</code>.
+   * Samples are moved as raw <code>int32_t</code> words.
+   */
+  __STATIC_FORCEINLINE void riscv_circularWrite_f32(
+  int32_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const int32_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    uint32_t remaining;                          /* samples still to copy */
+    int32_t pos = *writeOffset;                  /* current write position in the circular buffer */
+
+    for (remaining = blockSize; remaining > 0U; remaining--)
+    {
+      /* store one sample and step the source */
+      circBuffer[pos] = *src;
+      src += srcInc;
+
+      /* step the buffer position; only the upper end is wrapped here */
+      pos += bufferInc;
+      if (pos >= L)
+      {
+        pos -= L;
+      }
+    }
+
+    /* publish the new write position */
+    *writeOffset = (uint16_t)pos;
+  }
+
+
+
+  /**
+   * @brief floating-point Circular Read function.
+   *
+   * Copies <code>blockSize</code> samples from <code>circBuffer</code>, starting at <code>*readOffset</code>,
+   * to <code>dst</code>. After each sample <code>dst</code> advances by <code>dstInc</code> and is reset to
+   * <code>dst_base</code> when it lands exactly on <code>dst_base + dst_length</code>; the buffer offset
+   * advances by <code>bufferInc</code> and is reduced by <code>L</code> once when it reaches <code>L</code>
+   * or more. The final offset is stored back to <code>*readOffset</code>.
+   * Samples are moved as raw <code>int32_t</code> words.
+   */
+  __STATIC_FORCEINLINE void riscv_circularRead_f32(
+  int32_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  int32_t * dst,
+  int32_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    uint32_t remaining;                               /* samples still to copy */
+    int32_t pos = *readOffset;                        /* current read position in the circular buffer */
+    int32_t * const dstLimit = dst_base + dst_length; /* one past the end of the destination buffer */
+
+    for (remaining = blockSize; remaining > 0U; remaining--)
+    {
+      /* fetch one sample and step the destination */
+      *dst = circBuffer[pos];
+      dst += dstInc;
+
+      /* destination is circular too: restart at its base when the end is hit */
+      if (dst == dstLimit)
+      {
+        dst = dst_base;
+      }
+
+      /* step the buffer position; only the upper end is wrapped here */
+      pos += bufferInc;
+      if (pos >= L)
+      {
+        pos -= L;
+      }
+    }
+
+    /* publish the new read position */
+    *readOffset = pos;
+  }
+
+
+  /**
+   * @brief Q15 Circular write function.
+   *
+   * Copies blockSize Q15 samples from src into circBuffer, starting at index *writeOffset.
+   * A write index that becomes negative is not wrapped.
+   * @param[out]    circBuffer   circular buffer that receives the samples
+   * @param[in]     L            wrap length; the index is reduced by L once it reaches L
+   * @param[in,out] writeOffset  start index on entry, next write index on exit
+   * @param[in]     bufferInc    index step applied after every store
+   * @param[in]     src          source samples
+   * @param[in]     srcInc       source pointer step applied after every load
+   * @param[in]     blockSize    number of samples to copy
+   */
+  __STATIC_FORCEINLINE void riscv_circularWrite_q15(
+  q15_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const q15_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    int32_t idx = *writeOffset;   /* running store index into circBuffer */
+    uint32_t count;               /* samples still to be copied */
+
+    for (count = blockSize; count > 0U; count--)
+    {
+      /* Store one sample, then step the source pointer */
+      circBuffer[idx] = *src;
+      src += srcInc;
+
+      /* Step the index; only overshoot past the upper bound L is wrapped */
+      idx += bufferInc;
+      if (idx >= L)
+      {
+        idx -= L;
+      }
+    }
+
+    /* Publish the next write index (narrowed to the caller's 16-bit type) */
+    *writeOffset = (uint16_t)idx;
+  }
+
+
+  /**
+   * @brief Q15 Circular Read function.
+   *
+   * Copies blockSize Q15 samples out of circBuffer, starting at index *readOffset.
+   * The destination pointer restarts at dst_base when it lands exactly on
+   * dst_base + dst_length.  A read index that becomes negative is not wrapped.
+   * The next read index is stored back through readOffset.
+   * @param[in]     circBuffer  circular buffer to read from
+   * @param[in]     L           wrap length; the index is reduced by L once it reaches L
+   * @param[in,out] readOffset  start index on entry, next read index on exit
+   * @param[in]     bufferInc   index step applied after every load
+   * @param[out]    dst         first destination element to write
+   * @param[in]     dst_base    start of the destination buffer
+   * @param[in]     dst_length  destination length in elements
+   * @param[in]     dstInc      destination pointer step applied after every store
+   * @param[in]     blockSize   number of samples to copy
+   */
+  __STATIC_FORCEINLINE void riscv_circularRead_q15(
+  q15_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  q15_t * dst,
+  q15_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    q15_t * const dst_end = dst_base + dst_length;   /* one past the destination */
+    int32_t idx = *readOffset;                       /* running load index */
+    uint32_t count;                                  /* samples still to be copied */
+
+    for (count = blockSize; count > 0U; count--)
+    {
+      /* Fetch one sample from the circular buffer into the destination */
+      *dst = circBuffer[idx];
+
+      /* Step the destination; restart at dst_base on landing exactly on its end */
+      dst += dstInc;
+      if (dst == dst_end)
+      {
+        dst = dst_base;
+      }
+
+      /* Step the read index; only overshoot past the upper bound L is wrapped */
+      idx += bufferInc;
+      if (idx >= L)
+      {
+        idx -= L;
+      }
+    }
+
+    /* Publish the next read index */
+    *readOffset = idx;
+  }
+
+
+  /**
+   * @brief Q7 Circular write function.
+   *
+   * Copies blockSize Q7 samples from src into circBuffer, starting at index *writeOffset.
+   * A write index that becomes negative is not wrapped.
+   * @param[out]    circBuffer   circular buffer that receives the samples
+   * @param[in]     L            wrap length; the index is reduced by L once it reaches L
+   * @param[in,out] writeOffset  start index on entry, next write index on exit
+   * @param[in]     bufferInc    index step applied after every store
+   * @param[in]     src          source samples
+   * @param[in]     srcInc       source pointer step applied after every load
+   * @param[in]     blockSize    number of samples to copy
+   */
+  __STATIC_FORCEINLINE void riscv_circularWrite_q7(
+  q7_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const q7_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    int32_t cursor = *writeOffset;   /* running store index into circBuffer */
+    uint32_t left;                   /* samples still to be copied */
+
+    for (left = blockSize; left > 0U; left--)
+    {
+      /* Store one sample, then step the source pointer */
+      circBuffer[cursor] = *src;
+      src += srcInc;
+
+      /* Step the index; only overshoot past the upper bound L is wrapped */
+      cursor += bufferInc;
+      if (cursor >= L)
+      {
+        cursor -= L;
+      }
+    }
+
+    /* Publish the next write index (narrowed to the caller's 16-bit type) */
+    *writeOffset = (uint16_t)cursor;
+  }
+
+
+  /**
+   * @brief Q7 Circular Read function.
+   *
+   * Copies blockSize Q7 samples out of circBuffer, starting at index *readOffset.
+   * The destination pointer restarts at dst_base when it lands exactly on
+   * dst_base + dst_length.  A read index that becomes negative is not wrapped.
+   * The next read index is stored back through readOffset.
+   * @param[in]     circBuffer  circular buffer to read from
+   * @param[in]     L           wrap length; the index is reduced by L once it reaches L
+   * @param[in,out] readOffset  start index on entry, next read index on exit
+   * @param[in]     bufferInc   index step applied after every load
+   * @param[out]    dst         first destination element to write
+   * @param[in]     dst_base    start of the destination buffer
+   * @param[in]     dst_length  destination length in elements
+   * @param[in]     dstInc      destination pointer step applied after every store
+   * @param[in]     blockSize   number of samples to copy
+   */
+  __STATIC_FORCEINLINE void riscv_circularRead_q7(
+  q7_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  q7_t * dst,
+  q7_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    q7_t * const dst_end = dst_base + dst_length;   /* one past the destination */
+    int32_t cursor = *readOffset;                   /* running load index */
+    uint32_t left;                                  /* samples still to be copied */
+
+    for (left = blockSize; left > 0U; left--)
+    {
+      /* Fetch one sample from the circular buffer into the destination */
+      *dst = circBuffer[cursor];
+
+      /* Step the destination; restart at dst_base on landing exactly on its end */
+      dst += dstInc;
+      if (dst == dst_end)
+      {
+        dst = dst_base;
+      }
+
+      /* Step the read index; only overshoot past the upper bound L is wrapped */
+      cursor += bufferInc;
+      if (cursor >= L)
+      {
+        cursor -= L;
+      }
+    }
+
+    /* Publish the next read index */
+    *readOffset = cursor;
+  }
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_power_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q63_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_power_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_power_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q63_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_power_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_mean_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_mean_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_mean_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_mean_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_var_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_var_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_var_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_rms_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_rms_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_rms_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_std_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_std_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output value.
+   */
+  void riscv_std_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Floating-point complex magnitude
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void riscv_cmplx_mag_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex magnitude
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void riscv_cmplx_mag_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex magnitude
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void riscv_cmplx_mag_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex dot product
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result returned here
+   * @param[out] imagResult  imaginary part of the result returned here
+   */
+  void riscv_cmplx_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t numSamples,
+        q31_t * realResult,
+        q31_t * imagResult);
+
+
+  /**
+   * @brief  Q31 complex dot product
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result returned here
+   * @param[out] imagResult  imaginary part of the result returned here
+   */
+  void riscv_cmplx_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t numSamples,
+        q63_t * realResult,
+        q63_t * imagResult);
+
+
+  /**
+   * @brief  Floating-point complex dot product
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result returned here
+   * @param[out] imagResult  imaginary part of the result returned here
+   */
+  void riscv_cmplx_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t numSamples,
+        float32_t * realResult,
+        float32_t * imagResult);
+
+
+  /**
+   * @brief  Q15 complex-by-real multiplication
+   * @param[in]  pSrcCmplx   points to the complex input vector
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void riscv_cmplx_mult_real_q15(
+  const q15_t * pSrcCmplx,
+  const q15_t * pSrcReal,
+        q15_t * pCmplxDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex-by-real multiplication
+   * @param[in]  pSrcCmplx   points to the complex input vector
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void riscv_cmplx_mult_real_q31(
+  const q31_t * pSrcCmplx,
+  const q31_t * pSrcReal,
+        q31_t * pCmplxDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex-by-real multiplication
+   * @param[in]  pSrcCmplx   points to the complex input vector
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void riscv_cmplx_mult_real_f32(
+  const float32_t * pSrcCmplx,
+  const float32_t * pSrcReal,
+        float32_t * pCmplxDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Minimum value of a Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] result     is output pointer
+   * @param[out] index      is the array index of the minimum value in the input buffer.
+   */
+  void riscv_min_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * result,
+        uint32_t * index);
+
+
+  /**
+   * @brief  Minimum value of a Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void riscv_min_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief  Minimum value of a Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void riscv_min_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief  Minimum value of a floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[in]  blockSize  is the number of samples to process
+   * @param[out] pResult    is output pointer
+   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
+   */
+  void riscv_min_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a Q7 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of maximum value returned here
+ */
+  void riscv_max_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a Q15 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of maximum value returned here
+ */
+  void riscv_max_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a Q31 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of maximum value returned here
+ */
+  void riscv_max_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a floating-point vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of maximum value returned here
+ */
+  void riscv_max_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief  Q15 complex-by-complex multiplication
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void riscv_cmplx_mult_cmplx_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex-by-complex multiplication
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void riscv_cmplx_mult_cmplx_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex-by-complex multiplication
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void riscv_cmplx_mult_cmplx_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief Converts the elements of the floating-point vector to Q31 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q31 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+  void riscv_float_to_q31(
+  const float32_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Converts the elements of the floating-point vector to Q15 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q15 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+  void riscv_float_to_q15(
+  const float32_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Converts the elements of the floating-point vector to Q7 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q7 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+  void riscv_float_to_q7(
+  const float32_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q31 vector to floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void riscv_q31_to_float(
+  const q31_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q31 vector to Q15 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void riscv_q31_to_q15(
+  const q31_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q31 vector to Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void riscv_q31_to_q7(
+  const q31_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q15 vector to floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void riscv_q15_to_float(
+  const q15_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q15 vector to Q31 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void riscv_q15_to_q31(
+  const q15_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q15 vector to Q7 vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void riscv_q15_to_q7(
+  const q15_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q7 vector to floating-point vector.
+   * @param[in]  pSrc       is input pointer
+   * @param[out] pDst       is output pointer
+   * @param[in]  blockSize  is the number of samples to process
+   */
+  void riscv_q7_to_float(
+  const q7_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q7 vector to Q31 vector.
+   * @param[in]  pSrc       input pointer
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void riscv_q7_to_q31(
+  const q7_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of the Q7 vector to Q15 vector.
+   * @param[in]  pSrc       input pointer
+   * @param[out] pDst       output pointer
+   * @param[in]  blockSize  number of samples to process
+   */
+  void riscv_q7_to_q15(
+  const q7_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @ingroup groupInterpolation
+   */
+
+  /**
+   * @defgroup BilinearInterpolate Bilinear Interpolation
+   *
+   * Bilinear interpolation is an extension of linear interpolation applied to a two dimensional grid.
+   * The underlying function <code>f(x, y)</code> is sampled on a regular grid and the interpolation process
+   * determines values between the grid points.
+   * Bilinear interpolation is equivalent to two step linear interpolation, first in the x-dimension and then in the y-dimension.
+   * Bilinear interpolation is often used in image processing to rescale images.
+   * The NMSIS DSP library provides bilinear interpolation functions for Q7, Q15, Q31, and floating-point data types.
+   *
+   * <b>Algorithm</b>
+   * \par
+   * The instance structure used by the bilinear interpolation functions describes a two dimensional data table.
+   * For floating-point, the instance structure is defined as:
+   * <pre>
+   *   typedef struct
+   *   {
+   *     uint16_t numRows;
+   *     uint16_t numCols;
+   *     float32_t *pData;
+   * } riscv_bilinear_interp_instance_f32;
+   * </pre>
+   *
+   * \par
+   * where <code>numRows</code> specifies the number of rows in the table;
+   * <code>numCols</code> specifies the number of columns in the table;
+   * and <code>pData</code> points to an array of size <code>numRows*numCols</code> values.
+   * The data table <code>pData</code> is organized in row order and the supplied data values fall on integer indexes.
+   * That is, table element (x,y) is located at <code>pData[x + y*numCols]</code> where x and y are integers.
+   *
+   * \par
+   * Let <code>(x, y)</code> specify the desired interpolation point.  Then define:
+   * <pre>
+   *     XF = floor(x)
+   *     YF = floor(y)
+   * </pre>
+   * \par
+   * The interpolated output point is computed as:
+   * <pre>
+   *  f(x, y) = f(XF, YF) * (1-(x-XF)) * (1-(y-YF))
+   *           + f(XF+1, YF) * (x-XF)*(1-(y-YF))
+   *           + f(XF, YF+1) * (1-(x-XF))*(y-YF)
+   *           + f(XF+1, YF+1) * (x-XF)*(y-YF)
+   * </pre>
+   * Note that the coordinates (x, y) contain integer and fractional components.
+   * The integer components specify which portion of the table to use while the
+   * fractional components control the interpolation process.
+   *
+   * \par
+   * If (x, y) is outside of the table boundary, bilinear interpolation returns zero output.
+   */
+
+
+  /**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+  /**
+  * @brief  Floating-point bilinear interpolation.
+  * @param[in] S  points to an instance of the interpolation structure.
+  * @param[in] X  interpolation coordinate along a row (column position).
+  * @param[in] Y  interpolation coordinate along a column (row position).
+  * @return interpolated value, or 0 when the 2x2 cell around (X, Y) is not
+  *         completely inside the table.
+  */
+  __STATIC_FORCEINLINE float32_t riscv_bilinear_interp_f32(
+  const riscv_bilinear_interp_instance_f32 * S,
+  float32_t X,
+  float32_t Y)
+  {
+    float32_t out;
+    float32_t f00, f01, f10, f11;
+    const float32_t *pData = S->pData;
+    int32_t xIndex, yIndex, index;
+    float32_t xdiff, ydiff;
+    float32_t b1, b2, b3, b4;
+
+    xIndex = (int32_t) X;
+    yIndex = (int32_t) Y;
+
+    /* Element (x, y) lives at pData[x + y * numCols], so x is bounded by numCols
+     * and y by numRows.  The cell also needs x + 1 and y + 1, hence the "- 2".
+     * Returns zero output when the cell is outside the table boundary */
+    if (xIndex < 0 || xIndex > (S->numCols - 2) || yIndex < 0 || yIndex > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* Index of f(XF, YF): the lower of the two rows used */
+    index = xIndex + yIndex * S->numCols;
+
+    /* Read the two neighbouring points of row YF */
+    f00 = pData[index];
+    f01 = pData[index + 1];
+
+    /* Index of f(XF, YF + 1): the next row */
+    index = xIndex + (yIndex + 1) * S->numCols;
+
+    /* Read the two neighbouring points of row YF + 1 */
+    f10 = pData[index];
+    f11 = pData[index + 1];
+
+    /* Calculation of intermediate values */
+    b1 = f00;
+    b2 = f01 - f00;
+    b3 = f10 - f00;
+    b4 = f00 - f01 - f10 + f11;
+
+    /* Calculation of fractional part in X */
+    xdiff = X - xIndex;
+
+    /* Calculation of fractional part in Y */
+    ydiff = Y - yIndex;
+
+    /* Calculation of bi-linear interpolated output */
+    out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff;
+
+    /* return to application */
+    return (out);
+  }
+
+
+  /**
+  * @brief  Q31 bilinear interpolation.
+  * @param[in] S  points to an instance of the interpolation structure.
+  * @param[in] X  interpolation coordinate in 12.20 format (column position).
+  * @param[in] Y  interpolation coordinate in 12.20 format (row position).
+  * @return interpolated value, or 0 when the 2x2 cell at (X, Y) leaves the table.
+  */
+  __STATIC_FORCEINLINE q31_t riscv_bilinear_interp_q31(
+  riscv_bilinear_interp_instance_q31 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q31_t out;                                   /* Temporary output */
+    q31_t acc = 0;                               /* output */
+    q31_t xfract, yfract;                        /* X, Y fractional parts */
+    q31_t x1, x2, y1, y2;                        /* Nearest output values */
+    int32_t rI, cI;                              /* Integer parts of X and Y */
+    q31_t *pYData = S->pData;                    /* pointer to output table values */
+    uint32_t nCols = S->numCols;                 /* number of columns (row stride) */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* rI steps within a row (bounded by numCols), cI selects the row (bounded by numRows). */
+    /* Entries rI + 1 and cI + 1 are read as well, so both must stay at least 2 below the size */
+    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* 20 bits for the fractional part */
+    /* shift left xfract by 11 to keep 1.31 format */
+    xfract = (X & 0x000FFFFF) << 11U;
+
+    /* Read two nearest output values from the index */
+    x1 = pYData[(rI) + (int32_t)nCols * (cI)    ];
+    x2 = pYData[(rI) + (int32_t)nCols * (cI) + 1];
+
+    /* 20 bits for the fractional part */
+    /* shift left yfract by 11 to keep 1.31 format */
+    yfract = (Y & 0x000FFFFF) << 11U;
+
+    /* Read two nearest output values from the index */
+    y1 = pYData[(rI) + (int32_t)nCols * (cI + 1)    ];
+    y2 = pYData[(rI) + (int32_t)nCols * (cI + 1) + 1];
+
+    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 3.29(q29) format */
+    out = ((q31_t) (((q63_t) x1  * (0x7FFFFFFF - xfract)) >> 32));
+    acc = ((q31_t) (((q63_t) out * (0x7FFFFFFF - yfract)) >> 32));
+
+    /* x2 * (xfract) * (1-yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) x2 * (0x7FFFFFFF - yfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (xfract) >> 32));
+
+    /* y1 * (1 - xfract) * (yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) y1 * (0x7FFFFFFF - xfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
+
+    /* y2 * (xfract) * (yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) y2 * (xfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
+
+    /* Convert acc to 1.31(q31) format; shift as unsigned so a negative acc is not UB */
+    return ((q31_t)((uint32_t)acc << 2));
+  }
+
+
+  /**
+  * @brief  Q15 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format (its integer part selects the column).
+  * @param[in]     Y  interpolation coordinate in 12.20 format (its integer part selects the row).
+  * @return interpolated value, or 0 if the 2x2 cell addressed by (X, Y) is not fully inside the table.
+  */
+  __STATIC_FORCEINLINE q15_t riscv_bilinear_interp_q15(
+  riscv_bilinear_interp_instance_q15 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q63_t acc = 0;                               /* output */
+    q31_t out;                                   /* Temporary output */
+    q15_t x1, x2, y1, y2;                        /* Nearest output values */
+    q31_t xfract, yfract;                        /* X, Y fractional parts */
+    int32_t rI, cI;                              /* Integer parts of X (column) and Y (row) */
+    const q15_t *pYData = S->pData;              /* pointer to output table values */
+    uint32_t nCols = S->numCols;                 /* num of columns (stride between rows) */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* rI is the integer part of X; it is added directly to the element offset */
+    rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* cI is the integer part of Y; it is multiplied by nCols in the element offset */
+    cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* Elements at rI, rI + 1 (within a row) and cI, cI + 1 (rows) are read below, */
+    /* so return zero unless that whole 2x2 cell lies inside the numRows x numCols table */
+    if (rI < 0 || rI > ((int32_t)S->numCols - 2) || cI < 0 || cI > ((int32_t)S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* 20 bits for the fractional part */
+    /* xfract should be in 12.20 format */
+    xfract = (X & 0x000FFFFF);
+
+    /* Read two nearest output values from the index */
+    x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI)    ];
+    x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
+
+    /* 20 bits for the fractional part */
+    /* yfract should be in 12.20 format */
+    yfract = (Y & 0x000FFFFF);
+
+    /* Read the two nearest output values from the next row */
+    y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1)    ];
+    y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
+
+    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 13.51 format */
+
+    /* x1 is in 1.15(q15), xfract in 12.20 format and out is in 13.35 format */
+    /* convert 13.35 to 13.31 by right shifting  and out is in 1.31 */
+    out = (q31_t) (((q63_t) x1 * (0xFFFFF - xfract)) >> 4U);
+    acc = ((q63_t) out * (0xFFFFF - yfract));
+
+    /* x2 * (xfract) * (1-yfract)  in 1.51 and adding to acc */
+    out = (q31_t) (((q63_t) x2 * (0xFFFFF - yfract)) >> 4U);
+    acc += ((q63_t) out * (xfract));
+
+    /* y1 * (1 - xfract) * (yfract)  in 1.51 and adding to acc */
+    out = (q31_t) (((q63_t) y1 * (0xFFFFF - xfract)) >> 4U);
+    acc += ((q63_t) out * (yfract));
+
+    /* y2 * (xfract) * (yfract)  in 1.51 and adding to acc */
+    out = (q31_t) (((q63_t) y2 * (xfract)) >> 4U);
+    acc += ((q63_t) out * (yfract));
+
+    /* acc is in 13.51 format and down shift acc by 36 times */
+    /* Convert out to 1.15 format */
+    return ((q15_t)(acc >> 36));
+  }
+
+
+  /**
+  * @brief  Q7 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format (its integer part selects the column).
+  * @param[in]     Y  interpolation coordinate in 12.20 format (its integer part selects the row).
+  * @return interpolated value, or 0 if the 2x2 cell addressed by (X, Y) is not fully inside the table.
+  */
+  __STATIC_FORCEINLINE q7_t riscv_bilinear_interp_q7(
+  riscv_bilinear_interp_instance_q7 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q63_t acc = 0;                               /* output */
+    q31_t out;                                   /* Temporary output */
+    q31_t xfract, yfract;                        /* X, Y fractional parts */
+    q7_t x1, x2, y1, y2;                         /* Nearest output values */
+    int32_t rI, cI;                              /* Integer parts of X (column) and Y (row) */
+    const q7_t *pYData = S->pData;               /* pointer to output table values */
+    uint32_t nCols = S->numCols;                 /* num of columns (stride between rows) */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* rI is the integer part of X; it is added directly to the element offset */
+    rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* cI is the integer part of Y; it is multiplied by nCols in the element offset */
+    cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* Elements at rI, rI + 1 (within a row) and cI, cI + 1 (rows) are read below, */
+    /* so return zero unless that whole 2x2 cell lies inside the numRows x numCols table */
+    if (rI < 0 || rI > ((int32_t)S->numCols - 2) || cI < 0 || cI > ((int32_t)S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* 20 bits for the fractional part */
+    /* xfract should be in 12.20 format */
+    xfract = (X & (q31_t)0x000FFFFF);
+
+    /* Read two nearest output values from the index */
+    x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI)    ];
+    x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
+
+    /* 20 bits for the fractional part */
+    /* yfract should be in 12.20 format */
+    yfract = (Y & (q31_t)0x000FFFFF);
+
+    /* Read the two nearest output values from the next row */
+    y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1)    ];
+    y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
+
+    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 16.47 format */
+    out = ((x1 * (0xFFFFF - xfract)));
+    acc = (((q63_t) out * (0xFFFFF - yfract)));
+
+    /* x2 * (xfract) * (1-yfract)  in 2.22 and adding to acc */
+    out = ((x2 * (0xFFFFF - yfract)));
+    acc += (((q63_t) out * (xfract)));
+
+    /* y1 * (1 - xfract) * (yfract)  in 2.22 and adding to acc */
+    out = ((y1 * (0xFFFFF - xfract)));
+    acc += (((q63_t) out * (yfract)));
+
+    /* y2 * (xfract) * (yfract)  in 2.22 and adding to acc */
+    out = ((y2 * (yfract)));
+    acc += (((q63_t) out * (xfract)));
+
+    /* acc in 16.47 format and down shift by 40 to convert to 1.7 format */
+    return ((q7_t)(acc >> 40));
+  }
+
+  /**
+   * @} end of BilinearInterpolate group
+   */
+
+
+/* SMMLAR: a = ((a << 32) + x * y + 2^31) >> 32 in 64-bit arithmetic; a is evaluated twice */
+#define multAcc_32x32_keep32_R(a, x, y) \
+    ((a) = (q31_t) (((((q63_t) (a)) << 32) + ((q63_t) (x) * (y)) + 0x80000000LL ) >> 32))
+
+/* SMMLSR: a = ((a << 32) - x * y + 2^31) >> 32 in 64-bit arithmetic; a is evaluated twice */
+#define multSub_32x32_keep32_R(a, x, y) \
+    ((a) = (q31_t) (((((q63_t) (a)) << 32) - ((q63_t) (x) * (y)) + 0x80000000LL ) >> 32))
+
+/* SMMULR: a = (x * y + 2^31) >> 32, i.e. the upper 32 bits of the 64-bit product after adding 2^31 */
+#define mult_32x32_keep32_R(a, x, y) \
+    ((a) = (q31_t) (((q63_t) (x) * (y) + 0x80000000LL ) >> 32))
+
+/* SMMLA: a += (x * y) >> 32, i.e. add the upper 32 bits of the 64-bit product (no rounding term) */
+#define multAcc_32x32_keep32(a, x, y) \
+    ((a) += (q31_t) (((q63_t) (x) * (y)) >> 32))
+
+/* SMMLS: a -= (x * y) >> 32, i.e. subtract the upper 32 bits of the 64-bit product (no rounding term) */
+#define multSub_32x32_keep32(a, x, y) \
+    ((a) -= (q31_t) (((q63_t) (x) * (y)) >> 32))
+
+/* SMMUL: a = (x * y) >> 32, i.e. the upper 32 bits of the 64-bit product (no rounding term) */
+#define mult_32x32_keep32(a, x, y) \
+    ((a) = (q31_t) (((q63_t) (x) * (y) ) >> 32))
+
+
+/* Optimization-level hooks: only LOW_OPTIMIZATION_ENTER expands to anything (GCC optimize("-O1")). */
+#define LOW_OPTIMIZATION_ENTER __attribute__((optimize("-O1")))
+#define LOW_OPTIMIZATION_EXIT
+#define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+#define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+
+#endif /* _RISCV_MATH_H */
+
+/**
+ *
+ * End of file.
+ */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imac.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imac.a
new file mode 100644
index 00000000..3f2e5918
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imac.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imacp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imacp.a
new file mode 100644
index 00000000..ed8e1f08
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imacp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafc.a
new file mode 100644
index 00000000..050b9148
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafc.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafcp.a
new file mode 100644
index 00000000..8adfbd24
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafcp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdc.a
new file mode 100644
index 00000000..262e9e4e
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdc.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdcp.a
new file mode 100644
index 00000000..c08ad914
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv32imafdcp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imac.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imac.a
new file mode 100644
index 00000000..04e3709b
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imac.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imacp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imacp.a
new file mode 100644
index 00000000..7f2d0706
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imacp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafc.a
new file mode 100644
index 00000000..577a9072
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafc.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafcp.a
new file mode 100644
index 00000000..7a4731cf
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafcp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdc.a
new file mode 100644
index 00000000..a77b5705
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdc.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdcp.a
new file mode 100644
index 00000000..d7ebb92a
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/DSP/GCC/libnmsis_dsp_rv64imafdcp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imac.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imac.a
new file mode 100644
index 00000000..0c53f849
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imac.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imacp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imacp.a
new file mode 100644
index 00000000..2b4bb7ac
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imacp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafc.a
new file mode 100644
index 00000000..f879b126
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafc.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafcp.a
new file mode 100644
index 00000000..43aff14b
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafcp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdc.a
new file mode 100644
index 00000000..4211799a
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdc.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdcp.a
new file mode 100644
index 00000000..09793aea
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv32imafdcp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imac.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imac.a
new file mode 100644
index 00000000..6868dbe2
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imac.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imacp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imacp.a
new file mode 100644
index 00000000..42d38954
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imacp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafc.a
new file mode 100644
index 00000000..e79348fb
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafc.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafcp.a
new file mode 100644
index 00000000..08d9c723
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafcp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdc.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdc.a
new file mode 100644
index 00000000..2eb57b6e
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdc.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdcp.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdcp.a
new file mode 100644
index 00000000..08360fb4
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/NN/GCC/libnmsis_nn_rv64imafdcp.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/mathlib/GCC/libmathlib_rv64imafdcpv.a b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/mathlib/GCC/libmathlib_rv64imafdcpv.a
new file mode 100644
index 00000000..c6cea696
Binary files /dev/null and b/kernel/arch/risc-v/nuclei/gcc/nmsis/Library/mathlib/GCC/libmathlib_rv64imafdcpv.a differ
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nn_tables.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nn_tables.h
new file mode 100644
index 00000000..3b068f5f
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nn_tables.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+ * Project:      NMSIS NN Library
+ * Title:        riscv_nn_tables.h
+ * Description:  Extern declaration for NN tables
+ *
+ * $Date:        17. January 2018
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor: RISC-V Cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _RISCV_NN_TABLES_H
+#define _RISCV_NN_TABLES_H
+
+#include "riscv_math.h"
+
+/**
+* @brief Lookup tables for the sigmoid and tanh activation functions.
+*        Each table has 256 entries and comes in a q15_t and a q7_t variant.
+*/
+
+extern const q15_t sigmoidTable_q15[256];
+extern const q7_t sigmoidTable_q7[256];
+
+extern const q7_t tanhTable_q7[256];
+extern const q15_t tanhTable_q15[256];
+
+  /**
+   * @brief 2-way (split) tables for the sigmoid activation function
+   *
+   * The lookup is split into two tables: the L table is used for values
+   * smaller than 1/4 and the H table for the remaining values.
+   * Only a q15_t version is provided; a split table does not make
+   * sense for the q7_t type.
+   */
+extern const q15_t sigmoidHTable_q15[192];
+extern const q15_t sigmoidLTable_q15[128];
+
+#endif                          /*  RISCV_NN_TABLES_H */
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnfunctions.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnfunctions.h
new file mode 100644
index 00000000..4b68e417
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnfunctions.h
@@ -0,0 +1,1134 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      NMSIS NN Library
+ * Title:        riscv_nnfunctions.h
+ * Description:  Public header file for NMSIS NN Library
+ *
+ * $Date:        13. July 2018
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor: RISC-V Cores
+ * -------------------------------------------------------------------- */
+
+/**
+   \mainpage NMSIS NN Software Library
+   *
+   * Introduction
+   * ------------
+   *
+   * This user manual describes the NMSIS NN software library,
+   * a collection of efficient neural network kernels developed to maximize the
+   * performance and minimize the memory footprint of neural networks on Nuclei N processor cores.
+   *
+   * The library is divided into a number of functions each covering a specific category:
+   * - Neural Network Convolution Functions
+   * - Neural Network Activation Functions
+   * - Fully-connected Layer Functions
+   * - Neural Network Pooling Functions
+   * - Softmax Functions
+   * - Neural Network Support Functions
+   *
+   * The library has separate functions for operating on different weight and activation data
+   * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of each
+   * kernel is included in its function description. The implementation details are also
+   * described in this paper [1].
+   *
+   * \note Please refer to [NMSIS-NN](../../../nn/index.html)
+   *
+   * Block Diagram
+   * --------
+   * \image html NMSIS-NN-OVERVIEW.PNG
+   *
+   * Examples
+   * --------
+   *
+   * The library ships with a number of examples which demonstrate how to use the library functions.
+   *
+   * Pre-processor Macros
+   * ------------
+   *
+   * Each library project has different pre-processor macros.
+   *
+   * - RISCV_MATH_DSP:
+   *
+   * Define macro RISCV_MATH_DSP if the silicon supports DSP instructions.
+   *
+   * - RISCV_NN_TRUNCATE:
+   *
+   * Define macro RISCV_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
+   *
+   *
+   * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
+   */
+
+/**
+ * @defgroup groupNN Neural Network Functions
+ * These functions perform basic operations for neural network layers.
+ */
+
+#ifndef _RISCV_NNFUNCTIONS_H
+#define _RISCV_NNFUNCTIONS_H
+
+#include "riscv_nnsupportfunctions.h"
+#include "riscv_nn_tables.h"
+
+#define USE_INTRINSIC /* NOTE(review): not tested anywhere in the visible part of this header; confirm which sources consume it */
+
+//#define RISCV_NN_TRUNCATE /* Uncomment to use floor instead of round-to-the-nearest-int for the computation */
+
+#ifdef __cplusplus
+extern    "C"
+{
+#endif
+
+/**
+ * @defgroup NNConv Neural Network Convolution Functions
+ *
+ * Perform convolution layer
+ *
+ * The convolution is implemented in 2 steps: im2col and GEMM
+ *
+ * im2col is a process of converting each patch of image data into
+ * a column. After im2col, the convolution is computed as matrix-matrix
+ * multiplication.
+ *
+ * To reduce the memory footprint, the im2col is performed partially.
+ * In each iteration, only a few columns (i.e., patches) are generated and
+ * computed with GEMM kernels similar to NMSIS-DSP riscv_mat_mult functions.
+ *
+ */
+
+  /**
+   * @brief Basic Q7 convolution function (single size per parameter; see the _nonsquare variant for x/y sizes)
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   *
+   */
+
+    riscv_status riscv_convolve_HWC_q7_basic(const q7_t * Im_in,
+                                         const uint16_t dim_im_in,
+                                         const uint16_t ch_im_in,
+                                         const q7_t * wt,
+                                         const uint16_t ch_im_out,
+                                         const uint16_t dim_kernel,
+                                         const uint16_t padding,
+                                         const uint16_t stride,
+                                         const q7_t * bias,
+                                         const uint16_t bias_shift,
+                                         const uint16_t out_shift,
+                                         q7_t * Im_out,
+                                         const uint16_t dim_im_out,
+                                         q15_t * bufferA,
+                                         q7_t * bufferB);
+
+  /**
+   * @brief Basic Q7 convolution function (non-square shape)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in_x  input tensor dimension x
+   * @param[in]       dim_im_in_y  input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel_x filter kernel size x
+   * @param[in]       dim_kernel_y filter kernel size y
+   * @param[in]       padding_x    padding size x
+   * @param[in]       padding_y    padding size y
+   * @param[in]       stride_x     convolution stride x
+   * @param[in]       stride_y     convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out_x output tensor dimension x
+   * @param[in]       dim_im_out_y output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   */
+
+    riscv_status riscv_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
+                                                  const uint16_t dim_im_in_x,
+                                                  const uint16_t dim_im_in_y,
+                                                  const uint16_t ch_im_in,
+                                                  const q7_t * wt,
+                                                  const uint16_t ch_im_out,
+                                                  const uint16_t dim_kernel_x,
+                                                  const uint16_t dim_kernel_y,
+                                                  const uint16_t padding_x,
+                                                  const uint16_t padding_y,
+                                                  const uint16_t stride_x,
+                                                  const uint16_t stride_y,
+                                                  const q7_t * bias,
+                                                  const uint16_t bias_shift,
+                                                  const uint16_t out_shift,
+                                                  q7_t * Im_out,
+                                                  const uint16_t dim_im_out_x,
+                                                  const uint16_t dim_im_out_y,
+                                                  q15_t * bufferA,
+                                                  q7_t * bufferB);
+
+  /**
+   * @brief Basic Q15 convolution function (q15_t input, weights, bias and output tensors)
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   *
+   */
+
+    riscv_status riscv_convolve_HWC_q15_basic(const q15_t * Im_in,
+                                          const uint16_t dim_im_in,
+                                          const uint16_t ch_im_in,
+                                          const q15_t * wt,
+                                          const uint16_t ch_im_out,
+                                          const uint16_t dim_kernel,
+                                          const uint16_t padding,
+                                          const uint16_t stride,
+                                          const q15_t * bias,
+                                          const uint16_t bias_shift,
+                                          const uint16_t out_shift,
+                                          q15_t * Im_out,
+                                          const uint16_t dim_im_out,
+                                          q15_t * bufferA,
+                                          q7_t * bufferB);
+
+  /**
+   * @brief Fast Q7 convolution function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * This function is the version with the full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is a multiple of 4
+   *   ch_im_out is a multiple of 2
+   */
+
+    riscv_status riscv_convolve_HWC_q7_fast(const q7_t * Im_in,
+                                        const uint16_t dim_im_in,
+                                        const uint16_t ch_im_in,
+                                        const q7_t * wt,
+                                        const uint16_t ch_im_out,
+                                        const uint16_t dim_kernel,
+                                        const uint16_t padding,
+                                        const uint16_t stride,
+                                        const q7_t * bias,
+                                        const uint16_t bias_shift,
+                                        const uint16_t out_shift,
+                                        q7_t * Im_out,
+                                        const uint16_t dim_im_out,
+                                        q15_t * bufferA,
+                                        q7_t * bufferB);
+
+  /**
+   * @brief Fast Q7 convolution function (non-square shape)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in_x  input tensor dimension x
+   * @param[in]       dim_im_in_y  input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel_x filter kernel size x
+   * @param[in]       dim_kernel_y filter kernel size y
+   * @param[in]       padding_x    padding size x
+   * @param[in]       padding_y    padding size y
+   * @param[in]       stride_x     convolution stride x
+   * @param[in]       stride_y     convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out_x output tensor dimension x
+   * @param[in]       dim_im_out_y output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * This function is the version with the full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is a multiple of 4
+   *   ch_im_out is a multiple of 2
+   */
+
+    riscv_status riscv_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
+                                                  const uint16_t dim_im_in_x,
+                                                  const uint16_t dim_im_in_y,
+                                                  const uint16_t ch_im_in,
+                                                  const q7_t * wt,
+                                                  const uint16_t ch_im_out,
+                                                  const uint16_t dim_kernel_x,
+                                                  const uint16_t dim_kernel_y,
+                                                  const uint16_t padding_x,
+                                                  const uint16_t padding_y,
+                                                  const uint16_t stride_x,
+                                                  const uint16_t stride_y,
+                                                  const q7_t * bias,
+                                                  const uint16_t bias_shift,
+                                                  const uint16_t out_shift,
+                                                  q7_t * Im_out,
+                                                  const uint16_t dim_im_out_x,
+                                                  const uint16_t dim_im_out_y,
+                                                  q15_t * bufferA,
+                                                  q7_t * bufferB);
+
+  /**
+   * @brief Fast Q7 version of 1x1 convolution (non-square shape)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in_x  input tensor dimension x
+   * @param[in]       dim_im_in_y  input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel_x filter kernel size x
+   * @param[in]       dim_kernel_y filter kernel size y
+   * @param[in]       padding_x    padding size x
+   * @param[in]       padding_y    padding size y
+   * @param[in]       stride_x     convolution stride x
+   * @param[in]       stride_y     convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out_x output tensor dimension x
+   * @param[in]       dim_im_out_y output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
+   * and dim_kernel_y=1). It can be used for the
+   * second half of MobileNets after depthwise separable convolution.
+   *
+   * This function is the version with the full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is a multiple of 4
+   *   ch_im_out is a multiple of 2
+   */
+    riscv_status riscv_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
+                                                      const uint16_t dim_im_in_x,
+                                                      const uint16_t dim_im_in_y,
+                                                      const uint16_t ch_im_in,
+                                                      const q7_t * wt,
+                                                      const uint16_t ch_im_out,
+                                                      const uint16_t dim_kernel_x,
+                                                      const uint16_t dim_kernel_y,
+                                                      const uint16_t padding_x,
+                                                      const uint16_t padding_y,
+                                                      const uint16_t stride_x,
+                                                      const uint16_t stride_y,
+                                                      const q7_t * bias,
+                                                      const uint16_t bias_shift,
+                                                      const uint16_t out_shift,
+                                                      q7_t * Im_out,
+                                                      const uint16_t dim_im_out_x,
+                                                      const uint16_t dim_im_out_y,
+                                                      q15_t * bufferA,
+                                                      q7_t * bufferB);
+
+  /**
+   * @brief Q7 version of convolution for RGB image
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * This kernel is written exclusively for convolution with ch_im_in
+   * equal to 3. This applies to the first layer of CNNs, which has an input
+   * image in RGB format.
+   */
+
+    riscv_status riscv_convolve_HWC_q7_RGB(const q7_t * Im_in,
+                                       const uint16_t dim_im_in,
+                                       const uint16_t ch_im_in,
+                                       const q7_t * wt,
+                                       const uint16_t ch_im_out,
+                                       const uint16_t dim_kernel,
+                                       const uint16_t padding,
+                                       const uint16_t stride,
+                                       const q7_t * bias,
+                                       const uint16_t bias_shift,
+                                       const uint16_t out_shift,
+                                       q7_t * Im_out,
+                                       const uint16_t dim_im_out,
+                                       q15_t * bufferA,
+                                       q7_t * bufferB);
+
+  /**
+   * @brief Fast Q15 convolution function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * This function is the version with the full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is a multiple of 2
+   *   ch_im_out is a multiple of 2
+   *
+   * Input, weights, bias, output and bufferA are q15_t; bufferB is declared q7_t.
+   */
+
+    riscv_status riscv_convolve_HWC_q15_fast(const q15_t * Im_in,
+                                         const uint16_t dim_im_in,
+                                         const uint16_t ch_im_in,
+                                         const q15_t * wt,
+                                         const uint16_t ch_im_out,
+                                         const uint16_t dim_kernel,
+                                         const uint16_t padding,
+                                         const uint16_t stride,
+                                         const q15_t * bias,
+                                         const uint16_t bias_shift,
+                                         const uint16_t out_shift,
+                                         q15_t * Im_out,
+                                         const uint16_t dim_im_out,
+                                         q15_t * bufferA,
+                                         q7_t * bufferB);
+
+  /**
+   * @brief Fast Q15 convolution function (non-square shape)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in_x  input tensor dimension x
+   * @param[in]       dim_im_in_y  input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel_x filter kernel size x
+   * @param[in]       dim_kernel_y filter kernel size y
+   * @param[in]       padding_x    padding size x
+   * @param[in]       padding_y    padding size y
+   * @param[in]       stride_x     convolution stride x
+   * @param[in]       stride_y     convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out_x output tensor dimension x
+   * @param[in]       dim_im_out_y output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+   *   NOTE(review): this function has no dim_kernel parameter, only
+   *   dim_kernel_x and dim_kernel_y; the intended size is presumably
+   *   2*ch_im_in*dim_kernel_x*dim_kernel_y - confirm against the implementation.
+   *
+   * bufferB size: 0
+   *
+   * <b>Input dimension constraints:</b>
+   *
+   * ch_im_in is a multiple of 2
+   *
+   * ch_im_out is a multiple of 2
+   *
+   */
+
+    riscv_status
+    riscv_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
+                              const uint16_t dim_im_in_x,
+                              const uint16_t dim_im_in_y,
+                              const uint16_t ch_im_in,
+                              const q15_t * wt,
+                              const uint16_t ch_im_out,
+                              const uint16_t dim_kernel_x,
+                              const uint16_t dim_kernel_y,
+                              const uint16_t padding_x,
+                              const uint16_t padding_y,
+                              const uint16_t stride_x,
+                              const uint16_t stride_y,
+                              const q15_t * bias,
+                              const uint16_t bias_shift,
+                              const uint16_t out_shift,
+                              q15_t * Im_out,
+                              const uint16_t dim_im_out_x,
+                              const uint16_t dim_im_out_y,
+                              q15_t * bufferA,
+                              q7_t * bufferB);
+
+  /**
+   * @brief Q7 depthwise separable convolution function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * This function is the version with the full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is a multiple of 2
+   *   ch_im_out is a multiple of 2
+   */
+
+    riscv_status riscv_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
+                                                   const uint16_t dim_im_in,
+                                                   const uint16_t ch_im_in,
+                                                   const q7_t * wt,
+                                                   const uint16_t ch_im_out,
+                                                   const uint16_t dim_kernel,
+                                                   const uint16_t padding,
+                                                   const uint16_t stride,
+                                                   const q7_t * bias,
+                                                   const uint16_t bias_shift,
+                                                   const uint16_t out_shift,
+                                                   q7_t * Im_out,
+                                                   const uint16_t dim_im_out,
+                                                   q15_t * bufferA,
+                                                   q7_t * bufferB);
+
+  /**
+   * @brief Q7 depthwise separable convolution function (non-square shape)
+   * @param[in]       Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension x
+   * @param[in]       dim_im_in_y   input tensor dimension y
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       wt            pointer to kernel weights
+   * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size x
+   * @param[in]       dim_kernel_y  filter kernel size y
+   * @param[in]       padding_x     padding size x
+   * @param[in]       padding_y     padding size y
+   * @param[in]       stride_x      convolution stride x
+   * @param[in]       stride_y      convolution stride y
+   * @param[in]       bias          pointer to bias
+   * @param[in]       bias_shift    amount of left-shift for bias
+   * @param[in]       out_shift     amount of right-shift for output
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @param[in]       dim_im_out_x  output tensor dimension x
+   * @param[in]       dim_im_out_y  output tensor dimension y
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   bufferB       pointer to buffer space for output
+   * @return     The function returns either
+   * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * This function is the version with the full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is a multiple of 2
+   *   ch_im_out is a multiple of 2
+   */
+    riscv_status riscv_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
+                                                             const uint16_t dim_im_in_x,
+                                                             const uint16_t dim_im_in_y,
+                                                             const uint16_t ch_im_in,
+                                                             const q7_t * wt,
+                                                             const uint16_t ch_im_out,
+                                                             const uint16_t dim_kernel_x,
+                                                             const uint16_t dim_kernel_y,
+                                                             const uint16_t padding_x,
+                                                             const uint16_t padding_y,
+                                                             const uint16_t stride_x,
+                                                             const uint16_t stride_y,
+                                                             const q7_t * bias,
+                                                             const uint16_t bias_shift,
+                                                             const uint16_t out_shift,
+                                                             q7_t * Im_out,
+                                                             const uint16_t dim_im_out_x,
+                                                             const uint16_t dim_im_out_y,
+                                                             q15_t * bufferA,
+                                                             q7_t * bufferB);
+
+
+/**
+ * @defgroup FC Fully-connected Layer Functions
+ *
+ * Perform fully-connected layer
+ *
+ * Fully-connected layer is basically a matrix-vector multiplication
+ * with bias. The matrix is the weights and the input/output vectors
+ * are the activation values. Supported {weight, activation} precisions
+ * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
+ *
+ * Here we have two types of kernel functions. The basic function
+ * implements the function using the regular GEMV approach. The opt functions
+ * operate with weights in interleaved formats.
+ *
+ */
+
+  /**
+   * @brief Q7 basic fully-connected layer function
+   * @param[in]       pV          pointer to input vector
+   * @param[in]       pM          pointer to matrix weights
+   * @param[in]       dim_vec     length of the vector
+   * @param[in]       num_of_rows number of rows in weight matrix
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in]       bias        pointer to bias
+   * @param[in,out]   pOut        pointer to output vector
+   * @param[in,out]   vec_buffer  pointer to buffer space for input
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   *
+   * Input vector, weights, bias and output are q7_t; vec_buffer is declared
+   * as q15_t even though the vector data is q7_t.
+   *
+   */
+
+    riscv_status riscv_fully_connected_q7(const q7_t * pV,
+                                      const q7_t * pM,
+                                      const uint16_t dim_vec,
+                                      const uint16_t num_of_rows,
+                                      const uint16_t bias_shift,
+                                      const uint16_t out_shift,
+                                      const q7_t * bias,
+                                      q7_t * pOut,
+                                      q15_t * vec_buffer);
+
+  /**
+   * @brief S8 basic fully-connected layer function for TF Lite
+   * @param[in]       pInput                       pointer to input vector
+   * @param[in]       weight                       pointer to matrix weights
+   * @param[in]       input_length                 dimension of the input vector
+   * @param[in]       num_rows                     dimension of the output vector
+   * @param[in]       nb_batches                   number of batches
+   * @param[in]       input_offset                 offset for the input (no description in the original header; presumably a quantization offset - TODO confirm)
+   * @param[in]       filter_offset                offset for the weights (no description in the original header; presumably a quantization offset - TODO confirm)
+   * @param[in]       out_mult                     requantization parameter
+   * @param[in]       out_shift                    requantization parameter
+   * @param[in]       output_offset                offset for the output (no description in the original header; presumably a quantization offset - TODO confirm)
+   * @param[in]       bias                         pointer to bias
+   * @param[out]      pOut                         pointer to output vector
+   * @param[in]       output_activation_min        for clamping
+   * @param[in]       output_activation_max        for clamping
+   * @param[in,out]   vec_buffer                   pointer to buffer space for pInput
+   * @return          The function returns         RISCV_MATH_SUCCESS
+   *
+   * @details
+   *
+   * NOTE(review): the original comment named the parameters pWeight, col_dim,
+   * row_dim and pBias, which do not exist in the prototype. They are mapped
+   * here by position to weight, input_length, num_rows and bias; confirm
+   * against the implementation.
+   *
+   * <b>Buffer size:</b>
+   *
+   * vec_buffer size: input_length of word16.
+   *
+   * This basic function is designed to work with a regular weight
+   * matrix without interleaving.
+   *
+   */
+  riscv_status
+  riscv_fully_connected_s8(const int8_t   *pInput,             
+                         const int8_t   *weight,                  
+                         const uint16_t input_length,  
+                         const uint16_t num_rows,  
+                         const uint16_t nb_batches,   
+                         const int32_t  input_offset,   
+                         const int32_t  filter_offset,  
+                         const int32_t  out_mult,      
+                         const int32_t  out_shift,     
+                         const int32_t  output_offset,     
+                         const int8_t   *bias,             
+                         int8_t         *pOut,                   
+                         const int32_t  output_activation_min,
+                         const int32_t  output_activation_max,
+                         q15_t          *vec_buffer)  ;  
+
+  /**
+   * @brief Q7 opt fully-connected layer function
+   * @param[in]       pV          pointer to input vector
+   * @param[in]       pM          pointer to matrix weights
+   * @param[in]       dim_vec     length of the vector
+   * @param[in]       num_of_rows number of rows in weight matrix
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in]       bias        pointer to bias
+   * @param[in,out]   pOut        pointer to output vector
+   * @param[in,out]   vec_buffer  pointer to buffer space for input
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   *
+   * The signature is identical to riscv_fully_connected_q7. The exact weight
+   * layout expected in pM is not described in this header.
+   *
+   */
+
+    riscv_status riscv_fully_connected_q7_opt(const q7_t * pV,
+                                          const q7_t * pM,
+                                          const uint16_t dim_vec,
+                                          const uint16_t num_of_rows,
+                                          const uint16_t bias_shift,
+                                          const uint16_t out_shift,
+                                          const q7_t * bias,
+                                          q7_t * pOut,
+                                          q15_t * vec_buffer);
+
+  /**
+   * @brief Q15 basic fully-connected layer function
+   * @param[in]       pV          pointer to input vector
+   * @param[in]       pM          pointer to matrix weights
+   * @param[in]       dim_vec     length of the vector
+   * @param[in]       num_of_rows number of rows in weight matrix
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in]       bias        pointer to bias
+   * @param[in,out]   pOut        pointer to output vector
+   * @param[in,out]   vec_buffer  pointer to buffer space for input
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   *
+   * All pointer arguments (pV, pM, bias, pOut, vec_buffer) are q15_t.
+   *
+   */
+
+    riscv_status riscv_fully_connected_q15(const q15_t * pV,
+                                       const q15_t * pM,
+                                       const uint16_t dim_vec,
+                                       const uint16_t num_of_rows,
+                                       const uint16_t bias_shift,
+                                       const uint16_t out_shift,
+                                       const q15_t * bias,
+                                       q15_t * pOut,
+                                       q15_t * vec_buffer);
+
+  /**
+   * @brief Q15 opt fully-connected layer function
+   * @param[in]       pV          pointer to input vector
+   * @param[in]       pM          pointer to matrix weights
+   * @param[in]       dim_vec     length of the vector
+   * @param[in]       num_of_rows number of rows in weight matrix
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in]       bias        pointer to bias
+   * @param[in,out]   pOut        pointer to output vector
+   * @param[in,out]   vec_buffer  pointer to buffer space for input
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   *
+   * The signature is identical to riscv_fully_connected_q15. The exact weight
+   * layout expected in pM is not described in this header.
+   *
+   */
+
+    riscv_status riscv_fully_connected_q15_opt(const q15_t * pV,
+                                           const q15_t * pM,
+                                           const uint16_t dim_vec,
+                                           const uint16_t num_of_rows,
+                                           const uint16_t bias_shift,
+                                           const uint16_t out_shift,
+                                           const q15_t * bias,
+                                           q15_t * pOut,
+                                           q15_t * vec_buffer);
+
+  /**
+   * @brief Mixed Q15-Q7 fully-connected layer function
+   * @param[in]       pV          pointer to input vector
+   * @param[in]       pM          pointer to matrix weights
+   * @param[in]       dim_vec     length of the vector
+   * @param[in]       num_of_rows number of rows in weight matrix
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in]       bias        pointer to bias
+   * @param[in,out]   pOut        pointer to output vector
+   * @param[in,out]   vec_buffer  pointer to buffer space for input
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   *
+   * Type mix per the signature: the input vector, output vector and
+   * vec_buffer are q15_t, while the weight matrix and bias are q7_t.
+   *
+   */
+
+    riscv_status riscv_fully_connected_mat_q7_vec_q15(const q15_t * pV,
+                                                  const q7_t * pM,
+                                                  const uint16_t dim_vec,
+                                                  const uint16_t num_of_rows,
+                                                  const uint16_t bias_shift,
+                                                  const uint16_t out_shift,
+                                                  const q7_t * bias,
+                                                  q15_t * pOut,
+                                                  q15_t * vec_buffer);
+
+  /**
+   * @brief Mixed Q15-Q7 opt fully-connected layer function
+   * @param[in]       pV          pointer to input vector
+   * @param[in]       pM          pointer to matrix weights
+   * @param[in]       dim_vec     length of the vector
+   * @param[in]       num_of_rows number of rows in weight matrix
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in]       bias        pointer to bias
+   * @param[in,out]   pOut        pointer to output vector
+   * @param[in,out]   vec_buffer  pointer to buffer space for input
+   * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
+   *
+   * Type mix per the signature: the input vector, output vector and
+   * vec_buffer are q15_t, while the weight matrix and bias are q7_t.
+   * The exact weight layout expected in pM is not described in this header.
+   *
+   */
+
+    riscv_status riscv_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
+                                                      const q7_t * pM,
+                                                      const uint16_t dim_vec,
+                                                      const uint16_t num_of_rows,
+                                                      const uint16_t bias_shift,
+                                                      const uint16_t out_shift,
+                                                      const q7_t * bias,
+                                                      q15_t * pOut,
+                                                      q15_t * vec_buffer);
+
+/**
+ * @brief Matrix-Multiplication Kernels for Convolution
+ *
+ * These functions are used within convolution layer functions for
+ * matrix multiplication.
+ *
+ * The implementation is similar to NMSIS-DSP riscv_mat_mult functions
+ * with one Q7 and one Q15 operands. The Q15 operand is the im2col
+ * output which is always with 2 columns.
+ *
+ */
+
+  /**
+   * @brief Matrix-multiplication function for convolution
+   * @param[in]       pA          pointer to operand A
+   * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
+   * @param[in]       ch_im_out   numRow of A
+   * @param[in]       numCol_A    numCol of A
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in]       bias        the bias
+   * @param[in,out]   pOut        pointer to output
+   * @return     The function returns the incremented output pointer
+   *
+   * Two kernels are declared below with the same parameter list:
+   * riscv_nn_mat_mult_kernel_q7_q15 takes pInBuffer as q15_t, and
+   * riscv_nn_mat_mult_kernel_q7 takes pInBuffer as q7_t.
+   * NOTE(review): the q7 variant has no documentation of its own in this
+   * header; presumably the descriptions above apply to it as well - confirm
+   * against the implementation.
+   */
+
+    q7_t     *riscv_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
+                                            const q15_t * pInBuffer,
+                                            const uint16_t ch_im_out,
+                                            const uint16_t numCol_A,
+                                            const uint16_t bias_shift,
+                                            const uint16_t out_shift,
+                                            const q7_t * bias,
+                                            q7_t * pOut);
+
+    q7_t     *riscv_nn_mat_mult_kernel_q7(const q7_t * pA,
+                                            const q7_t * pInBuffer,
+                                            const uint16_t ch_im_out,
+                                            const uint16_t numCol_A,
+                                            const uint16_t bias_shift,
+                                            const uint16_t out_shift,
+                                            const q7_t * bias,
+                                            q7_t * pOut);
+
+  /**
+   * @brief Matrix-multiplication function for convolution with reordered columns
+   * @param[in]       pA          pointer to operand A
+   * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
+   * @param[in]       ch_im_out   numRow of A
+   * @param[in]       numCol_A    numCol of A
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in]       bias        the bias
+   * @param[in,out]   pOut        pointer to output
+   * @return     The function returns the incremented output pointer
+   *
+   * Two kernels are declared below with the same parameter list:
+   * riscv_nn_mat_mult_kernel_q7_q15_reordered takes pInBuffer as q15_t, and
+   * riscv_nn_mat_mult_kernel_q7_reordered takes pInBuffer as q7_t.
+   * NOTE(review): the q7 variant has no documentation of its own in this
+   * header; presumably the descriptions above apply to it as well - confirm
+   * against the implementation.
+   */
+
+    q7_t     *riscv_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
+                                                      const q15_t * pInBuffer,
+                                                      const uint16_t ch_im_out,
+                                                      const uint16_t numCol_A,
+                                                      const uint16_t bias_shift,
+                                                      const uint16_t out_shift,
+                                                      const q7_t * bias,
+                                                      q7_t * pOut);
+
+    q7_t     *riscv_nn_mat_mult_kernel_q7_reordered(const q7_t * pA,
+                                                      const q7_t * pInBuffer,
+                                                      const uint16_t ch_im_out,
+                                                      const uint16_t numCol_A,
+                                                      const uint16_t bias_shift,
+                                                      const uint16_t out_shift,
+                                                      const q7_t * bias,
+                                                      q7_t * pOut);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ *  Other functions
+ *  These layers are typically not timing critical
+ *  Basic implementation is supported here
+ */
+
+#ifdef __cplusplus
+extern    "C"
+{
+#endif
+
+/**
+ * @defgroup Acti Neural Network Activation Functions
+ *
+ * Perform activation layers, including ReLU (Rectified Linear Unit),
+ * sigmoid and tanh
+ *
+ */
+
+  /**
+   * @brief Q7 RELU function
+   * @param[in,out]   data        pointer to input
+   * @param[in]       size        number of elements
+   * @return none.
+   */
+
+    void      riscv_relu_q7(q7_t * data, uint16_t size);
+
+  /**
+   * @brief Q15 RELU function
+   * @param[in,out]   data        pointer to input
+   * @param[in]       size        number of elements
+   * @return none.
+   */
+
+    void      riscv_relu_q15(q15_t * data, uint16_t size);
+
+  /**
+   * @brief Q7 neural network activation function using direct table look-up
+   * @param[in,out]   data        pointer to input
+   * @param[in]       size        number of elements
+   * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
+   * @param[in]       type        type of activation functions
+   * @return none.
+   */
+
+    void      riscv_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
+                                           riscv_nn_activation_type type);
+
+  /**
+   * @brief Q15 neural network activation function using direct table look-up
+   * @param[in,out]   data        pointer to input
+   * @param[in]       size        number of elements
+   * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
+   * @param[in]       type        type of activation functions
+   * @return none.
+   */
+
+    void      riscv_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
+                                            riscv_nn_activation_type type);
+
+/**
+ * @defgroup Pooling Neural Network Pooling Functions
+ *
+ * Perform pooling functions, including max pooling and average pooling
+ *
+ */
+
+  /**
+   * @brief Q7 max pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   */
+
+    void      riscv_maxpool_q7_HWC(q7_t * Im_in,
+                                 const uint16_t dim_im_in,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel,
+                                 const uint16_t padding,
+                                 const uint16_t stride,
+                                 const uint16_t dim_im_out,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out);
+
+  /**
+   * @brief Q7 average pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   */
+
+    void      riscv_avepool_q7_HWC(q7_t * Im_in,
+                                 const uint16_t dim_im_in,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel,
+                                 const uint16_t padding,
+                                 const uint16_t stride,
+                                 const uint16_t dim_im_out,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out);
+
+/**
+ * @defgroup Softmax Softmax Functions
+ *
+ * EXP(2) based softmax function
+ *
+ */
+
+  /**
+   * @brief Q7 softmax function
+   * @param[in]       vec_in      pointer to input vector
+   * @param[in]       dim_vec     input vector dimension
+   * @param[out]      p_out       pointer to output vector
+   * @return none.
+   *
+   */
+
+    void      riscv_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
+
+  /**
+   * @brief Q15 softmax function
+   * @param[in]       vec_in      pointer to input vector
+   * @param[in]       dim_vec     input vector dimension
+   * @param[out]      p_out       pointer to output vector
+   * @return none.
+   *
+   */
+
+    void      riscv_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
+
+  /**
+   * @brief uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier
+   *        and input channels. Unless specified otherwise, arguments are mandatory.
+   *
+   * @param[in]     input     Pointer to input tensor
+   * @param[in]     input_x   Width of input tensor
+   * @param[in]     input_y   Height of input tensor
+   * @param[in]     input_ch  Channels in input tensor
+   * @param[in]     kernel    Pointer to kernel weights
+   * @param[in]     kernel_x  Width of kernel
+   * @param[in]     kernel_y  Height of kernel
+   * @param[in]     ch_mult   Number of channel multiplier
+   * @param[in]     pad_x     Padding sizes x
+   * @param[in]     pad_y     Padding sizes y
+   * @param[in]     stride_x  Convolution stride along the width
+   * @param[in]     stride_y  Convolution stride along the height
+   * @param[in]     dilation_x Dilation along width. Not used and intended for future enhancement.
+   * @param[in]     dilation_y Dilation along height. Not used and intended for future enhancement.
+   * @param[in]     bias       Pointer to optional bias values. If no bias is
+   *                           available, NULL is expected
+   * @param[in]     input_offset  Input tensor zero offset
+   * @param[in]     filter_offset Kernel tensor zero offset
+   * @param[in]     output_offset Output tensor zero offset
+   * @param[in,out] output        Pointer to output tensor
+   * @param[in]     output_x  Width of output tensor
+   * @param[in]     output_y  Height of output tensor
+   * @param[in]     output_activation_min   Minimum value to clamp the output to. Range : {0, 255}
+   * @param[in]     output_activation_max   Maximum value to clamp the output to. Range : {0, 255}
+   * @param[in]     out_shift  Amount of right-shift for output
+   * @param[in]     out_mult   Output multiplier for requantization
+   * @return        The function returns one of the following
+   *                <code>RISCV_MATH_SIZE_MISMATCH</code> - Not supported dimension of tensors
+   *                <code>RISCV_MATH_SUCCESS</code> - Successful operation
+   *                <code>RISCV_MATH_ARGUMENT_ERROR</code> - Implementation not available
+   *
+   * <b> Input constraints</b>
+   * ch_mult  is multiple of 2
+   * kernel_x is multiple of 2
+   *
+   */
+    riscv_status riscv_depthwise_conv_u8_basic_ver1(const uint8_t *input,
+                                                const uint16_t input_x,
+                                                const uint16_t input_y,
+                                                const uint16_t input_ch,
+                                                const uint8_t *kernel,
+                                                const uint16_t kernel_x,
+                                                const uint16_t kernel_y,
+                                                const int16_t ch_mult,
+                                                const int16_t pad_x,
+                                                const int16_t pad_y,
+                                                const int16_t stride_x,
+                                                const int16_t stride_y,
+                                                const int16_t dilation_x,
+                                                const int16_t dilation_y,
+                                                const int32_t *bias,
+                                                const int32_t input_offset,
+                                                const int32_t filter_offset,
+                                                const int32_t output_offset,
+                                                uint8_t *output,
+                                                const uint16_t output_x,
+                                                const uint16_t output_y,
+                                                const int32_t output_activation_min,
+                                                const int32_t output_activation_max,
+                                                const int32_t out_shift,
+                                                const int32_t out_mult);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnsupportfunctions.h b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnsupportfunctions.h
new file mode 100644
index 00000000..6061f08d
--- /dev/null
+++ b/kernel/arch/risc-v/nuclei/gcc/nmsis/NN/Include/riscv_nnsupportfunctions.h
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      NMSIS NN Library
+ * Title:        riscv_nnsupportfunctions.h
+ * Description:  Public header file of support functions for NMSIS NN Library
+ *
+ * $Date:        July 2019
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor: RISC-V Cores
+ * -------------------------------------------------------------------- */
+
+#ifndef _RISCV_NNSUPPORTFUNCTIONS_H_
+#define _RISCV_NNSUPPORTFUNCTIONS_H_
+
+#include "riscv_math.h"
+#include "riscv_common_tables.h"
+
+#ifdef __cplusplus
+extern    "C"
+{
+#endif
+
+#define LEFT_SHIFT(_shift)  ((_shift) > 0 ? (_shift) : 0)    /* _shift when positive, otherwise 0; argument may be evaluated twice */
+#define RIGHT_SHIFT(_shift) ((_shift) > 0 ? 0 : -(_shift))   /* -_shift when _shift <= 0, otherwise 0; argument may be evaluated twice */
+#define Q31_MIN (-0x7FFFFFFFL - 1L)   /* -2^31; spelled as an expression so it is negative whatever the width of long */
+#define Q31_MAX (0x7FFFFFFFL)         /* 2^31 - 1 */
+
+#define MAX(A,B) ((A) > (B) ? (A) : (B))   /* larger of A and B; outer parentheses keep it intact inside bigger expressions; the selected argument is evaluated twice */
+#define MIN(A,B) ((A) < (B) ? (A) : (B))   /* smaller of A and B; outer parentheses keep it intact inside bigger expressions; the selected argument is evaluated twice */
+
+/**
+ * @brief Union overlaying one q31 word, two q15 half-words and four q7 bytes for SIMD-style access
+ */
+union riscv_nnword
+{
+    q31_t     word;
+               /**< the storage viewed as a single q31 value */
+    q15_t     half_words[2];
+               /**< the same storage viewed as two q15 elements */
+    q7_t      bytes[4];
+               /**< the same storage viewed as four q7 elements */
+};
+
+/**
+ * @brief Enum for specifying activation function types
+ *
+ */
+typedef enum
+{
+    RISCV_SIGMOID = 0,
+                /**< Sigmoid activation function */
+    RISCV_TANH = 1,
+             /**< Tanh activation function */
+} riscv_nn_activation_type;
+
+/**
+ * @defgroup nndata_convert Neural Network Data Conversion Functions
+ *
+ * Perform data type conversion in-between neural network operations
+ *
+ */
+
+/**
+ * @brief Converts the elements of the q7 vector to q15 vector without left-shift
+ * @param[in]       *pSrc points to the q7 input vector
+ * @param[out]      *pDst points to the q15 output vector
+ * @param[in]       blockSize length of the input vector
+ * @return none.
+ *
+ */
+void      riscv_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t blockSize);
+
+void      riscv_q7_to_q7_no_shift(const q7_t * pSrc, q7_t * pDst, uint32_t blockSize);
+
+/**
+ * @brief Non-saturating addition of elements of a q7 vector
+ * @param[in]       *input Pointer to the q7 input vector
+ * @param[out]      *output Pointer to the q31 output variable.
+ * @param[in]       block_size length of the input vector
+ * @return none.
+ * \par Description:
+ *
+ * 2^24 samples can be added without saturating the result.
+ *
+ * The equation used for the summation is:
+ *
+ * <pre>
+ *  sum = input[0] + input[1] + .. + input[block_size -1]
+ * </pre>
+ *
+ * */
+void riscv_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size);
+
+/**
+ * @brief  Converts the elements of the q7 vector to reordered q15 vector without left-shift
+ * @param[in]       *pSrc points to the q7 input vector
+ * @param[out]      *pDst points to the q15 output vector
+ * @param[in]       blockSize length of the input vector
+ * @return none.
+ *
+ */
+void      riscv_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t blockSize);
+
+void      riscv_q7_to_q7_reordered_no_shift(const q7_t * pSrc, q7_t * pDst, uint32_t blockSize);
+
+/**
+ * @brief Converts the elements from a q7 vector to a q15 vector with an added offset
+ * @param[in]    *src       points to the q7 input vector
+ * @param[out]   *dst       points to the q15 output vector
+ * @param[in]    block_size length of the input vector
+ * @param[in]    offset     q7 offset to be added to each input vector element.
+ * @return none.
+ *
+ * \par Description:
+ *
+ * The equation used for the conversion process is:
+ *
+ * <pre>
+ *  dst[n] = (q15_t) src[n] + offset;   0 <= n < block_size.
+ * </pre>
+ *
+ */
+void riscv_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q7_t offset);
+
+#if defined (RISCV_MATH_DSP)
+
+/**
+ * @brief Load one word of packed q7 data from source and expand it into two words of packed q15 data
+ */
+
+__STATIC_FORCEINLINE void *read_and_pad(void *source, q31_t * out1, q31_t * out2)
+{
+        const q31_t packed = *__SIMD32(source)++;             /* read one word, post-incrementing source */
+        const q31_t ext_rotated = __SXTB16(__ROR(packed, 8)); /* expansion of the word rotated right by 8 */
+        const q31_t ext_direct = __SXTB16(packed);            /* expansion of the word as loaded */
+
+        *out2 = __PKHTB(ext_rotated, ext_direct, 16);
+        *out1 = __PKHBT(ext_direct, ext_rotated, 16);
+
+        return source;
+}
+
+/**
+ * @brief Load one word of packed q7 data and expand it into two q15 words, left in reordered form (no re-packing step)
+ */
+
+__STATIC_FORCEINLINE q7_t *read_and_pad_reordered(q7_t *source, q31_t * out1, q31_t * out2)
+{
+        const q31_t packed = read_q7x4_ia(&source);   /* helper is given &source, so it can move the pointer */
+        *out2 = __SXTB16(__ROR(packed, 8));
+        *out1 = __SXTB16(packed);
+
+        return source;
+}
+
+/**
+ * @brief Same expansion as read_and_pad_reordered, followed by a __QADD16 of offset onto each output word
+ */
+__STATIC_FORCEINLINE q7_t *read_and_pad_reordered_with_offset(q7_t *source, q31_t * out1, q31_t * out2,q31_t offset)
+{
+        const q31_t packed = read_q7x4_ia(&source);   /* helper is given &source, so it can move the pointer */
+
+        *out2 = __SXTB16(__ROR(packed, 8));
+        *out1 = __SXTB16(packed);
+        *out1 = __QADD16(*out1, offset);              /* offset is applied to the stored values, out1 first */
+        *out2 = __QADD16(*out2, offset);
+
+        return source;
+}
+
+
+#endif
+
+
+
+/**
+ * @defgroup NNBasicMath Basic Math Functions for Neural Network Computation
+ *
+ * Basic Math Functions for Neural Network Computation
+ *
+ */
+
+/**
+ * @brief           q15 vector multiplication with variable output shifts
+ * @param[in]       *pSrcA        pointer to the first input vector
+ * @param[in]       *pSrcB        pointer to the second input vector
+ * @param[out]      *pDst         pointer to the output vector
+ * @param[in]       out_shift     amount of right-shift for output
+ * @param[in]       blockSize     number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated.
+ */
+
+void riscv_nn_mult_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  q15_t * pDst,
+  const uint16_t out_shift,
+  uint32_t blockSize);
+
+/**
+ * @brief           q7 vector multiplication with variable output shifts
+ * @param[in]       *pSrcA        pointer to the first input vector
+ * @param[in]       *pSrcB        pointer to the second input vector
+ * @param[out]      *pDst         pointer to the output vector
+ * @param[in]       out_shift     amount of right-shift for output
+ * @param[in]       blockSize     number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable q7 range [0x80 0x7F] will be saturated.
+ */
+
+void riscv_nn_mult_q7(
+  q7_t * pSrcA,
+  q7_t * pSrcB,
+  q7_t * pDst,
+  const uint16_t out_shift,
+  uint32_t blockSize);
+
+/**
+ * @brief macro for adding rounding offset: half of (1 << out_shift), or 0 when RISCV_NN_TRUNCATE is defined
+ */
+#ifndef RISCV_NN_TRUNCATE
+    #define NN_ROUND(out_shift) ( (0x1 << (out_shift)) >> 1 )
+#else
+    #define NN_ROUND(out_shift) 0
+#endif
+
+/**
+ * @brief           Saturating doubling high multiply. Result matches
+ *                  NEON instruction VQRDMULH.
+ * @param[in]       m1        Multiplicand
+ * @param[in]       m2        Multiplier
+ * @return          (m1 * m2) / 2^31 rounded to nearest; Q31_MAX in the one
+ *                  overflowing case m1 == m2 == Q31_MIN.
+ */
+__STATIC_FORCEINLINE q31_t riscv_nn_sat_doubling_high_mult(const q31_t m1, const q31_t m2)
+{
+    q31_t result = 0;
+    // Rounding offset to add for a division by 2^31
+    q63_t mult = 1 << 30;
+
+    if ((m1 < 0) ^ (m2 < 0))
+    {
+        mult = 1 - mult;   // operands differ in sign: bias the other way, the division truncates toward zero
+    }
+    // Widen to q63_t before multiplying so the full product is kept
+    mult = mult + (q63_t)m1 * m2;
+
+    // Divide by a signed 64-bit constant: with 1UL the division turns unsigned
+    // where long is 64 bits wide, which is wrong for negative 'mult'.
+    result = (q31_t)(mult / (1LL << 31));
+
+    if ((m1 == m2) && (m1 == (q31_t)Q31_MIN))   // cast so the test holds even if Q31_MIN expands to a positive/unsigned constant
+    {
+        result = Q31_MAX;
+    }
+    return result;
+}
+
+/**
+ * @brief           Rounding divide by power of two.
+ * @param[in]       dividend - Dividend
+ * @param[in]       exponent - Divisor = power(2, exponent)
+ *                             Range: [0, 31]
+ * @return          Rounded result of division. Midpoint is rounded away from zero.
+ *
+ */
+__STATIC_FORCEINLINE q31_t riscv_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
+{
+    q31_t result = 0;
+    // Unsigned shift: a signed '1l << 31' overflows when long is 32 bits wide
+    const q31_t remainder_mask = (q31_t)((1UL << exponent) - 1UL);
+    int32_t remainder = remainder_mask & dividend;
+    // Basic division
+    result = dividend >> exponent;
+
+    // Adjust 'result' for rounding (mid point away from zero)
+    q31_t threshold = remainder_mask >> 1;
+    if (result < 0)
+    {
+        threshold++;   // negative quotient: an exact midpoint must not bump the result toward zero
+    }
+    if (remainder > threshold)
+    {
+        result++;
+    }
+
+    return result;
+}
+
+/**
+ * @brief           Requantize a given value.
+ * @param[in]       val         Value to be requantized
+ * @param[in]       multiplier  multiplier handed to riscv_nn_sat_doubling_high_mult
+ * @param[in]       shift       positive: val is first scaled by 2^shift; otherwise the product is divided by 2^(-shift)
+ * @return          Output of riscv_nn_divide_by_power_of_two applied to the doubling-high product
+ */
+__STATIC_FORCEINLINE q31_t riscv_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift)
+{
+  const q31_t scaled = val * (1 << LEFT_SHIFT(shift));
+  const q31_t product = riscv_nn_sat_doubling_high_mult(scaled, multiplier);
+
+  return riscv_nn_divide_by_power_of_two(product, RIGHT_SHIFT(shift));
+}
+
+/**
+  @brief         Fetch two consecutive q15 elements as one word, then advance the caller's pointer past them.
+  @param[in,out] in_q15   Address of the read pointer; on return it has moved forward by 2 elements.
+  @return        q31 value holding the 4 bytes that were copied
+ */
+__STATIC_FORCEINLINE q31_t riscv_nn_read_q15x2_ia(const q15_t **in_q15)
+{
+  const q15_t *cursor = *in_q15;
+  q31_t packed;
+
+  memcpy(&packed, cursor, 4);   /* byte-wise copy rather than a cast-and-dereference of the source pointer */
+  *in_q15 = cursor + 2;
+  return packed;
+}
+
+/**
+  @brief         Fetch four consecutive q7 elements as one word, then advance the caller's pointer past them.
+  @param[in,out] in_q7       Address of the read pointer; on return it has moved forward by 4 elements.
+  @return        q31 value holding the 4 bytes that were copied
+ */
+__STATIC_FORCEINLINE q31_t riscv_nn_read_q7x4_ia(const q7_t **in_q7)
+{
+  const q7_t *cursor = *in_q7;
+  q31_t packed;
+  memcpy(&packed, cursor, 4);   /* byte-wise copy rather than a cast-and-dereference of the source pointer */
+  *in_q7 = cursor + 4;
+  return packed;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif