diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 802dd1c9b..cb230b348 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lsx.S ISAMINKERNEL = isamin_lsx.S IDAMINKERNEL = idamin_lsx.S -SCOPYKERNEL = scopy_lsx.S -DCOPYKERNEL = dcopy_lsx.S +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S -SSWAPKERNEL = sswap_lsx.S -DSWAPKERNEL = dswap_lsx.S +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S SAXPYKERNEL = saxpy_lsx.S DAXPYKERNEL = daxpy_lsx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S -SSUMKERNEL = ssum_lsx.S -DSUMKERNEL = dsum_lsx.S +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3253489d9..ba59c4566 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lasx.S ISAMINKERNEL = isamin_lasx.S IDAMINKERNEL = idamin_lasx.S -SCOPYKERNEL = scopy_lasx.S -DCOPYKERNEL = dcopy_lasx.S +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S -SSWAPKERNEL = sswap_lasx.S -DSWAPKERNEL = dswap_lasx.S +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S SAXPYKERNEL = saxpy_lasx.S DAXPYKERNEL = daxpy_lasx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S -SSUMKERNEL = ssum_lasx.S -DSUMKERNEL = dsum_lasx.S +SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 32 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, 
.L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S new file mode 100644 index 000000000..bb10f3565 --- /dev/null +++ b/kernel/loongarch64/copy_lsx.S @@ -0,0 +1,316 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 + vst VX1, Y, 16 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 32 + vst VX1, Y, 48 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 16 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S deleted file mode 100644 index 9d7da4a80..000000000 --- a/kernel/loongarch64/dcopy_lasx.S +++ /dev/null @@ -1,224 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - 
xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvst VX0, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S deleted file mode 100644 index 161655bbd..000000000 --- a/kernel/loongarch64/dcopy_lsx.S +++ /dev/null @@ -1,232 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - addi.d I, I, -1 - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE 
- vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 0 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 2 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 4 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S index 2a9c3cf7b..5a6f7cf1e 100644 --- a/kernel/loongarch64/dnrm2_lasx.S +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VM0 $xr17 @@ -35,6 +69,7 @@ xvxor.v res1, res1, res1 xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,12 +81,11 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - xvld VM0, X, 0 bge $r0, I, .L97 .align 3 .L10: - xvld VX0, X, 0 * SIZE + xvld VX0, X, 0 xvld VX1, X, 4 * SIZE xvfmaxa.d VM1, VX1, VX0 xvfmaxa.d VM0, VM0, VM1 @@ -62,40 +96,32 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 .align 3 .L21: - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t4, 3 xvfmaxa.d VM1, VX0, VX1 @@ -109,9 +135,9 @@ xvpickve.d VX0, VM0, 1 xvpickve.d VX1, VM0, 2 xvpickve.d VM3, VM0, 3 - xvfmaxa.d VM1, VX0, VX1 - xvfmaxa.d VM2, VM3, VM0 - xvfmaxa.d VM0, VM1, VM2 + fmaxa.d $f17, $f17, $f14 + fmaxa.d $f17, $f17, $f15 + fmaxa.d $f17, $f17, $f16 .align 3 .L97: @@ -149,12 +175,12 @@ .align 3 .L110: - xvld VX0, XX, 0 * SIZE + xvld VX0, XX, 0 xvld VX1, XX, 4 * SIZE - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -166,34 +192,34 @@ bge $r0, I, .L997 .L121: - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 
add.d XX, XX, INCX xvinsgr2vr.d VX0, t1, 0 xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 add.d XX, XX, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 xvinsgr2vr.d VX1, t3, 2 xvinsgr2vr.d VX1, t4, 3 - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -203,10 +229,10 @@ xvfadd.d res1, res1, res2 xvpickve.d VX0, res1, 1 xvpickve.d VX1, res1, 2 - xvpickve.d VM0, res1, 3 - xvfadd.d res1, VX0, res1 - xvfadd.d VX1, VX1, VM0 - xvfadd.d res1, VX1, res1 + xvpickve.d VM2, res1, 3 + fadd.d $f19, $f19, $f15 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f13 .align 3 .L997: @@ -215,19 +241,17 @@ .align 3 .L998: - fld.d $f15, XX, 0 * SIZE + fld.d $f15, XX, 0 addi.d I, I, -1 fmul.d $f15, $f15, ALPHA fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 - .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 + .align 3 EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S index e4615e18d..fce4260e2 100644 --- a/kernel/loongarch64/dnrm2_lsx.S +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $vr15 #define VX1 $vr16 #define VM0 $vr17 @@ -35,6 +69,7 @@ vxor.v res1, res1, res1 vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,7 +81,7 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - vld VM0, X, 0 + bge $r0, I, .L97 .align 3 @@ -66,15 +101,7 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 .align 3 .L21: @@ -154,16 +181,16 @@ .L110: vld VX0, XX, 0 * SIZE vld VX1, XX, 2 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 vld VX0, XX, 4 * SIZE vld VX1, XX, 6 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -173,6 +200,7 @@ .L120: srai.d I, N, 3 bge $r0, I, .L997 + .align 3 .L121: ld.d t1, XX, 0 * SIZE @@ -187,14 +215,14 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA + vfmul.d VM2, VX0, VALPHA ld.d t1, XX, 0 * SIZE add.d XX, XX, INCX - vfmul.d VM1, VX1, VALPHA + vfmul.d VM3, VX1, VALPHA ld.d t2, XX, 0 * SIZE add.d XX, XX, INCX - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 ld.d t3, XX, 0 * SIZE add.d XX, XX, INCX ld.d t4, XX, 0 * SIZE @@ -203,10 +231,10 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -230,13 +258,11 @@ fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S deleted file mode 100644 index 3c51dab60..000000000 --- a/kernel/loongarch64/dsum_lasx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, 
res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fadd.d $f16, $f12, $f16 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S deleted file mode 100644 index 402d087df..000000000 --- a/kernel/loongarch64/dsum_lsx.S +++ /dev/null @@ -1,123 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d 
I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S deleted file mode 100644 index 221cb7fa2..000000000 --- a/kernel/loongarch64/dswap_lasx.S +++ /dev/null @@ -1,301 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvld VX2, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 3 - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 3 - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - xvstelm.d VX2, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX2, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - 
xvstelm.d VX2, X, 0, 3 - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvst VX0, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX3, X, 0, 0 - add.d X, X, INCY - ld.d t2, X, 0 * SIZE - xvstelm.d VX3, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX3, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvstelm.d VX3, X, 0, 3 - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S deleted file mode 100644 index 7f7f585e1..000000000 --- a/kernel/loongarch64/dswap_lsx.S +++ /dev/null @@ -1,317 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne 
INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX2, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - addi.d I, I, -1 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 4 * SIZE - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 2 * SIZE - vld VX2, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - 
blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S deleted file mode 100644 index 7db1e7cee..000000000 --- a/kernel/loongarch64/scopy_lasx.S +++ /dev/null @@ -1,216 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 7 - add.d Y, Y, INCY - addi.d X, X, 8 
* SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvst VX0, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S deleted file mode 100644 index 32150d3d6..000000000 --- a/kernel/loongarch64/scopy_lsx.S +++ /dev/null @@ -1,220 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 
- b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vst VX0, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S index 274908c14..3ae11e897 100644 --- a/kernel/loongarch64/snrm2_lasx.S +++ b/kernel/loongarch64/snrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -11,10 +43,13 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VX2 $xr17 #define VX3 $xr18 +#define VX4 $xr21 #define res1 $xr19 #define res2 $xr20 @@ -37,14 +72,13 @@ .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 0 * SIZE - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvld VX0, X, 0 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE blt $r0, I, .L10 .align 3 b .L996 @@ -54,70 +88,46 @@ .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX xvinsgr2vr.w VX0, t1, 0 xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX + ld.w t4, X, 0 xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 0 - xvinsgr2vr.w VX1, t2, 1 - xvinsgr2vr.w VX1, t3, 2 - xvinsgr2vr.w VX1, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 4 - xvinsgr2vr.w VX1, t2, 5 - xvinsgr2vr.w VX1, t3, 6 - xvinsgr2vr.w VX1, t4, 7 - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 .L996: xvfadd.d res1, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - 
xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f17 + fadd.d $f19, $f19, $f18 .align 3 .L997: @@ -126,11 +136,11 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE - addi.d I, I, -1 + fld.s $f15, X, 0 + add.d X, X, INCX + addi.d I, I, -1 fcvt.d.s $f15, $f15 - fmadd.d $f19, $f15, $f15, $f19 - add.d X, X, INCX + fmadd.d $f19, $f15, $f15, $f19 blt $r0, I, .L998 .align 3 diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S index 17d017900..bb492dbf0 100644 --- a/kernel/loongarch64/snrm2_lsx.S +++ b/kernel/loongarch64/snrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -15,6 +47,9 @@ #define VX1 $vr16 #define VX2 $vr17 #define VX3 $vr18 +#define VX4 $vr21 +#define VX5 $vr22 +/* Don't change following FR unless you know the effects. 
*/ #define res1 $vr19 #define res2 $vr20 @@ -24,99 +59,71 @@ LDINT N, 0(N) LDINT INCX, 0(INCX) #endif - vxor.v res1, res1, res1 vxor.v res2, res2, res2 - bge $r0, N, .L999 + bge $r0, N, .L999 beq $r0, INCX, .L999 li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - bge $r0, I, .L997 + bge $r0, I, .L997 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 0 * SIZE - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - vld VX2, X, 4 * SIZE - vld VX3, X, 4 * SIZE - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vld VX0, X, 0 + vld VX5, X, 4 * SIZE addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfcvtl.d.s VX3, VX5 + vfcvth.d.s VX4, VX5 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 blt $r0, I, .L10 b .L996 .align 3 - .L20: bge $r0, I, .L997 .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX vinsgr2vr.w VX0, t1, 0 vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 @@ -124,12 +131,8 @@ .L996: vfadd.d res1, res1, res2 - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 .align 3 .L997: @@ -138,7 +141,7 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE + fld.s $f15, X, 0 addi.d I, I, -1 fcvt.d.s $f15, $f15 fmadd.d $f19, $f15, $f15, $f19 diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S deleted file mode 100644 index 7cf57bc77..000000000 --- a/kernel/loongarch64/ssum_lasx.S +++ /dev/null @@ -1,140 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 
-#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvfadd.s res1, VX0, res1 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfadd.s res1, VX0, res1 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S deleted file mode 100644 index de63c69e3..000000000 --- a/kernel/loongarch64/ssum_lsx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s 
res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S deleted file mode 100644 index 7184eff45..000000000 --- a/kernel/loongarch64/sswap_lasx.S +++ /dev/null @@ -1,286 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 3 - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, 
INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 7 - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 3 - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 4 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 5 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 6 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 7 - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S deleted file 
mode 100644 index 4f19a8024..000000000 --- a/kernel/loongarch64/sswap_lsx.S +++ /dev/null @@ -1,294 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - addi.d I, I, -1 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 3 - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 3 - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - vstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - vstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX2, X, 0, 3 - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vst VX0, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX3, X, 0, 0 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - vstelm.w VX3, X, 0, 1 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - vstelm.w VX3, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX3, X, 0, 3 - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w 
VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S new file mode 100644 index 000000000..fd6d5adb3 --- /dev/null +++ b/kernel/loongarch64/sum_lasx.S @@ -0,0 +1,225 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 + xvfadd.s res1, res1, VX0 +#ifdef DOUBLE + xvld VX1, X, 32 + xvfadd.s res1, res1, VX1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 
+ xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S new file mode 100644 index 000000000..6b2027781 --- /dev/null +++ b/kernel/loongarch64/sum_lsx.S @@ -0,0 +1,204 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 + vld VX1, X, 16 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, res1, VX1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S new file mode 100644 index 000000000..4767fffe3 --- /dev/null +++ b/kernel/loongarch64/swap_lasx.S @@ -0,0 +1,401 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + xvld VX2, Y, 0 + addi.d I, I, -1 + xvst VX2, X, 0 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvld VX2, Y, 32 + xvst VX2, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + ld.d t1, Y, 0 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + 
xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX0, X, 0 + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 + ld.d t1, X, 0 + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + 
add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S new file mode 100644 index 000000000..736187f93 --- /dev/null +++ b/kernel/loongarch64/swap_lsx.S @@ -0,0 +1,431 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and incy==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, Y, 0 + vld VX3, Y, 16 + addi.d I, I, -1 + vst VX2, X, 0 + vst VX3, X, 16 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vld VX2, Y, 32 + vld VX3, Y, 48 + vst VX2, X, 32 + vst VX3, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: +#ifdef DOUBLE + fld.d $f12, X, 0 + fld.d $f14, Y, 0 + addi.d I, I, -1 + fst.d $f12, Y, 0 + fst.d $f14, X, 0 +#else + fld.s $f12, X, 0 + fld.s $f14, Y, 0 + addi.d I, I, -1 + fst.s $f12, Y, 0 + fst.s $f14, X, 0 +#endif + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 + ld.w t1, Y, 0 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX0, Y, 0, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 
7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE +#else + vld VX2, Y, 0 + ld.w t1, X, 0 + vstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX2, X, 0, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 + vstelm.w VX3, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX3, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX3, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX3, X, 0, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + ST $f12, Y, 0 * SIZE + ST $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 
+ + EPILOGUE
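
Note (editorial, not part of the patch above): the new copy_lsx.S/copy_lasx.S, swap_lsx.S/swap_lasx.S and sum_lsx.S/sum_lasx.S sources fold the former single- and double-precision kernels into one file each, selecting the element width with #ifdef DOUBLE and the LD/ST/ADD/VFADD macros from common.h, while snrm2_lsx.S/snrm2_lasx.S keep widening float inputs to double before accumulating. The C sketch below is a minimal, hypothetical smoke test for the interfaces these kernels sit behind; the file name, build line, tolerance-free printout and the use of cblas_ssum (an OpenBLAS-specific extension, not standard CBLAS) are assumptions made for illustration only. It deliberately uses a length that is not a multiple of 8 together with unit and non-unit strides, so the 8-element vector bodies (srai.d I, N, 3), the scalar tails (andi I, N, 7) and the different INCX/INCY branches are all reached.

/* test_kernels.c -- hypothetical smoke test, not part of this patch.
 * Build against the freshly built library, e.g.:
 *   gcc test_kernels.c -I<openblas>/include -L<openblas>/lib -lopenblas -lm -o test_kernels
 */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    /* n = 11 is not a multiple of 8, so both the vector loop and the
     * scalar tail of the LSX/LASX kernels run; 32 elements leave room
     * for stride-2 accesses (last index used: (11-1)*2 = 20). */
    const int n = 11;
    float x[32], y[32];
    for (int i = 0; i < 32; i++) { x[i] = (float)(i + 1); y[i] = 0.0f; }

    cblas_scopy(n, x, 1, y, 1);   /* copy kernel, INCX==1 and INCY==1 path */
    cblas_scopy(n, x, 1, y, 2);   /* copy kernel, INCX==1 and INCY!=1 path */
    cblas_sswap(n, x, 2, y, 1);   /* swap kernel, INCX!=1 and INCY==1 path */

    /* snrm2 with a non-unit stride exercises the strided loop of
     * snrm2_lsx.S / snrm2_lasx.S (float inputs accumulated in double). */
    float nrm = cblas_snrm2(n, x, 2);

    /* ?sum (sum without absolute values) is an OpenBLAS extension; it is
     * the interface behind the renamed sum_lsx.S / sum_lasx.S kernels.
     * Guard or drop this call when linking against a plain reference CBLAS. */
    float s = cblas_ssum(n, y, 1);

    printf("snrm2 = %f, ssum = %f\n", nrm, s);
    return 0;
}

The same sources service the double-precision entry points (cblas_dcopy, cblas_dswap, cblas_dsum, cblas_dnrm2) through the DOUBLE build of the kernel, so a double variant of the loop above would exercise the #ifdef DOUBLE paths added in this patch.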