diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index c23c2fac5..011e8b89e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -24,12 +24,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S -endif -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S +DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S +endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S new file mode 100644 index 000000000..3315daccb --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S @@ -0,0 +1,1366 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +// By integrating the dgemm and dsolve processes, the following advantages can be obtained: +// 1. Avoid the overhead of function calls (by not invoking dgemm_kernel) +// 2. Reduce the storage and retrieval of C data +// 3. Vectorization of dsolve +// GEMM_UNROLL_M x DGEMM_UNROLL_N is 16x4, which is a fairly large size. +// To achieve finer-grained optimization, 15 scenarios have been addressed: +// 16x4, 16x2, 16x1, 8x4, 8x2, 8x1, 4x4, 4x2, 4x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1. + +.macro dsolve_16 N +// if N = 4 the data layout of C is as follows: +// U0 U1 U2 U3 +// U4 U5 U6 U7 +// U8 U9 U10 U11 +// U12 U13 U14 U15 +// if N = 2 the dat layout of C is as follows: +// U0 U1 U2 U3 +// U4 U5 U6 U7 +// if N = 1 the dat layout of C is as follows: +// U0 U1 U2 U3 +// The matrix A has dimensions of 16x16, and +// it will be divided into 4 segments for processing. 
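+// Each 4-row group below is handled the same way: the corresponding rows of
+// C are transposed so that one LASX register (Gx) holds the four values that
+// share a row index of A, and a back-substitution is then performed against
+// the packed 16x16 A tile. The GLDREPL offsets (204, 220/221, 236..238,
+// 252..255, ...) are linear offsets i * 16 + j into that tile, and the
+// diagonal entries are applied with xvfmul rather than a division, which
+// assumes the TRSM packing routine has already stored their reciprocals.
+// As an illustrative scalar sketch (not the authoritative algorithm) of the
+// first 4-row group, written in C for a single right-hand-side column and
+// assuming row-major indexing of the packed tile:
+//
+//   /* a: packed 16x16 tile, diagonal entries hold reciprocals */
+//   /* x: the 16 C values of one column                        */
+//   for (int i = 15; i >= 12; i--) {
+//       x[i] *= a[i * 16 + i];             /* scale by 1 / a(i,i) */
+//       for (int k = 12; k < i; k++)
+//           x[k] -= x[i] * a[i * 16 + k];  /* eliminate x[i]      */
+//   }
+//
+// The vector code performs this update on four columns at once and, before
+// solving each remaining group, first eliminates the already-computed
+// unknowns (for example x[12..15]) from the rows of that group.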
+ +#define G12 U3 +#define G13 U7 +#define G14 U11 +#define G15 U15 + GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 + // A + // G12 G13 G14 G15 + // ----------------- + // 204 | D9 + // 220 221 | D8 D7 + // 236 237 238 | D6 D5 D4 + // 252 253 254 255 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 252 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 236 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 220 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 204 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G15, G15, D0 + GNMSUB xvf, d, G14, G15, D1, G14 + xvfmul.d G14, G14, D4 + GNMSUB xvf, d, G13, G15, D2, G13, G13, G14, D5, G13 + xvfmul.d G13, G13, D7 + GNMSUB xvf, d, G12, G15, D3, G12, G12, G14, D6, G12, G12, G13, D8, G12 + xvfmul.d G12, G12, D9 + // Store B +.if \N == 4 + // x x x x ... x x x x + // x x x x ... x x x x + // x x x x ... x x x x + // b48 b49 b50 b51 ... b60 b61 b62 b63 + GST xv, , G12, B0, 48 * 8, G13, B0, 52 * 8, G14, B0, 56 * 8, G15, B0, 60 * 8 +.elseif \N == 2 + // x x x x ... x x x x + // x x x x ... x x x x + // x x x x ... x x x x + // b24 b25 b26 b27 b28 b29 b30 b31 + GST v, , $vr3, B0, 24 * 8, $vr7, B0, 26 * 8, $vr11, B0, 28 * 8, $vr15, B0, 30 * 8 +.elseif \N == 1 + // x x x x + // x x x x + // x x x x + // b12 b13 b14 b15 + GST f, d, $f3, B0, 12 * 8, $f7, B0, 13 * 8, $f11, B0, 14 * 8, $f15, B0, 15 * 8 +.endif + // Transpose G15 G14 G13 G12 + GTRANSPOSE4x4_D G12, G13, G14, G15, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + // x x x x ... c12 c13 c14 c15 + // x x x x ... c28 c29 c30 c31 + // x x x x ... c44 c45 c46 c47 + // x x x x ... c60 c61 c62 c63 + GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8, D2, C2, 12 * 8, D3, C3, 12 * 8 +.elseif \N == 2 + // x x x x ... c12 c13 c14 c15 + // x x x x ... c28 c29 c30 c31 + GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8 +.elseif \N == 1 + // Store C + // x x x x ... 
c12 c13 c14 c15 + GST xv, , D0, C0, 12 * 8 +.endif + +#define G8 U2 +#define G9 U6 +#define G10 U10 +#define G11 U14 + GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 + // A + // G8 G9 G10 G11 + // ----------------- + // 136 | D9 + // 152 153 | D8 D7 + // 168 169 170 | D6 D5 D4 + // 184 185 186 187 | D3 D2 D1 D0 + // 200 201 202 203 | D15 D14 D13 D12 + // 216 217 218 219 | D11 D10 D9 D8 + // 232 233 234 235 | D7 D6 D5 D4 + // 248 249 250 251 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 248 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 232 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 216 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 200 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G11, G15, D0, G11, G10, G15, D1, G10, G9, G15, D2, G9, G8, G15, D3, G8, \ + G11, G14, D4, G11, G10, G14, D5, G10, G9, G14, D6, G9, G8, G14, D7, G8, \ + G11, G13, D8, G11, G10, G13, D9, G10, G9, G13, D10, G9, G8, G13, D11, G8, \ + G11, G12, D12, G11, G10, G12, D13, G10, G9, G12, D14, G9, G8, G12, D15, G8 + PTR_ADDI T0, A0, 184 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 168 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 152 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 136 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G11, G11, D0 + GNMSUB xvf, d, G10, G11, D1, G10, G9, G11, D2, G9, G8, G11, D3, G8 + xvfmul.d G10, G10, D4 + GNMSUB xvf, d, G9, G10, D5, G9, G8, G10, D6, G8 + xvfmul.d G9, G9, D7 + GNMSUB xvf, d, G8, G9, D8, G8 + xvfmul.d G8, G8, D9 + // Store B +.if \N == 4 + // x x x x ... x x x x + // x x x x ... x x x x + // b32 b33 b34 b34 ... b44 b45 b46 b47 + // b48 b49 b50 b51 ... b60 b61 b62 b63 + GST xv, , G8, B0, 32 * 8, G9, B0, 36 * 8, G10, B0, 40 * 8, G11, B0, 44 * 8 +.elseif \N == 2 + // x x x x ... x x x x + // x x x x ... x x x x + // b16 b17 b18 b19 b20 b21 b22 b23 + // b24 b25 b26 b27 b28 b29 b30 b31 + GST v, , $vr2, B0, 16 * 8, $vr6, B0, 18 * 8, $vr10, B0, 20 * 8, $vr14, B0, 22 * 8 +.elseif \N == 1 + // x x x x + // x x x x + // b8 b9 b10 b11 + // b12 b13 b14 b15 + GST f, d, $f2, B0, 8 * 8, $f6, B0, 9 * 8, $f10, B0, 10 * 8, $f14, B0, 11 * 8 +.endif + // Transpose G11 G10 G9 G8 + GTRANSPOSE4x4_D G8, G9, G10, G11, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 + // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 + // x x x x ... c40 c41 c42 c43 c44 c45 c46 c47 + // x x x x ... c56 c57 c58 c59 c60 c61 c62 c63 + GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8, D2, C2, 8 * 8, D3, C3, 8 * 8 +.elseif \N == 2 + // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 + // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 + GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8 +.elseif \N == 1 + // x x x x ... 
c8 c9 c10 c11 c12 c13 c14 c15 + GST xv, , D0, C0, 8 * 8 +.endif + +#define G4 U1 +#define G5 U5 +#define G6 U9 +#define G7 U13 + GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, D0, D1 + // A + // G4 G5 G6 G7 + // ------------------ + // 68 | D9 + // 84 85 | D8 D7 + // 100 101 102 | D6 D5 D4 + // 116 117 118 119 | D3 D2 D1 D0 + // 132 133 134 135 | D15 D14 D13 D12 + // 148 149 150 151 | D11 D10 D9 D8 + // 164 165 166 167 | D7 D6 D5 D4 + // 180 181 182 183 | D3 D2 D1 D0 + // 196 197 198 199 | D15 D14 D13 D12 + // 212 213 214 215 | D11 D10 D9 D8 + // 228 229 230 231 | D7 D6 D5 D4 + // 244 245 246 247 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 244 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 228 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 212 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 196 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G7, G15, D0, G7, G6, G15, D1, G6, G5, G15, D2, G5, G4, G15, D3, G4, \ + G7, G14, D4, G7, G6, G14, D5, G6, G5, G14, D6, G5, G4, G14, D7, G4, \ + G7, G13, D8, G7, G6, G13, D9, G6, G5, G13, D10, G5, G4, G13, D11, G4, \ + G7, G12, D12, G7, G6, G12, D13, G6, G5, G12, D14, G5, G4, G12, D15, G4 + PTR_ADDI T0, A0, 180 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 164 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 148 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 132 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G7, G11, D0, G7, G6, G11, D1, G6, G5, G11, D2, G5, G4, G11, D3, G4, \ + G7, G10, D4, G7, G6, G10, D5, G6, G5, G10, D6, G5, G4, G10, D7, G4, \ + G7, G9, D8, G7, G6, G9, D9, G6, G5, G9, D10, G5, G4, G9, D11, G4, \ + G7, G8, D12, G7, G6, G8, D13, G6, G5, G8, D14, G5, G4, G8, D15, G4 + PTR_ADDI T0, A0, 116 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 100 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 84 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 68 * 8 + GLDREPL xv, d, D9, T0, 0 + xvfmul.d G7, G7, D0 + GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 + xvfmul.d G6, G6, D4 + GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 + xvfmul.d G5, G5, D7 + GNMSUB xvf, d, G4, G5, D8, G4 + xvfmul.d G4, G4, D9 + // Store B +.if \N == 4 + // x x x x ... x x x x + // b16 b17 b18 b19 ... b28 b29 b30 b31 + // b32 b33 b34 b34 ... b44 b45 b46 b47 + // b48 b49 b50 b51 ... b60 b61 b62 b63 + GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 +.elseif \N == 2 + // x x x x ... 
x x x x + // b8 b9 b10 b11 b12 b13 b14 b15 + // b16 b17 b18 b19 b20 b21 b22 b23 + // b24 b25 b26 b27 b28 b29 b30 b31 + GST v, , $vr1, B0, 8 * 8, $vr5, B0, 10 * 8, $vr9, B0, 12 * 8, $vr13, B0, 14 * 8 +.elseif \N == 1 + // x x x x + // b4 b5 b6 b7 + // b8 b9 b10 b11 + // b12 b13 b14 b15 + GST f, d, $f1, B0, 4 * 8, $f5, B0, 5 * 8, $f9, B0, 6 * 8, $f13, B0, 7 * 8 +.endif + // Transpose G7 G6 G5 G4 + GTRANSPOSE4x4_D G4, G5, G6, G7, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 + // x x x x c36 c37 c38 c39 c40 c41 c42 c43 c44 c45 c46 c47 + // x x x x c52 c53 c54 c55 c56 c57 c58 c59 c60 c61 c62 c63 + GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8, D2, C2, 4 * 8, D3, C3, 4 * 8 +.elseif \N == 2 + // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 + GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8 +.elseif \N == 1 + // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + GST xv, , D0, C0, 4 * 8 +.endif + +#define G0 U0 +#define G1 U4 +#define G2 U8 +#define G3 U12 + GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, D0, D1 + // A + // G0 G1 G2 G3 + // ------------------ + // 0 | D9 + // 16 17 | D8 D7 + // 32 33 34 | D6 D5 D4 + // 48 49 50 51 | D3 D2 D1 D0 + // 64 65 66 67 | D15 D14 D13 D12 + // 80 81 82 83 | D11 D10 D9 D8 + // 96 97 98 99 | D7 D6 D5 D4 + // 112 113 114 115 | D3 D2 D1 D0 + // 128 129 130 131 | D15 D14 D13 D12 + // 144 145 146 147 | D11 D10 D9 D8 + // 160 161 162 163 | D7 D6 D5 D4 + // 176 177 178 179 | D3 D2 D1 D0 + // 192 193 194 195 | D15 D14 D13 D12 + // 208 209 210 211 | D11 D10 D9 D8 + // 224 225 226 227 | D7 D6 D5 D4 + // 240 241 242 243 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 240 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 224 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 208 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 192 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G3, G15, D0, G3, G2, G15, D1, G2, G1, G15, D2, G1, G0, G15, D3, G0, \ + G3, G14, D4, G3, G2, G14, D5, G2, G1, G14, D6, G1, G0, G14, D7, G0, \ + G3, G13, D8, G3, G2, G13, D9, G2, G1, G13, D10, G1, G0, G13, D11, G0, \ + G3, G12, D12, G3, G2, G12, D13, G2, G1, G12, D14, G1, G0, G12, D15, G0 + PTR_ADDI T0, A0, 176 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 160 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 144 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 128 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G3, G11, D0, G3, G2, G11, D1, G2, G1, G11, D2, G1, G0, G11, D3, G0, \ + G3, G10, D4, G3, G2, G10, D5, G2, G1, G10, D6, G1, G0, G10, D7, G0, \ + G3, G9, D8, G3, G2, G9, D9, G2, G1, G9, D10, G1, G0, G9, D11, G0, \ + G3, G8, D12, G3, G2, G8, D13, G2, G1, G8, D14, G1, G0, G8, D15, G0 + PTR_ADDI T0, A0, 112 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 96 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 80 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 64 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 
* 8 + GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ + G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ + G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ + G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 + PTR_ADDI T0, A0, 48 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 32 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 16 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 0 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G3, G3, D0 + GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 + xvfmul.d G2, G2, D4 + GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 + xvfmul.d G1, G1, D7 + GNMSUB xvf, d, G0, G1, D8, G0 + xvfmul.d G0, G0, D9 + // Store B +.if \N == 4 + // b0 b1 b2 b3 ... b12 b13 b14 b15 + // b16 b17 b18 b19 ... b28 b29 b30 b31 + // b32 b33 b34 b34 ... b44 b45 b46 b47 + // b48 b49 b50 b51 ... b60 b61 b62 b63 + GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 +.elseif \N == 2 + // b0 b1 b2 b3 b4 b5 b6 b7 + // b8 b9 b10 b11 b12 b13 b14 b15 + // b16 b17 b18 b19 b20 b21 b22 b23 + // b24 b25 b26 b27 b28 b29 b30 b31 + GST v, , $vr0, B0, 0, $vr4, B0, 2 * 8, $vr8, B0, 4 * 8, $vr12, B0, 6 * 8 +.elseif \N == 1 + // b0 b1 b2 b3 + // b4 b5 b6 b7 + // b8 b9 b10 b11 + // b12 b13 b14 b15 + GST f, d, $f0, B0, 0, $f4, B0, 1 * 8, $f8, B0, 2 * 8, $f12, B0, 3 * 8 +.endif + // Transpose C3 C2 C1 C0 + GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + // c0 c1 c2 c3 ... c12 c13 c14 c15 + // c16 c17 c18 c19 ... c28 c29 c30 c31 + // c32 c33 c34 c34 ... c44 c45 c46 c47 + // c48 c49 c50 c51 ... c60 c61 c62 c63 + GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 +.elseif \N == 2 + // c0 c1 c2 c3 ... c12 c13 c14 c15 + // c16 c17 c18 c19 ... c28 c29 c30 c31 + GST xv, , D0, C0, 0, D1, C1, 0 +.elseif \N == 1 + // c0 c1 c2 c3 ... c12 c13 c14 c15 + GST xv, , D0, C0, 0 +.endif + +#undef G0 +#undef G1 +#undef G2 +#undef G3 +#undef G4 +#undef G5 +#undef G6 +#undef G7 +#undef G8 +#undef G9 +#undef G10 +#undef G11 +#undef G12 +#undef G13 +#undef G14 +#undef G15 +.endm + +.macro dsolve_8 N +// if N = 4 the data layout of C is as follows: +// U0 U1 +// U2 U3 +// U4 U5 +// U6 U7 +// if N = 2 the dat layout of C is as follows: +// U0 U1 +// U2 U3 +// if N = 1 the dat layout of C is as follows: +// U0 U1 +// The matrix A has dimensions of 8x8, and +// it will be divided into 2 segments for processing. 
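+// The structure mirrors dsolve_16: each 4-row group of C is transposed,
+// back-substituted against the packed 8x8 A tile (linear offsets such as
+// 36, 44/45, 52..54 and 60..63 below, i.e. i * 8 + j), and the results are
+// written back to both B and C.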
+ +#define G4 U1 +#define G5 U3 +#define G6 U5 +#define G7 U7 + // Transpose U7 U5 U3 U1 + GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, D0, D1 + // A + // G4 G5 G6 G7 + // --------------- + // 36 | D9 + // 44 45 | D8 D7 + // 52 53 54 | D6 D5 D4 + // 60 61 62 63 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 60 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 52 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 44 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 36 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G7, G7, D0 + GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 + xvfmul.d G6, G6, D4 + GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 + xvfmul.d G5, G5, D7 + GNMSUB xvf, d, G4, G5, D8, G4 + xvfmul.d G4, G4, D9 + // Store B +.if \N == 4 + GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 +.elseif \N == 2 + GST v, , $vr1, B0, 8 * 8, $vr3, B0, 10 * 8, $vr5, B0, 12 * 8, $vr7, B0, 14 * 8 +.elseif \N == 1 + GST f, d, $f1, B0, 4 * 8, $f3, B0, 5 * 8, $f5, B0, 6 * 8, $f7, B0, 7 * 8 +.endif + // Transpose + GTRANSPOSE4x4_D G4, G5, G6, G7, D4, D5, D6, D7, D8, D9 + // Store C +.if \N == 4 + GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8, D6, C2, 4 * 8, D7, C3, 4 * 8 +.elseif \N == 2 + GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8 +.elseif \N == 1 + GST xv, , D4, C0, 4 * 8 +.endif + +#define G0 U0 +#define G1 U2 +#define G2 U4 +#define G3 U6 + // Transpose U6 U4 U2 U0 + GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, D0, D1 + // A + // G0 G1 G2 G3 + //----------------- + // 0 | D9 + // 8 9 | D8 D7 + // 16 17 18 | D6 D5 D4 + // 24 25 26 27 | D3 D2 D1 D0 + // 32 33 34 35 | D15 D14 D13 D12 + // 40 41 42 43 | D11 D10 D9 D8 + // 48 49 50 51 | D7 D6 D5 D4 + // 56 57 58 59 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 56 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 48 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 40 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 32 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ + G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ + G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ + G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 + PTR_ADDI T0, A0, 24 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 16 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 8 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 0 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G3, G3, D0 + GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 + xvfmul.d G2, G2, D4 + GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 + xvfmul.d G1, G1, D7 + GNMSUB xvf, d, G0, G1, D8, G0 + xvfmul.d G0, G0, D9 + // Store B +.if \N == 4 + GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 +.elseif \N == 2 + GST v, , $vr0, B0, 0, $vr2, B0, 2 * 8, $vr4, B0, 4 * 8, $vr6, B0, 6 * 8 +.elseif \N == 1 + GST f, d, $f0, B0, 0, $f2, B0, 1 * 8, $f4, B0, 2 * 8, $f6, B0, 3 * 8 +.endif + // Transpose + GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 +.elseif \N == 2 + GST xv, , D0, C0, 0, D1, C1, 0 +.elseif \N == 1 + GST xv, , D0, C0, 0 +.endif + 
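+    // Both 4-row halves of the 8x8 tile are now solved and stored back to B
+    // and C; the Gx aliases are released below so other tile sizes can reuse
+    // the register names.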
+#undef G0 +#undef G1 +#undef G2 +#undef G3 +#undef G4 +#undef G5 +#undef G6 +#undef G7 +.endm + +.macro dsolve_4 N +// if N = 4 the data layout of C is as follows: +// U0 +// U1 +// U2 +// U3 +// if N = 2 the dat layout of C is as follows: +// U0 +// U1 +// if N = 1 the dat layout of C is as follows: +// U0 +// The matrix A has dimensions of 4x4, and +// it will be divided into 1 segments for processing. + +#define G0 U0 +#define G1 U1 +#define G2 U2 +#define G3 U3 + // Transpose U3 U2 U1 U0 + GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, D0, D1 + // A + // G0 G1 G2 G3 + //------------- + // 0 | D9 + // 4 5 | D8 D7 + // 8 9 10 | D6 D5 D4 + // 12 13 14 15 | D3 D2 D1 D0 + GLDREPL xv, d, D3, A0, 12 * 8, D2, A0, 13 * 8, D1, A0, 14 * 8, D0, A0, 15 * 8, \ + D6, A0, 8 * 8, D5, A0, 9 * 8, D4, A0, 10 * 8, \ + D8, A0, 4 * 8, D7, A0, 5 * 8, \ + D9, A0, 0 * 8 + xvfmul.d G3, G3, D0 + GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 + xvfmul.d G2, G2, D4 + GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 + xvfmul.d G1, G1, D7 + GNMSUB xvf, d, G0, G1, D8, G0 + xvfmul.d G0, G0, D9 + // Store B +.if \N == 4 + GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 +.elseif \N == 2 + GST v, , $vr0, B0, 0, $vr1, B0, 2 * 8, $vr2, B0, 4 * 8, $vr3, B0, 6 * 8 +.elseif \N == 1 + GST f, d, $f0, B0, 0, $f1, B0, 1 * 8, $f2, B0, 2 * 8, $f3, B0, 3 * 8 +.endif + // Transpose + GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 +.elseif \N == 2 + GST xv, , D0, C0, 0, D1, C1, 0 +.elseif \N == 1 + GST xv, , D0, C0, 0 +.endif + +#undef G0 +#undef G1 +#undef G2 +#undef G3 +.endm + +.macro dsolve_2 N +#define G0 U2 +#define G1 U3 + // Transpose + GSBUTTERFLY xv, d, G0, G1, U1, U0 + // A + // G0 G1 + // ------ + // 0 | D2 + // 2 3 | D1 D0 + GLDREPL xv, d, D2, A0, 0, D1, A0, 2 * 8, D0, A0, 3 * 8 + xvfmul.d G1, G1, D0 + GNMSUB xvf, d, G0, G1, D1, G0 + xvfmul.d G0, G0, D2 + // Store B +.if \N == 4 + GST xv, , G0, B0, 0, G1, B0, 4 * 8 +.elseif \N == 2 + GST v, , $vr2, B0, 0, $vr3, B0, 2 * 8 +.elseif \N == 1 + GST f, d, $f2, B0, 0, $f3, B0, 8 +.endif + // Transpose + GSBUTTERFLY xv, d, D0, D1, G1, G0 + // Store C +.if \N == 4 + vst $vr16, C0, 0x00 + vst $vr17, C1, 0x00 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D1, C3, 0x00, 0x02 + xvstelm.d D0, C2, 0x08, 0x03 + xvstelm.d D1, C3, 0x08, 0x03 +.elseif \N == 2 + GST v, , $vr16, C0, 0, $vr17, C1, 0 +.elseif \N == 1 + GST v, , $vr16, C0, 0 +.endif + +#undef G0 +#undef G1 +.endm + +.macro dgemm_dsolve_16x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + PTR_ADDI A0, T1, -(16 * 8 * 8) + PTR_ADDI A0, A0, -(16 * 8 * 8) + PTR_ADDI B0, T2, -(16 * 4 * 8) + dsolve_16 4 +.endm + +.macro dgemm_dsolve_1x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d U0, U2, 0x02 + xvinsve0.d U0, U3, 0x03 +.L_dsolve_1x4: + or A0, T1, T1 + or B0, T2, T2 + GLDREPL xv, d, D0, A0, -1 
* 8 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + xvstelm.d U0, C1, 0x00, 0x01 + xvstelm.d U0, C2, 0x00, 0x02 + xvstelm.d U0, C3, 0x00, 0x03 + // Store B + xvst U0, B0, -32 +.endm + +.macro dgemm_dsolve_2x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U1, U3, 0x02 +/********************** solver ******************/ +.L_dsolve_2x4: + PTR_ADDI A0, T1, -(2 * 2 * 8) + PTR_ADDI B0, T2, -(2 * 4 * 8) + dsolve_2 4 +.endm + +.macro dgemm_dsolve_4x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + PTR_ADDI A0, T1, -(4 * 4 * 8) + PTR_ADDI B0, T2, -(4 * 4 * 8) + + dsolve_4 4 +.endm + +.macro dgemm_dsolve_8x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + PTR_ADDI A0, T1, -(8 * 8 * 8) + PTR_ADDI B0, T2, -(8 * 4 * 8) + + dsolve_8 4 +.endm + +.macro dgemm_dsolve_4x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + PTR_ADDI A0, T1, -(4 * 4 * 8) + PTR_ADDI B0, T2, -(4 * 2 * 8) + + dsolve_4 2 +.endm + +.macro dgemm_dsolve_2x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + PTR_ADDI A0, T1, -(2 * 2 * 8) + PTR_ADDI B0, T2, -(2 * 2 * 8) + + dsolve_2 2 +.endm + +.macro dgemm_dsolve_8x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + PTR_ADDI A0, T1, -(8 * 8 * 8) + PTR_ADDI B0, T2, -(8 * 2 * 8) + + dsolve_8 2 +.endm + +.macro dgemm_dsolve_16x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + PTR_ADDI A0, T1, -(16 * 8 * 8) + PTR_ADDI A0, A0, -(16 * 8 * 8) + PTR_ADDI B0, T2, -(16 * 2 * 8) + + dsolve_16 2 +.endm + +.macro dgemm_dsolve_2x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + PTR_ADDI A0, T1, -(2 * 2 * 8) + PTR_ADDI B0, T2, -(2 * 1 * 8) + + dsolve_2 1 +.endm + +.macro dgemm_dsolve_4x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + 
xvld U0, C0, 0x00 +.L_dsolve_4x1: + PTR_ADDI A0, T1, -(4 * 4 * 8) + PTR_ADDI B0, T2, -(4 * 1 * 8) + + dsolve_4 1 +.endm + +.macro dgemm_dsolve_8x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + PTR_ADDI A0, T1, -(8 * 8 * 8) + PTR_ADDI B0, T2, -(8 * 1 * 8) + + dsolve_8 1 +.endm + +.macro dgemm_dsolve_16x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + PTR_ADDI A0, T1, -(16 * 8 * 8) + PTR_ADDI A0, A0, -(16 * 8 * 8) + PTR_ADDI B0, T2, -(16 * 1 * 8) + + dsolve_16 1 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + /* if (!(N >> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + PTR_ADD KK, M, OFFSET + + andi I, M, 15 + beq ZERO, I, .L_M16 + andi I, M, 1 + beqz I, .L_M2 +.L_M1: + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_1x4 + PTR_ADDI KK, KK, -1 +.L_M2: + andi I, M, 2 + beqz I, .L_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_2x4 + PTR_ADDI KK, KK, -2 +.L_M4: + andi I, M, 4 + beqz I, .L_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_4x4 + PTR_ADDI KK, KK, -4 +.L_M8: + andi I, M, 8 + beqz I, .L_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_8x4 + PTR_ADDI KK, KK, -8 +.L_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_I1: + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_I1 +.L_M0: + PTR_SLLI T0, K, 3 + 
PTR_ALSL B, T0, B, 2 // b += 4 * k; + PTR_ALSL C, LDC, C, 2 // c += 4 * ldc + blt ZERO, J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + + PTR_ADD KK, M, OFFSET + andi I, M, 15 + beq ZERO, I, .L_N3_M16 + andi I, M, 1 + beqz I, .L_N3_M2 +.L_N3_M1: + PTR_ADDI KK, KK, -1 + + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + GADD , d, C0, CC, ZERO, C1, C0, LDC + // dgemm_dsolve_1x2 + GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0 + GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0 + GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8 +.L_N3_M2: + andi I, M, 2 + beqz I, .L_N3_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_2x2 + PTR_ADDI KK, KK, -2 +.L_N3_M4: + andi I, M, 4 + beqz I, .L_N3_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_4x2 + PTR_ADDI KK, KK, -4 +.L_N3_M8: + andi I, M, 8 + beqz I, .L_N3_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_8x2 + PTR_ADDI KK, KK, -8 +.L_N3_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_N3_I1: + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_N3_I1 +.L_N3_M0: + PTR_SLLI T0, K, 3 + PTR_ALSL B, T0, B, 1 // b += 2 * k; + PTR_ALSL C, LDC, C, 1 // c += 2 * ldc +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + + PTR_ADD KK, M, OFFSET + andi I, M, 15 + beq ZERO, I, .L_N1_M16 + andi I, M, 1 + beqz I, .L_N1_M2 +.L_N1_M1: + PTR_ADDI KK, KK, -1 + + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + GADD , d, C0, CC, ZERO + // dgemm_dsolve_1x1 + GLD f, d, $f0, A0, 0, $f1, C0, 0 + GMUL f, d, $f1, $f1, $f0 + GST f, d, $f1, C0, 0, $f1, B0, 0 +.L_N1_M2: + andi I, M, 2 + beqz I, .L_N1_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + 
PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_2x1 + PTR_ADDI KK, KK, -2 +.L_N1_M4: + andi I, M, 4 + beqz I, .L_N1_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_4x1 + PTR_ADDI KK, KK, -4 +.L_N1_M8: + andi I, M, 8 + beqz I, .L_N1_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_8x1 + PTR_ADDI KK, KK, -8 +.L_N1_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_N1_I1: + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_N1_I1 +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S new file mode 100644 index 000000000..0e2cacccf --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S @@ -0,0 +1,959 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ + +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB B0 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define G0 D0 +#define G1 D1 +#define G2 D2 +#define G3 D3 +#define G4 D4 +#define G5 D5 +#define G6 D6 +#define G7 D7 +#define G8 D8 +#define G9 D9 +#define G10 D10 +#define G11 D11 +#define G12 D12 +#define G13 D13 +#define G14 D14 +#define G15 D15 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, A0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm +.macro nmsub_macro start0, end0, start1, reg +// Gx -= reg * Ux +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm +.macro B_st_macro start, end, stride, N +// Store Gx(x = 16...31) +.if \start <= \end +.if \N == 4 + xvst $xr\start, B0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, B0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, B0, \stride * 0x08 +.endif + B_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16 N +// The data layout of C (4x16) is as follows (store 4 data in each register): +// U0 U1 U2 U3 +// U4 U5 U6 U7 +// U8 U9 U10 U11 +// U12 U13 U14 U15 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U3, U7, U11, 
U15, G12, G13, G14, G15, D0, D1 + GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 + GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7 + GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7 +// Now we have the following memory layout of C: +// 0 1 2 3 ... 15 +// 0 | | | | | | | +// 1 | G0 | G1 | G2 | G3 | ... | G15 | +// 2 | | | | | | | +// 3 | | | | | | | +// Next we are going to process matrix A with a size of 16x16, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows, quite large. +//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// 51 52 53 54 55 56 57 58 59 60 61 62 63 +// 68 69 70 71 72 73 74 75 76 77 78 79 +// 85 86 87 88 89 90 91 92 93 94 95 +// 102 103 104 105 106 107 108 109 110 111 +// 119 120 121 122 123 124 125 126 127 +// 136 137 138 139 140 141 142 143 +// 153 154 155 156 157 158 159 +// 170 171 172 173 174 175 +// 187 188 189 190 191 +// 204 205 206 207 +// 221 222 223 +// 238 239 +// 255 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 15, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 31, 1, G0 + PTR_ADDI A0, A0, 17 * 8 +// Load 1 + ldrepl_macro 1, 15, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 31, 2, G1 + PTR_ADDI A0, A0, 17 * 8 +// Load 2 + ldrepl_macro 2, 15, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 31, 3, G2 + PTR_ADDI A0, A0, 17 * 8 +// Load 3 + ldrepl_macro 3, 15, 0 + GMUL xvf, d, G3, G3, U3 + nmsub_macro 20, 31, 4, G3 + PTR_ADDI A0, A0, 17 * 8 +// Load 4 + ldrepl_macro 4, 15, 0 + GMUL xvf, d, G4, G4, U4 + nmsub_macro 21, 31, 5, G4 + PTR_ADDI A0, A0, 17 * 8 +// Load 5 + ldrepl_macro 5, 15, 0 + GMUL xvf, d, G5, G5, U5 + nmsub_macro 22, 31, 6, G5 + PTR_ADDI A0, A0, 17 * 8 +// Load 6 + ldrepl_macro 6, 15, 0 + GMUL xvf, d, G6, G6, U6 + nmsub_macro 23, 31, 7, G6 + PTR_ADDI A0, A0, 17 * 8 +// Load 7 + ldrepl_macro 7, 15, 0 + GMUL xvf, d, G7, G7, U7 + nmsub_macro 24, 31, 8, G7 + PTR_ADDI A0, A0, 17 * 8 +// Load 8 + ldrepl_macro 8, 15, 0 + GMUL xvf, d, G8, G8, U8 + nmsub_macro 25, 31, 9, G8 + PTR_ADDI A0, A0, 17 * 8 +// Load 9 + ldrepl_macro 9, 15, 0 + GMUL xvf, d, G9, G9, U9 + nmsub_macro 26, 31, 10, G9 + PTR_ADDI A0, A0, 17 * 8 +// Load 10 + ldrepl_macro 10, 15, 0 + GMUL xvf, d, G10, G10, U10 + nmsub_macro 27, 31, 11, G10 + PTR_ADDI A0, A0, 17 * 8 +// Load 11 + ldrepl_macro 11, 15, 0 + GMUL xvf, d, G11, G11, U11 + nmsub_macro 28, 31, 12, G11 + PTR_ADDI A0, A0, 17 * 8 +// Load 12 + ldrepl_macro 12, 15, 0 + GMUL xvf, d, G12, G12, U12 + nmsub_macro 29, 31, 13, G12 + PTR_ADDI A0, A0, 17 * 8 +// Load 13 + ldrepl_macro 13, 15, 0 + GMUL xvf, d, G13, G13, U13 + nmsub_macro 30, 31, 14, G13 + PTR_ADDI A0, A0, 17 * 8 +// Load 14 + ldrepl_macro 14, 15, 0 + GMUL xvf, d, G14, G14, U14 + nmsub_macro 31, 31, 15, G14 + PTR_ADDI A0, A0, 17 * 8 +// Load 15 + ldrepl_macro 15, 15, 0 + GMUL xvf, d, G15, G15, U15 +// Finally, We can store the result. 
+// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 31, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 + GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 + GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 + GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ + G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60, \ + G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \ + G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ + G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60 +.elseif \N == 1 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60 +.endif +.endm + +.macro dgemm_dsolve_16x4 + bge ZERO, KK, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + dsolve_16 4 +.endm + +.macro dsolve_8 N +// The data layout of C (4x8) is as follows (store 4 data in each register): +// U0 U1 +// U2 U3 +// U4 U5 +// U6 U7 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9 + GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9 +// Now we have the following memory layout of C: +// 0 1 2 3 ... 7 +// 0 | | | | | | | +// 1 | G0 | G1 | G2 | G3 | ... | G7 | +// 2 | | | | | | | +// 3 | | | | | | | +// Next we are going to process matrix A with a size of 8x8, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 2 3 4 5 6 7 +// 9 10 11 12 13 14 15 +// 18 19 20 21 22 23 +// 27 28 29 30 31 +// 36 37 38 39 +// 45 46 47 +// 54 55 +// 63 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 7, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 23, 1, G0 + PTR_ADDI A0, A0, 9 * 8 +// Load 1 + ldrepl_macro 1, 7, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 23, 2, G1 + PTR_ADDI A0, A0, 9 * 8 +// Load 2 + ldrepl_macro 2, 7, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 23, 3, G2 + PTR_ADDI A0, A0, 9 * 8 +// Load 3 + ldrepl_macro 3, 7, 0 + GMUL xvf, d, G3, G3, U3 + nmsub_macro 20, 23, 4, G3 + PTR_ADDI A0, A0, 9 * 8 +// Load 4 + ldrepl_macro 4, 7, 0 + GMUL xvf, d, G4, G4, U4 + nmsub_macro 21, 23, 5, G4 + PTR_ADDI A0, A0, 9 * 8 +// Load 5 + ldrepl_macro 5, 7, 0 + GMUL xvf, d, G5, G5, U5 + nmsub_macro 22, 23, 6, G5 + PTR_ADDI A0, A0, 9 * 8 +// Load 6 + ldrepl_macro 6, 7, 0 + GMUL xvf, d, G6, G6, U6 + nmsub_macro 23, 23, 7, G6 + PTR_ADDI A0, A0, 9 * 8 +// Load 7 + ldrepl_macro 7, 7, 0 + GMUL xvf, d, G7, G7, U7 +// Finally, We can store the result. 
+// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 23, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 + GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ + G1, C1, 0x00, G5, C1, 0x20, \ + G2, C2, 0x00, G6, C2, 0x20, \ + G3, C3, 0x00, G7, C3, 0x20 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ + G1, C1, 0x00, G5, C1, 0x20 +.elseif \N == 1 + GST xv, , G0, C0, 0x00, G4, C0, 0x20 +.endif +.endm + +.macro dgemm_dsolve_8x4 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + dsolve_8 4 +.endm + +.macro dsolve_4 N +// The data layout of C (4x4) is as follows (store 4 data in each register): +// U0 +// U1 +// U2 +// U3 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5 +// Now we have the following memory layout of C: +// 0 1 2 3 +// 0 | | | | | +// 1 | G0 | G1 | G2 | G3 | +// 2 | | | | | +// 3 | | | | | +// Next we are going to process matrix A with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 3, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 19, 1, G0 + PTR_ADDI A0, A0, 5 * 8 +// Load 1 + ldrepl_macro 1, 3, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 19, 2, G1 + PTR_ADDI A0, A0, 5 * 8 +// Load 2 + ldrepl_macro 2, 3, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 19, 3, G2 + PTR_ADDI A0, A0, 5 * 8 +// Load 3 + ldrepl_macro 3, 3, 0 + GMUL xvf, d, G3, G3, U3 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 19, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G1, C1, 0x00 +.elseif \N == 1 + GST xv, , G0, C0, 0x00 +.endif +.endm + +.macro dgemm_dsolve_4x4 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + dsolve_4 4 +.endm + +.macro dsolve_2 N +// Transpose + GSBUTTERFLY xv, d, G0, G1, U1, U0 +// Now we have the following memory layout of C: +// 0 1 +// 0 | | | +// 1 | G0 | G1 | +// 2 | | | +// 3 | | | +// Next we are going to process matrix A with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 +// 3 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 1, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 17, 1, G0 + PTR_ADDI A0, A0, 3 * 8 +// Load 1 + ldrepl_macro 1, 1, 0 + GMUL xvf, d, G1, G1, U1 +// Finally, We can store the result. 
+// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 17, 0, \N + GSBUTTERFLY xv, d, U0, U1, G1, G0 +.if \N == 4 + vst $vr0, C0, 0x00 + vst $vr1, C1, 0x00 + xvstelm.d U0, C2, 0x00, 0x02 + xvstelm.d U1, C3, 0x00, 0x02 + xvstelm.d U0, C2, 0x08, 0x03 + xvstelm.d U1, C3, 0x08, 0x03 +.elseif \N == 2 + vst $vr0, C0, 0x00 + vst $vr1, C1, 0x00 +.elseif \N == 1 + vst $vr0, C0, 0x00 +.endif +.endm + +.macro dgemm_dsolve_2x4 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U1, U3, 0x02 +/********************** solver ******************/ +.L_dsolve_2x4: + dsolve_2 4 +.endm + +.macro dgemm_dsolve_1x4 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d U0, U2, 0x02 + xvinsve0.d U0, U3, 0x03 +.L_dsolve_1x4: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + xvstelm.d U0, C1, 0x00, 0x01 + xvstelm.d U0, C2, 0x00, 0x02 + xvstelm.d U0, C3, 0x00, 0x03 + // Store B + xvst U0, B0, 0x00 +.endm + +.macro dgemm_dsolve_16x2 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + dsolve_16 2 +.endm + +.macro dgemm_dsolve_8x2 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + dsolve_8 2 +.endm + +.macro dgemm_dsolve_4x2 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + dsolve_4 2 +.endm + +.macro dgemm_dsolve_1x2 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + xvinsve0.d U0, U1, 0x01 +.L_dsolve_1x2: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + xvstelm.d U0, C1, 0x00, 0x01 + // Store B + vst $vr0, B0, 0x00 +.endm + +.macro dgemm_dsolve_2x2 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + dsolve_2 2 +.endm + +.macro dgemm_dsolve_16x1 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + dsolve_16 1 +.endm + +.macro dgemm_dsolve_8x1 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + dsolve_8 1 +.endm + +.macro dgemm_dsolve_4x1 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + dsolve_4 1 +.endm + +.macro dgemm_dsolve_2x1 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ 
+ xvld U0, C0, 0x00 +.L_dsolve_2x1: + dsolve_2 1 +.endm + +.macro dgemm_dsolve_1x1 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + // Store B + xvstelm.d U0, B0, 0x00, 0x00 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + /* if (!(N >> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 2 + PTR_ADD B, B, T0 // b += 4 * k + PTR_ADD C, C, T1 // c += 4 * ldc + bnez J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 +.L_N2: + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k 
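+// Tail of the N == 2 column block: any remaining single row (M & 1) is
+// handled with a 1x2 solve before B and C are advanced past this block.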
+.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 1 + PTR_ADD B, B, T0 // b += 2 * k + PTR_ADD C, C, T1 // c += 2 * ldc +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S new file mode 100644 index 000000000..421339736 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S @@ -0,0 +1,882 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/09/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ + +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB B0 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define G0 D0 +#define G1 D1 +#define G2 D2 +#define G3 D3 +#define G4 D4 +#define G5 D5 +#define G6 D6 +#define G7 D7 +#define G8 D8 +#define G9 D9 +#define G10 D10 +#define G11 D11 +#define G12 D12 +#define G13 D13 +#define G14 D14 +#define G15 D15 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, B0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm + +.macro nmsub_macro start0, end0, start1, reg +// Ux -= reg * Dx +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm + +.macro A_st_macro start, end, stride, N +// Store Ux(x = 0...15) +.if \start <= \end +.if \N == 4 + xvst $xr\start, A0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, A0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, A0, \stride * 0x08 +.endif + A_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. 
The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + ldrepl_macro 20, 22, 5 + nmsub_macro 4, 7, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 + ldrepl_macro 25, 25, 15 + nmsub_macro 8, 11, 0, D2 + nmsub_macro 8, 11, 4, D5 + GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 + nmsub_macro 12, 15, 0, D3 + nmsub_macro 12, 15, 4, D6 + nmsub_macro 12, 15, 8, D8 + GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 +// Store A + A_st_macro 0, 15, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ + U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ + U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +.endm + +.macro dsolve_16x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + ldrepl_macro 18, 18, 3 + nmsub_macro 4, 7, 0, D1 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 +.endm + +.macro dsolve_8x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + ldrepl_macro 20, 22, 5 + nmsub_macro 2, 3, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U2, D4, U2, U3, D4, U3 + ldrepl_macro 25, 25, 15 + nmsub_macro 4, 5, 0, D2 + nmsub_macro 4, 5, 2, D5 + GMUL xvf, d, U4, D7, U4, U5, D7, U5 + nmsub_macro 6, 7, 0, D3 + nmsub_macro 6, 7, 2, D6 + nmsub_macro 6, 7, 4, D8 + GMUL xvf, d, U6, D9, U6, U7, D9, U7 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20, \ + U4, C2, 0x00, U5, C2, 0x20, \ + U6, C3, 0x00, U7, C3, 0x20 +.endm + +.macro dsolve_8x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + ldrepl_macro 18, 18, 3 + nmsub_macro 2, 3, 0, D1 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20 +.endm + +.macro dsolve_4x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. 
The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 +.endm + +.macro dsolve_4x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00 +.endm + +.macro dsolve_2x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, +.endm + +.macro dsolve_2x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 +.endm + +.macro dsolve_1x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, +.endm + +.macro dsolve_1x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. 
The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 +.endm + +.macro dgemm_dsolve_16x4 + bge ZERO, L, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + dsolve_16x4 +.endm + +.macro dgemm_dsolve_8x4 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + dsolve_8x4 +.endm + +.macro dgemm_dsolve_4x4 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + dsolve_4x4 +.endm + +.macro dgemm_dsolve_2x4 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/********************** solver ******************/ +.L_dsolve_2x4: + dsolve_2x4 +.endm + +.macro dgemm_dsolve_1x4 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + xvpackod.d U1, U0, U0 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 +.L_dsolve_1x4: + dsolve_1x4 +.endm + +.macro dgemm_dsolve_16x2 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + dsolve_16x2 +.endm + +.macro dgemm_dsolve_8x2 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + dsolve_8x2 +.endm + +.macro dgemm_dsolve_4x2 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + dsolve_4x2 +.endm + +.macro dgemm_dsolve_2x2 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + dsolve_2x2 +.endm + +.macro dgemm_dsolve_1x2 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + xvpackod.d U1, U0, U0 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 +.L_dsolve_1x2: + 
dsolve_1x2 +.endm + +.macro dgemm_dsolve_16x1 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + // Store A + A_st_macro 0, 3, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 +.endm + +.macro dgemm_dsolve_8x1 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + // Store A + A_st_macro 0, 1, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20 +.endm + +.macro dgemm_dsolve_4x1 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00 +.endm + +.macro dgemm_dsolve_2x1 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 2 + // Strore C + GST v, , $vr0, C0, 0x00 +.endm + +.macro dgemm_dsolve_1x1 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 1 + // Strore C + GST f, d, $f0, C0, 0x00 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + PTR_SUB KK, ZERO, OFFSET + /* if (!(N >> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 2 + PTR_ADD B, B, T0 // b += 4 * k + PTR_ADD C, C, T1 // c += 4 * ldc + PTR_ADDI KK, KK, 4 // kk += 4 + bnez J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 +.L_N2: + move AA, A + move CC, C 
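+    // Walk the M dimension in 16-wide tiles, then 8/4/2/1 tiles for the remainder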
+ PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 1 + PTR_ADD B, B, T0 // b += 2 * k + PTR_ADD C, C, T1 // c += 2 * ldc + PTR_ADDI KK, KK, 2 // kk += 2 +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S new file mode 100644 index 000000000..5f86d75b5 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S @@ -0,0 +1,953 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/09/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB $r31 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, B0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm + +.macro nmsub_macro start0, end0, start1, reg +// Ux -= reg * Dx +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm + +.macro A_st_macro start, end, stride, N +// Store Ux(x = 0...15) +.if \start <= \end +.if \N == 4 + xvst $xr\start, A0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, 
A0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, A0, \stride * 0x08 +.endif + A_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 + nmsub_macro 0, 3, 4, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 +.endm + +.macro dsolve_8x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 + nmsub_macro 0, 1, 2, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20 +.endm + +.macro dsolve_4x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00 +.endm + +.macro dsolve_2x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 +.endm + +.macro dsolve_1x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 +.endm + +.macro dsolve_16x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. 
The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 + ldrepl_macro 19, 21, 8 + nmsub_macro 8, 11, 12, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 + ldrepl_macro 16, 16, 0 + nmsub_macro 4, 7, 12, D7 + nmsub_macro 4, 7, 8, D4 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 + nmsub_macro 0, 3, 12, D6 + nmsub_macro 0, 3, 8, D3 + nmsub_macro 0, 3, 4, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 +// Store A + A_st_macro 0, 15, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ + U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ + U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +.endm + +.macro dsolve_8x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U6, D9, U6, U7, D9, U7 + ldrepl_macro 19, 21, 8 + nmsub_macro 4, 5, 6, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U4, D5, U4, U5, D5, U5 + ldrepl_macro 16, 16, 0 + nmsub_macro 2, 3, 6, D7 + nmsub_macro 2, 3, 4, D4 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 + nmsub_macro 0, 1, 6, D6 + nmsub_macro 0, 1, 4, D3 + nmsub_macro 0, 1, 2, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20, \ + U4, C2, 0x00, U5, C2, 0x20, \ + U6, C3, 0x00, U7, C3, 0x20 +.endm + +.macro dsolve_4x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 +.endm + +.macro dsolve_2x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 +.endm + +.macro dsolve_1x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. 
The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, +.endm + +.macro dgemm_dsolve_16x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + PTR_ADDI A0, T1, -16 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + // Store A + A_st_macro 0, 3, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 +.endm + +.macro dgemm_dsolve_8x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + PTR_ADDI A0, T1, -8 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + // Store A + A_st_macro 0, 1, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20 +.endm + +.macro dgemm_dsolve_4x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + PTR_ADDI A0, T1, -4 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00 +.endm + +.macro dgemm_dsolve_2x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + PTR_ADDI A0, T1, -2 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 2 + // Strore C + GST v, , $vr0, C0, 0x00 +.endm + +.macro dgemm_dsolve_1x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + PTR_ADDI A0, T1, -1 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 1 + // Strore C + GST f, d, $f0, C0, 0x00 +.endm + +.macro dgemm_dsolve_16x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + PTR_ADDI A0, T1, -(16 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_16x2 +.endm + +.macro dgemm_dsolve_8x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + PTR_ADDI A0, T1, -(8 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_8x2 +.endm + +.macro dgemm_dsolve_4x2 + 
or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + PTR_ADDI A0, T1, -(4 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_4x2 +.endm + +.macro dgemm_dsolve_2x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + PTR_ADDI A0, T1, -(2 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_2x2 +.endm + +.macro dgemm_dsolve_1x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + xvpackod.d U1, U0, U0 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 +.L_dsolve_1x2: + PTR_ADDI A0, T1, -(1 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_1x2 +.endm + +.macro dgemm_dsolve_16x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + PTR_ADDI A0, T1, -(16 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_16x4 +.endm + +.macro dgemm_dsolve_8x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + PTR_ADDI A0, T1, -(8 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_8x4 +.endm + +.macro dgemm_dsolve_4x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + PTR_ADDI A0, T1, -(4 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_4x4 +.endm + +.macro dgemm_dsolve_2x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/********************** solver ******************/ +.L_dsolve_2x4: + PTR_ADDI A0, T1, -(2 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_2x4 +.endm + +.macro dgemm_dsolve_1x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + xvpackod.d U1, U0, U0 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 +.L_dsolve_1x4: + PTR_ADDI A0, T1, -(1 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_1x4 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + PTR_SUB KK, N, OFFSET + PTR_MUL T0, N, LDC + PTR_MUL T1, N, K + PTR_ADD C, C, T0 // c += n * ldc + PTR_SLLI T1, T1, 3 + PTR_ADD B, B, 
T1 + + andi J, N, 1 + beqz J, .L_N2 +.L_N1: + move AA, A + PTR_SUB C, C, LDC // c -= ldc + PTR_SLLI T0, K, 3 + PTR_SLLI T1, KK, 3 + PTR_SUB B, B, T0 // b -= k + PTR_ADD BB, B, T1 // bb = b + kk + move CC, C + + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: + PTR_ADDI KK, KK, -1 +.L_N2: + andi J, N, 2 + beq ZERO, J, .L_N4 + move AA, A + PTR_SLLI T0, LDC, 1 + PTR_SLLI T1, K, 4 + PTR_SLLI T2, KK, 4 + PTR_SUB B, B, T1 + PTR_SUB C, C, T0 + PTR_ADD BB, B, T2 + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + 
PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_ADDI KK, KK, -2 +.L_N4: + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + beq ZERO, J, .L_N0 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move AA, A + PTR_SLLI T0, LDC, 2 + PTR_SLLI T1, K, 5 + PTR_SLLI T2, KK, 5 + PTR_SUB B, B, T1 + PTR_SUB C, C, T0 + PTR_ADD BB, B, T2 + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_ADDI KK, KK, -4 + bnez J, .L_J1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_macro.S b/kernel/loongarch64/dtrsm_kernel_macro.S new file mode 100644 index 000000000..88b7121d1 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_macro.S @@ -0,0 +1,2147 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/************** Dgemm Kernel 16x4 ****************/ +.macro KERNEL2x16x4 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x16x4_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + xvfmadd.d D14, U2, U7, D14 + 
xvfmadd.d D15, U3, U7, D15 +.endm + +.macro KERNEL8x16x4 +.rept 4 + KERNEL2x16x4 +.endr +.endm + +.macro KERNEL8x16x4_END +.rept 3 + KERNEL2x16x4 +.endr + KERNEL2x16x4_END +.endm + +.macro KERNEL2x8x4 + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D12, U0, U7, D12 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvfmadd.d D0, 
U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvldrepl.d U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvld U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x1x4_END + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x16x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x16x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x16x2 +.rept 4 + KERNEL2x16x2 +.endr +.endm + +.macro KERNEL8x16x2_END +.rept 3 + KERNEL2x16x2 +.endr + KERNEL2x16x2_END +.endm + +.macro KERNEL2x8x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 
+.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 + .endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x2x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x1x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x16x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x16x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x16x1 +.rept 4 + KERNEL2x16x1 +.endr +.endm + +.macro KERNEL8x16x1_END +.rept 3 + 
KERNEL2x16x1 +.endr + KERNEL2x16x1_END +.endm + +.macro KERNEL2x8x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvld U9, A0, 0x20 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x8x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x4x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + +.macro dgemm_16x4 +.L_dgemm_16x4: // See dgemm_kernel_16x4.S + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + xvfmul.d D14, U2, U7 + xvfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_dgemm_16x4_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + 
xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_16x4_TL1_END +.align 5 +.L_dgemm_16x4_TL1: + KERNEL8x16x4 + PTR_ADDI TL, TL, -1 + blt ZERO, TL, .L_dgemm_16x4_TL1 +.L_dgemm_16x4_TL1_END: + KERNEL8x16x4_END +.L_dgemm_16x4_L7: + andi TL, L, 7 + beq TL, ZERO, .L_dgemm_16x4_L0 +.align 5 +.L_dgemm_16x4_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 + + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x4_L71 +.L_dgemm_16x4_L0: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ + U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7, \ + U8, U8, D8, U9, U9, D9, U10, U10, D10, U11, U11, D11, \ + U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15 +.endm + +.macro dgemm_1x4 +.L_dgemm_1x4: // See dgemm_kernel_16x4.S + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_dgemm_1x4_M1_L7 + xvldrepl.d U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + xvld U12, B0, 0x00 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_1x4_M1_TL1_END +.align 5 +.L_dgemm_1x4_M1_TL1: + KERNEL8x1x4 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x4_M1_TL1 +.L_dgemm_1x4_M1_TL1_END: + KERNEL8x1x4_END +.L_dgemm_1x4_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_1x4_M1_L0 +.align 5 +.L_dgemm_1x4_M1_L71: + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x4_M1_L71 +.L_dgemm_1x4_M1_L0: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d U0, U2, 0x02 + xvinsve0.d U0, U3, 0x03 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_2x4 +.L_dgemm_2x4: + /* Load 2 * 64 from A0 */ + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x4_M2_L7 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + PTR_ADDI TL, TL, -1 + + xvld U12, B0, 0x00 + PTR_ADDI A0, A0, 0x10 
+ PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_2x4_M2_TL1_END +.align 5 +.L_dgemm_2x4_M2_TL1: + KERNEL8x2x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x4_M2_TL1 +.L_dgemm_2x4_M2_TL1_END: + KERNEL8x2x4_END + +.L_dgemm_2x4_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x4_M2_L0 +.align 5 +.L_dgemm_2x4_M2_L71: + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvld U4, B0, 0x00 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x4_M2_L71 +.L_dgemm_2x4_M2_L0: + xvpackev.d D4, D1, D0 + xvpackod.d D5, D1, D0 + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U1, U3, 0x02 + + GSUB xvf, d, U0, U0, D4, U1, U1, D5 +.endm + +.macro dgemm_4x4 +.L_dgemm_4x4: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x4_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_4x4_M4_TL1_END +.align 5 +.L_dgemm_4x4_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x4_M4_TL1 +.L_dgemm_4x4_M4_TL1_END: + KERNEL8x4x4_END +.L_dgemm_4x4_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x4_M4_L0 +.align 5 +.L_dgemm_4x4_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x4_M4_L71 + .L_dgemm_4x4_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12 +.endm + +.macro dgemm_8x4 +.L_dgemm_8x4: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x4_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, 
A0, 0x40 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_8x4_M8_TL1_END +.align 5 +.L_dgemm_8x4_M8_TL1: /* TL-- */ + KERNEL8x8x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x4_M8_TL1 + +.L_dgemm_8x4_M8_TL1_END: + KERNEL8x8x4_END + +.L_dgemm_8x4_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x4_M8_L0 +.align 5 +.L_dgemm_8x4_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x4_M8_L71 +.L_dgemm_8x4_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 + + GSUB xvf, d, U0, U0, D0, U1, U1, D1, \ + U2, U2, D4, U3, U3, D5, \ + U4, U4, D8, U5, U5, D9, \ + U6, U6, D12, U7, U7, D13 +.endm + +.macro dgemm_4x2 +.L_dgemm_4x2: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x2_N3_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_4x2_N3_M4_TL1_END +.align 5 +.L_dgemm_4x2_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_4x2_N3_M4_TL1 +.L_dgemm_4x2_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_dgemm_4x2_N3_M4_L7: + /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x2_N3_M4_L0 +.align 5 +.L_dgemm_4x2_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x2_N3_M4_L71 + +.L_dgemm_4x2_N3_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + GSUB xvf, d, U0, U0, D0, U1, U1, D4 +.endm + +.macro dgemm_2x2 +.L_dgemm_2x2: + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x2_N3_M2_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_2x2_N3_M2_TL1_END +.align 5 +.L_dgemm_2x2_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x2_N3_M2_TL1 +.L_dgemm_2x2_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_dgemm_2x2_N3_M2_L7: + 
/* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x2_N3_M2_L0 +.align 5 +.L_dgemm_2x2_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x2_N3_M2_L71 +.L_dgemm_2x2_N3_M2_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + GSUB xvf, d, U0, U0, D0, U1, U1, D4 +.endm + +.macro dgemm_8x2 +.L_dgemm_8x2: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x2_N3_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_8x2_N3_M8_TL1_END +.align 5 +.L_dgemm_8x2_N3_M8_TL1: /* TL-- */ + KERNEL8x8x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x2_N3_M8_TL1 +.L_dgemm_8x2_N3_M8_TL1_END: + KERNEL8x8x2_END + +.L_dgemm_8x2_N3_M8_L7: + /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x2_N3_M8_L0 +.align 5 +.L_dgemm_8x2_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x2_N3_M8_L71 + +.L_dgemm_8x2_N3_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5 +.endm + +.macro dgemm_16x2 +.L_dgemm_16x2: + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_dgemm_16x2_N3_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_16x2_N3_TL1_END +.align 5 +.L_dgemm_16x2_N3_TL1: /* TL-- */ + KERNEL8x16x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_16x2_N3_TL1 +.L_dgemm_16x2_N3_TL1_END: + KERNEL8x16x2_END + +.L_dgemm_16x2_N3_L7: + /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_16x2_N3_L0 +.align 5 +.L_dgemm_16x2_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, 
A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x2_N3_L71 + +.L_dgemm_16x2_N3_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ + U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7 +.endm + +.macro dgemm_2x1 +.L_dgemm_2x1: + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x1_N1_M2_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_2x1_N1_M2_TL1_END +.align 5 +.L_dgemm_2x1_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x1_N1_M2_TL1 +.L_dgemm_2x1_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_dgemm_2x1_N1_M2_L7: + /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x1_N1_M2_L0 +.align 5 +.L_dgemm_2x1_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x1_N1_M2_L71 +.L_dgemm_2x1_N1_M2_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_4x1 +.L_dgemm_4x1: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x1_N1_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_4x1_N1_M4_TL1_END +.align 5 +.L_dgemm_4x1_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_4x1_N1_M4_TL1 +.L_dgemm_4x1_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_dgemm_4x1_N1_M4_L7: + /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x1_N1_M4_L0 +.align 5 +.L_dgemm_4x1_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x1_N1_M4_L71 +.L_dgemm_4x1_N1_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_8x1 +.L_dgemm_8x1: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + 
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x1_N1_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_8x1_N1_M8_TL1_END +.align 5 +.L_dgemm_8x1_N1_M8_TL1: /* TL-- */ + KERNEL8x8x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x1_N1_M8_TL1 + +.L_dgemm_8x1_N1_M8_TL1_END: + KERNEL8x8x1_END + +.L_dgemm_8x1_N1_M8_L7: + /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x1_N1_M8_L0 +.align 5 +.L_dgemm_8x1_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x1_N1_M8_L71 +.L_dgemm_8x1_N1_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + GSUB xvf, d, U0, U0, D0, U1, U1, D1 +.endm + +.macro dgemm_16x1 +.L_dgemm_16x1: + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_16x1_N1_L7 */ + beq ZERO,TL, .L_dgemm_16x1_N1_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_16x1_N1_TL1_END +.align 5 +.L_dgemm_16x1_N1_TL1: /* TL-- */ + KERNEL8x16x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_16x1_N1_TL1 +.L_dgemm_16x1_N1_TL1_END: + KERNEL8x16x1_END + +.L_dgemm_16x1_N1_L7: + /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_16x1_N1_L0 +.align 5 +.L_dgemm_16x1_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x1_N1_L71 +.L_dgemm_16x1_N1_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3 +.endm + +.macro dgemm_1x2 +.L_dgemm_1x2: // See dgemm_kernel_16x4.S + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_dgemm_1x2_N3_M1_L7 + + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + beq ZERO, TL, .L_dgemm_1x2_N3_M1_TL1_END +.L_dgemm_1x2_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, 
.L_dgemm_1x2_N3_M1_TL1 +.L_dgemm_1x2_N3_M1_TL1_END: + KERNEL8x1x2_END +.L_dgemm_1x2_N3_M1_L7: + /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_1x2_N3_M1_L0 +.L_dgemm_1x2_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x2_N3_M1_L71 +.L_dgemm_1x2_N3_M1_L0: + xvld U0, C0, 0x00 + xvld U1, C1, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d D0, D4, 0x01 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_1x1 +.L_dgemm_1x1: + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 +.L_N1_M1_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm
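+
+// Reference sketch (C-like pseudocode, not assembled): every dgemm_MxN macro
+// above follows the same pattern and differs only in how many A elements (M)
+// and B elements (N) it consumes per step of the inner K dimension (counted
+// down in L/TL).  Assuming the usual OpenBLAS packed-panel layout for a[]
+// and b[], each macro effectively computes
+//
+//   for (k = 0; k < K; k++)
+//     for (j = 0; j < N; j++)
+//       for (i = 0; i < M; i++)
+//         c[i + j * ldc] -= a[k * M + i] * b[k * N + j];
+//
+// but leaves the C - A*B result in vector registers (U0, U1, ...) rather than
+// storing it back, so the dsolve step that follows can consume it directly.
+// The KERNEL8xMxN helpers unroll eight K steps, ping-ponging between the
+// U0-U7 and U8-U15 register groups so the loads for one step overlap the
+// xvfmadd.d chain of the previous one; the *_END variants omit the trailing
+// loads that would only feed a next iteration.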