Merge pull request #4242 from XiWeiGu/loongarch64_dtrsm

LoongArch64: Add dtrsm kernel
Martin Kroeker 2023-09-26 19:21:48 +02:00 committed by GitHub
commit e2ca22f8d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 6312 additions and 5 deletions


@@ -24,12 +24,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
SGEMVNKERNEL = sgemv_n_8_lasx.S
SGEMVTKERNEL = sgemv_t_8_lasx.S
endif
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S
+DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S
+DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S
+DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S
endif
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c

File diff suppressed because it is too large


@@ -0,0 +1,959 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/08/26 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
* FLOAT *c, BLASLONG ldc, BLASLONG offset)
*/
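For orientation before the register maps, here is a hedged scalar sketch of the job each variant performs on one packed tile. It is an illustration in the spirit of the generic fallbacks (../generic/trsm_kernel_LT.c and friends), not the actual generic code. OpenBLAS's trsm pack routines store the reciprocal of each diagonal element, which is why the kernels below multiply by the loaded diagonal (GMUL) rather than divide; note also that the solved tile is written back both to C and to the packed B panel.

    /* Illustrative scalar solve for one m x n tile (LT-style orientation).
     * a: dense m x m tile, upper part used, diagonal pre-inverted by the pack.
     * c: right-hand side (leading dimension ldc); b: packed panel refilled
     * with the solution. Names and layout here are assumptions. */
    static void solve_tile_sketch(int m, int n, const double *a,
                                  double *b, double *c, int ldc)
    {
        for (int i = 0; i < m; i++) {
            double inv_aii = a[i * m + i];          /* reciprocal of A(i,i) */
            for (int j = 0; j < n; j++) {
                double x = c[i + j * ldc] * inv_aii;
                c[i + j * ldc] = x;                 /* solved value back to C */
                b[i * n + j]  = x;                  /* ...and into packed B   */
                for (int p = i + 1; p < m; p++)     /* eliminate from rows below */
                    c[p + j * ldc] -= a[i * m + p] * x;
            }
        }
    }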
#define M $r4 // param 1: bm
#define N $r5 // param 2: bn
#define K $r6 // param 3: bk
#define A $r7 // param 5: ba
#define B $r8 // param 6: bb
#define C $r9 // param 7: bc
#define LDC $r10 // param 8: ldc
#define OFFSET $r11 // param 9: offset
/* Loop control parameters */
#define I $r13
#define J $r14
#define L $r15
#define TL $r16
/* Matrix address */
#define A0 $r17
#define B0 $r18
#define C0 $r19
#define C1 $r20
#define C2 $r23
#define C3 $r24
#define T0 $r25
#define T1 $r26
#define T2 $r27
#define KK $r28
#define AA $r29
#define CC $r30
#define BB B0
#undef ZERO
#define ZERO $r0
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define U8 $xr8
#define U9 $xr9
#define U10 $xr10
#define U11 $xr11
#define U12 $xr12
#define U13 $xr13
#define U14 $xr14
#define U15 $xr15
#define D0 $xr16
#define D1 $xr17
#define D2 $xr18
#define D3 $xr19
#define D4 $xr20
#define D5 $xr21
#define D6 $xr22
#define D7 $xr23
#define D8 $xr24
#define D9 $xr25
#define D10 $xr26
#define D11 $xr27
#define D12 $xr28
#define D13 $xr29
#define D14 $xr30
#define D15 $xr31
#define G0 D0
#define G1 D1
#define G2 D2
#define G3 D3
#define G4 D4
#define G5 D5
#define G6 D6
#define G7 D7
#define G8 D8
#define G9 D9
#define G10 D10
#define G11 D11
#define G12 D12
#define G13 D13
#define G14 D14
#define G15 D15
/* Prefetch interval */
#define A_PRE 0x400
#define B_PRE 0x100
#include "dtrsm_kernel_macro.S"
.macro ldrepl_macro start, end, stride
// Load Ux (x = 0...15)
.if \start <= \end
GLDREPL xv, d, $xr\start, A0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
.endif
.endm
.macro nmsub_macro start0, end0, start1, reg
// Gx -= reg * Ux
.if \start0 <= \end0
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.endif
.endm
.macro B_st_macro start, end, stride, N
// Store Gx(x = 16...31)
.if \start <= \end
.if \N == 4
xvst $xr\start, B0, \stride * 0x20
.elseif \N == 2
vst $vr\start, B0, \stride * 0x10
.elseif \N == 1
fst.d $f\start, B0, \stride * 0x08
.endif
B_st_macro %start + 1, \end, %stride + 1, \N
.endif
.endm
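The three helpers above rely on GNU-as compile-time recursion: each invocation emits one instruction and re-invokes itself with start + 1 (and stride + 1) until start exceeds end, so e.g. `ldrepl_macro 0, 3, 0` expands to four GLDREPL broadcasts from A0 at byte offsets 0x00, 0x08, 0x10, 0x18 into $xr0..$xr3. A C analogue of the expansion logic (emit_gldrepl is a hypothetical stand-in for "emit one instruction"):

    /* Assembly-time recursion modelled at run time; illustration only. */
    extern void emit_gldrepl(int xreg, int byte_offset);   /* hypothetical */

    void ldrepl_expand(int start, int end, int stride)
    {
        if (start > end)
            return;                       /* the .if \start <= \end guard */
        emit_gldrepl(start, stride * 8);  /* GLDREPL xv, d, $xr<start>, A0, stride*8 */
        ldrepl_expand(start + 1, end, stride + 1);
    }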
.macro dsolve_16 N
// The data layout of C (4 columns x 16 rows) is as follows (each register holds 4 doubles):
// U0 U1 U2 U3
// U4 U5 U6 U7
// U8 U9 U10 U11
// U12 U13 U14 U15
// The first step is to transpose the result of C
GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1
GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1
GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7
GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7
// Now we have the following memory layout of C:
// 0 1 2 3 ... 15
// 0 | | | | | | |
// 1 | G0 | G1 | G2 | G3 | ... | G15 |
// 2 | | | | | | |
// 3 | | | | | | |
// Next we are going to process matrix A with a size of 16x16,
// using only the upper triangular portion. The memory layout of
// matrix A (element offsets within the dense 16x16 tile) is as follows:
//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// 51 52 53 54 55 56 57 58 59 60 61 62 63
// 68 69 70 71 72 73 74 75 76 77 78 79
// 85 86 87 88 89 90 91 92 93 94 95
// 102 103 104 105 106 107 108 109 110 111
// 119 120 121 122 123 124 125 126 127
// 136 137 138 139 140 141 142 143
// 153 154 155 156 157 158 159
// 170 171 172 173 174 175
// 187 188 189 190 191
// 204 205 206 207
// 221 222 223
// 238 239
// 255
// Sequentially extract data from A in row order
// Load 0
ldrepl_macro 0, 15, 0
GMUL xvf, d, G0, G0, U0
nmsub_macro 17, 31, 1, G0
PTR_ADDI A0, A0, 17 * 8
// Load 1
ldrepl_macro 1, 15, 0
GMUL xvf, d, G1, G1, U1
nmsub_macro 18, 31, 2, G1
PTR_ADDI A0, A0, 17 * 8
// Load 2
ldrepl_macro 2, 15, 0
GMUL xvf, d, G2, G2, U2
nmsub_macro 19, 31, 3, G2
PTR_ADDI A0, A0, 17 * 8
// Load 3
ldrepl_macro 3, 15, 0
GMUL xvf, d, G3, G3, U3
nmsub_macro 20, 31, 4, G3
PTR_ADDI A0, A0, 17 * 8
// Load 4
ldrepl_macro 4, 15, 0
GMUL xvf, d, G4, G4, U4
nmsub_macro 21, 31, 5, G4
PTR_ADDI A0, A0, 17 * 8
// Load 5
ldrepl_macro 5, 15, 0
GMUL xvf, d, G5, G5, U5
nmsub_macro 22, 31, 6, G5
PTR_ADDI A0, A0, 17 * 8
// Load 6
ldrepl_macro 6, 15, 0
GMUL xvf, d, G6, G6, U6
nmsub_macro 23, 31, 7, G6
PTR_ADDI A0, A0, 17 * 8
// Load 7
ldrepl_macro 7, 15, 0
GMUL xvf, d, G7, G7, U7
nmsub_macro 24, 31, 8, G7
PTR_ADDI A0, A0, 17 * 8
// Load 8
ldrepl_macro 8, 15, 0
GMUL xvf, d, G8, G8, U8
nmsub_macro 25, 31, 9, G8
PTR_ADDI A0, A0, 17 * 8
// Load 9
ldrepl_macro 9, 15, 0
GMUL xvf, d, G9, G9, U9
nmsub_macro 26, 31, 10, G9
PTR_ADDI A0, A0, 17 * 8
// Load 10
ldrepl_macro 10, 15, 0
GMUL xvf, d, G10, G10, U10
nmsub_macro 27, 31, 11, G10
PTR_ADDI A0, A0, 17 * 8
// Load 11
ldrepl_macro 11, 15, 0
GMUL xvf, d, G11, G11, U11
nmsub_macro 28, 31, 12, G11
PTR_ADDI A0, A0, 17 * 8
// Load 12
ldrepl_macro 12, 15, 0
GMUL xvf, d, G12, G12, U12
nmsub_macro 29, 31, 13, G12
PTR_ADDI A0, A0, 17 * 8
// Load 13
ldrepl_macro 13, 15, 0
GMUL xvf, d, G13, G13, U13
nmsub_macro 30, 31, 14, G13
PTR_ADDI A0, A0, 17 * 8
// Load 14
ldrepl_macro 14, 15, 0
GMUL xvf, d, G14, G14, U14
nmsub_macro 31, 31, 15, G14
PTR_ADDI A0, A0, 17 * 8
// Load 15
ldrepl_macro 15, 15, 0
GMUL xvf, d, G15, G15, U15
// Finally, we can store the results. B is stored sequentially;
// C is transposed back first and then stored.
B_st_macro 16, 31, 0, \N
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1
GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1
.if \N == 4
GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \
G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60, \
G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \
G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60
.elseif \N == 2
GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \
G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60
.elseif \N == 1
GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60
.endif
.endm
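A note on the repeated `PTR_ADDI A0, A0, 17 * 8` steps above: the triangular tile is stored as a dense 16x16 block of doubles, so consecutive diagonal elements A(i,i) and A(i+1,i+1) are 16 + 1 elements apart, which is exactly the 0, 17, 34, ... progression in the layout comment. A quick check of the arithmetic:

    /* distance between consecutive diagonal elements of a dense 16x16 tile */
    size_t diag_step = (16 + 1) * sizeof(double);   /* = 17 * 8 bytes, as in the asm */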
.macro dgemm_dsolve_16x4
bge ZERO, KK, .L_dsolve_16x4_load
dgemm_16x4
b .L_dsolve_16x4
.L_dsolve_16x4_load:
// Load C
GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
/********************** solver ******************/
.L_dsolve_16x4:
dsolve_16 4
.endm
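Every dgemm_dsolve_MxN wrapper in this file follows the dispatch seen above: when the accumulated panel length is positive, the dgemm_MxN macro (defined in the suppressed dtrsm_kernel_macro.S, and assumed here to leave C minus the A*B panel product in U0..U15) produces the working copy of C; on the first tile the registers are loaded straight from memory instead. A control-flow sketch with hypothetical placeholder callees:

    extern void gemm_update(double *regs, const double *a0, const double *b0, long len);
    extern void load_c(double *regs, const double *c0, long ldc);
    extern void solve_regs(double *regs, const double *a0, double *b0);

    static void dgemm_dsolve_sketch(long len, double *regs, const double *a0,
                                    double *b0, const double *c0, long ldc)
    {
        if (len > 0)
            gemm_update(regs, a0, b0, len);  /* regs = C - A*B over len columns */
        else
            load_c(regs, c0, ldc);           /* first tile: read C directly */
        solve_regs(regs, a0, b0);            /* then the triangular solve */
    }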
.macro dsolve_8 N
// The data layout of C (4 columns x 8 rows) is as follows (each register holds 4 doubles):
// U0 U1
// U2 U3
// U4 U5
// U6 U7
// The first step is to transpose the result of C
GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9
GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9
// Now we have the following memory layout of C:
// 0 1 2 3 ... 7
// 0 | | | | | | |
// 1 | G0 | G1 | G2 | G3 | ... | G7 |
// 2 | | | | | | |
// 3 | | | | | | |
// Next we are going to process matrix A with a size of 8x8,
// using only the upper triangular portion. The memory layout of
// matrix A is as follows:
//0 1 2 3 4 5 6 7
// 9 10 11 12 13 14 15
// 18 19 20 21 22 23
// 27 28 29 30 31
// 36 37 38 39
// 45 46 47
// 54 55
// 63
// Sequentially extract data from A in row order
// Load 0
ldrepl_macro 0, 7, 0
GMUL xvf, d, G0, G0, U0
nmsub_macro 17, 23, 1, G0
PTR_ADDI A0, A0, 9 * 8
// Load 1
ldrepl_macro 1, 7, 0
GMUL xvf, d, G1, G1, U1
nmsub_macro 18, 23, 2, G1
PTR_ADDI A0, A0, 9 * 8
// Load 2
ldrepl_macro 2, 7, 0
GMUL xvf, d, G2, G2, U2
nmsub_macro 19, 23, 3, G2
PTR_ADDI A0, A0, 9 * 8
// Load 3
ldrepl_macro 3, 7, 0
GMUL xvf, d, G3, G3, U3
nmsub_macro 20, 23, 4, G3
PTR_ADDI A0, A0, 9 * 8
// Load 4
ldrepl_macro 4, 7, 0
GMUL xvf, d, G4, G4, U4
nmsub_macro 21, 23, 5, G4
PTR_ADDI A0, A0, 9 * 8
// Load 5
ldrepl_macro 5, 7, 0
GMUL xvf, d, G5, G5, U5
nmsub_macro 22, 23, 6, G5
PTR_ADDI A0, A0, 9 * 8
// Load 6
ldrepl_macro 6, 7, 0
GMUL xvf, d, G6, G6, U6
nmsub_macro 23, 23, 7, G6
PTR_ADDI A0, A0, 9 * 8
// Load 7
ldrepl_macro 7, 7, 0
GMUL xvf, d, G7, G7, U7
// Finally, we can store the results. B is stored sequentially;
// C is transposed back first and then stored.
B_st_macro 16, 23, 0, \N
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
.if \N == 4
GST xv, , G0, C0, 0x00, G4, C0, 0x20, \
G1, C1, 0x00, G5, C1, 0x20, \
G2, C2, 0x00, G6, C2, 0x20, \
G3, C3, 0x00, G7, C3, 0x20
.elseif \N == 2
GST xv, , G0, C0, 0x00, G4, C0, 0x20, \
G1, C1, 0x00, G5, C1, 0x20
.elseif \N == 1
GST xv, , G0, C0, 0x00, G4, C0, 0x20
.endif
.endm
.macro dgemm_dsolve_8x4
bge ZERO, L, .L_dsolve_8x4_load
dgemm_8x4
b .L_dsolve_8x4
.L_dsolve_8x4_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
/* Load C2 */
xvld U4, C2, 0x00
xvld U5, C2, 0x20
/* Load C3 */
xvld U6, C3, 0x00
xvld U7, C3, 0x20
/********* solver *********/
.L_dsolve_8x4:
dsolve_8 4
.endm
.macro dsolve_4 N
// The data layout of C (4 columns x 4 rows) is as follows (each register holds 4 doubles):
// U0
// U1
// U2
// U3
// The first step is to transpose the result of C
GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5
// Now we have the following memory layout of C:
// 0 1 2 3
// 0 | | | | |
// 1 | G0 | G1 | G2 | G3 |
// 2 | | | | |
// 3 | | | | |
// Next we are going to process matrix A with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix A is as follows:
//0 1 2 3
// 5 6 7
// 10 11
// 15
// Sequentially extract data from A in row order
// Load 0
ldrepl_macro 0, 3, 0
GMUL xvf, d, G0, G0, U0
nmsub_macro 17, 19, 1, G0
PTR_ADDI A0, A0, 5 * 8
// Load 1
ldrepl_macro 1, 3, 0
GMUL xvf, d, G1, G1, U1
nmsub_macro 18, 19, 2, G1
PTR_ADDI A0, A0, 5 * 8
// Load 2
ldrepl_macro 2, 3, 0
GMUL xvf, d, G2, G2, U2
nmsub_macro 19, 19, 3, G2
PTR_ADDI A0, A0, 5 * 8
// Load 3
ldrepl_macro 3, 3, 0
GMUL xvf, d, G3, G3, U3
// Finally, we can store the results. B is stored sequentially;
// C is transposed back first and then stored.
B_st_macro 16, 19, 0, \N
GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
.if \N == 4
GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00
.elseif \N == 2
GST xv, , G0, C0, 0x00, G1, C1, 0x00
.elseif \N == 1
GST xv, , G0, C0, 0x00
.endif
.endm
.macro dgemm_dsolve_4x4
bge ZERO, L, .L_dsolve_4x4_load
dgemm_4x4
b .L_dsolve_4x4
.L_dsolve_4x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
/************** solver *****************/
.L_dsolve_4x4:
dsolve_4 4
.endm
.macro dsolve_2 N
// Transpose
GSBUTTERFLY xv, d, G0, G1, U1, U0
// Now we have the following memory layout of C:
// 0 1
// 0 | | |
// 1 | G0 | G1 |
// 2 | | |
// 3 | | |
// Next we are going to process matrix A with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix A is as follows:
//0 1
// 3
// Sequentially extract data from A in row order
// Load 0
ldrepl_macro 0, 1, 0
GMUL xvf, d, G0, G0, U0
nmsub_macro 17, 17, 1, G0
PTR_ADDI A0, A0, 3 * 8
// Load 1
ldrepl_macro 1, 1, 0
GMUL xvf, d, G1, G1, U1
// Finally, we can store the results. B is stored sequentially;
// C is transposed back first and then stored.
B_st_macro 16, 17, 0, \N
GSBUTTERFLY xv, d, U0, U1, G1, G0
.if \N == 4
vst $vr0, C0, 0x00
vst $vr1, C1, 0x00
xvstelm.d U0, C2, 0x00, 0x02
xvstelm.d U1, C3, 0x00, 0x02
xvstelm.d U0, C2, 0x08, 0x03
xvstelm.d U1, C3, 0x08, 0x03
.elseif \N == 2
vst $vr0, C0, 0x00
vst $vr1, C1, 0x00
.elseif \N == 1
vst $vr0, C0, 0x00
.endif
.endm
.macro dgemm_dsolve_2x4
bge ZERO, L, .L_dsolve_2x4_load
dgemm_2x4
b .L_dsolve_2x4
.L_dsolve_2x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
xvpermi.q U0, U2, 0x02
xvpermi.q U1, U3, 0x02
/********************** solver ******************/
.L_dsolve_2x4:
dsolve_2 4
.endm
.macro dgemm_dsolve_1x4
bge ZERO, L, .L_dsolve_1x4_load
dgemm_1x4
b .L_dsolve_1x4
.L_dsolve_1x4_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
fld.d $f2, C2, 0x00
fld.d $f3, C3, 0x00
xvinsve0.d U0, U1, 0x01
xvinsve0.d U0, U2, 0x02
xvinsve0.d U0, U3, 0x03
.L_dsolve_1x4:
GLDREPL xv, d, D0, A0, 0x00
GMUL xvf, d, U0, U0, D0
// Store C
xvstelm.d U0, C0, 0x00, 0x00
xvstelm.d U0, C1, 0x00, 0x01
xvstelm.d U0, C2, 0x00, 0x02
xvstelm.d U0, C3, 0x00, 0x03
// Store B
xvst U0, B0, 0x00
.endm
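The 1x4 tail above gathers the four scalars from C0..C3 into one 256-bit register with xvinsve0.d, scales all four lanes by the broadcast reciprocal of the single diagonal element, scatters the lanes back with xvstelm.d, and stores the whole vector into packed B. A scalar sketch of the same step:

    /* 1x4 tail: one row, four columns; cptr[j] stands for Cj. Sketch only. */
    static void dsolve_1x4_sketch(const double *a0, double *b0, double *cptr[4])
    {
        double inv = a0[0];              /* pre-inverted diagonal A(0,0) */
        for (int j = 0; j < 4; j++) {
            double x = *cptr[j] * inv;   /* GMUL by the broadcast D0 */
            *cptr[j] = x;                /* xvstelm.d back to C0..C3 */
            b0[j]    = x;                /* xvst U0, B0, 0x00        */
        }
    }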
.macro dgemm_dsolve_16x2
bge ZERO, L, .L_dsolve_16x2_load
dgemm_16x2
b .L_dsolve_16x2
.L_dsolve_16x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
/* Load C1 */
xvld U4, C1, 0x00
xvld U5, C1, 0x20
xvld U6, C1, 0x40
xvld U7, C1, 0x60
.L_dsolve_16x2:
dsolve_16 2
.endm
.macro dgemm_dsolve_8x2
bge ZERO, L, .L_dsolve_8x2_load
dgemm_8x2
b .L_dsolve_8x2
.L_dsolve_8x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
.L_dsolve_8x2:
dsolve_8 2
.endm
.macro dgemm_dsolve_4x2
bge ZERO, L, .L_dsolve_4x2_load
dgemm_4x2
b .L_dsolve_4x2
.L_dsolve_4x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_4x2:
dsolve_4 2
.endm
.macro dgemm_dsolve_1x2
bge ZERO, L, .L_dsolve_1x2_load
dgemm_1x2
b .L_dsolve_1x2
.L_dsolve_1x2_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
xvinsve0.d U0, U1, 0x01
.L_dsolve_1x2:
GLDREPL xv, d, D0, A0, 0x00
GMUL xvf, d, U0, U0, D0
// Store C
xvstelm.d U0, C0, 0x00, 0x00
xvstelm.d U0, C1, 0x00, 0x01
// Store B
vst $vr0, B0, 0x00
.endm
.macro dgemm_dsolve_2x2
bge ZERO, L, .L_dsolve_2x2_load
dgemm_2x2
b .L_dsolve_2x2
.L_dsolve_2x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_2x2:
dsolve_2 2
.endm
.macro dgemm_dsolve_16x1
bge ZERO, L, .L_dsolve_16x1_load
dgemm_16x1
b .L_dsolve_16x1
.L_dsolve_16x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
.L_dsolve_16x1:
dsolve_16 1
.endm
.macro dgemm_dsolve_8x1
bge ZERO, L, .L_dsolve_8x1_load
dgemm_8x1
b .L_dsolve_8x1
.L_dsolve_8x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
.L_dsolve_8x1:
dsolve_8 1
.endm
.macro dgemm_dsolve_4x1
bge ZERO, L, .L_dsolve_4x1_load
dgemm_4x1
b .L_dsolve_4x1
.L_dsolve_4x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_4x1:
dsolve_4 1
.endm
.macro dgemm_dsolve_2x1
bge ZERO, L, .L_dsolve_2x1_load
dgemm_2x1
b .L_dsolve_2x1
.L_dsolve_2x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_2x1:
dsolve_2 1
.endm
.macro dgemm_dsolve_1x1
bge ZERO, L, .L_dsolve_1x1_load
dgemm_1x1
b .L_dsolve_1x1
.L_dsolve_1x1_load:
// Load C
fld.d $f0, C0, 0x00
.L_dsolve_1x1:
GLDREPL xv, d, D0, A0, 0x00
GMUL xvf, d, U0, U0, D0
// Store C
xvstelm.d U0, C0, 0x00, 0x00
// Store B
xvstelm.d U0, B0, 0x00, 0x00
.endm
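The driver that follows walks 4-wide column panels of C (counter J), and within each panel 16-row tiles down M with 8/4/2/1-row tails; KK restarts at OFFSET for every panel and grows with each solved row block, while AA and B advance through the packed panels. An index sketch of the traversal (pointer arithmetic in doubles, tails elided; dgemm_dsolve_16x4 here is a placeholder for the macro above):

    extern void dgemm_dsolve_16x4(double *aa, double *b, double *cc,
                                  long ldc, long kk);

    static void lt_driver_sketch(long m, long n, long k, long ldc,
                                 long offset, double *a, double *b, double *c)
    {
        for (long j = n >> 2; j > 0; j--) {       /* 4-wide panels of C */
            long kk = offset;
            double *aa = a, *cc = c;
            for (long i = m >> 4; i > 0; i--) {   /* 16-row tiles */
                dgemm_dsolve_16x4(aa, b, cc, ldc, kk);
                aa += 16 * k;  cc += 16;  kk += 16;
            }
            /* ...8x4 / 4x4 / 2x4 / 1x4 tails follow the same pattern... */
            b += 4 * k;  c += 4 * ldc;
        }
    }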
PROLOGUE
push_if_used 26, 32
PTR_SLLI LDC, LDC, 3
/* if (!(N >> 2)) goto L_N3 */
PTR_SRAI J, N, 2 /* J = bn >> 2 */
andi N, N, 0x03
beq ZERO, J, .L_N3
.align 5
.L_J1:
PTR_ADDI J, J, -1
move KK, OFFSET
move AA, A
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_M15
.align 4
.L_I1:
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_16x4
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADDI KK, KK, 0x10 // kk += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_I1
.L_M15:
andi I, M, 8
beqz I, .L_M7
.L_M8:
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_8x4
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADDI KK, KK, 0x08 // kk += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_M7:
andi I, M, 4
beqz I, .L_M3
.L_M4:
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_4x4
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADDI KK, KK, 0x04 // kk += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_M3:
andi I, M, 2
beqz I, .L_M1
.L_M2:
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_2x4
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADDI KK, KK, 0x02 // kk += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_M1:
andi I, M, 1
beqz I, .L_M0
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_1x4
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADDI KK, KK, 0x01 // kk += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_M0:
PTR_SLLI T0, K, 5
PTR_SLLI T1, LDC, 2
PTR_ADD B, B, T0 // b += 4 * k
PTR_ADD C, C, T1 // c += 4 * ldc
bnez J, .L_J1
.L_N3:
andi J, N, 2
beq ZERO, J, .L_N1
.L_N2:
move KK, OFFSET
move AA, A
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_N2_M15
.align 4
.L_N2_I1:
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_16x2
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADDI KK, KK, 0x10 // kk += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_N2_I1
.L_N2_M15:
andi I, M, 8
beqz I, .L_N2_M7
.L_N2_M8:
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_8x2
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADDI KK, KK, 0x08 // kk += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_N2_M7:
andi I, M, 4
beqz I, .L_N2_M3
.L_N2_M4:
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_4x2
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADDI KK, KK, 0x04 // kk += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_N2_M3:
andi I, M, 2
beqz I, .L_N2_M1
.L_N2_M2:
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_2x2
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADDI KK, KK, 0x02 // kk += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_N2_M1:
andi I, M, 1
beqz I, .L_N2_M0
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_1x2
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADDI KK, KK, 0x01 // kk += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N2_M0:
PTR_SLLI T0, K, 4
PTR_SLLI T1, LDC, 1
PTR_ADD B, B, T0 // b += 2 * k
PTR_ADD C, C, T1 // c += 2 * ldc
.L_N1:
andi J, N, 1
beq ZERO, J, .L_N0
move KK, OFFSET
move AA, A
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_N1_M15
.align 4
.L_N1_I1:
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_16x1
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADDI KK, KK, 0x10 // kk += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_N1_I1
.L_N1_M15:
andi I, M, 8
beqz I, .L_N1_M7
.L_N1_M8:
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_8x1
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADDI KK, KK, 0x08 // kk += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_N1_M7:
andi I, M, 4
beqz I, .L_N1_M3
.L_N1_M4:
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_4x1
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADDI KK, KK, 0x04 // kk += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_N1_M3:
andi I, M, 2
beqz I, .L_N1_M1
.L_N1_M2:
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_2x1
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADDI KK, KK, 0x02 // kk += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_N1_M1:
andi I, M, 1
beqz I, .L_N1_M0
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_1x1
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADDI KK, KK, 0x01 // kk += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
jirl $r0, $r1, 0x0
EPILOGUE


@@ -0,0 +1,882 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/09/26 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
* FLOAT *c, BLASLONG ldc, BLASLONG offset)
*/
#define M $r4 // param 1: bm
#define N $r5 // param 2: bn
#define K $r6 // param 3: bk
#define A $r7 // param 5: ba
#define B $r8 // param 6: bb
#define C $r9 // param 7: bc
#define LDC $r10 // param 8: ldc
#define OFFSET $r11 // param 9: offset
/* Loop control parameters */
#define I $r13
#define J $r14
#define L $r15
#define TL $r16
/* Matrix address */
#define A0 $r17
#define B0 $r18
#define C0 $r19
#define C1 $r20
#define C2 $r23
#define C3 $r24
#define T0 $r25
#define T1 $r26
#define T2 $r27
#define KK $r28
#define AA $r29
#define CC $r30
#define BB B0
#undef ZERO
#define ZERO $r0
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define U8 $xr8
#define U9 $xr9
#define U10 $xr10
#define U11 $xr11
#define U12 $xr12
#define U13 $xr13
#define U14 $xr14
#define U15 $xr15
#define D0 $xr16
#define D1 $xr17
#define D2 $xr18
#define D3 $xr19
#define D4 $xr20
#define D5 $xr21
#define D6 $xr22
#define D7 $xr23
#define D8 $xr24
#define D9 $xr25
#define D10 $xr26
#define D11 $xr27
#define D12 $xr28
#define D13 $xr29
#define D14 $xr30
#define D15 $xr31
#define G0 D0
#define G1 D1
#define G2 D2
#define G3 D3
#define G4 D4
#define G5 D5
#define G6 D6
#define G7 D7
#define G8 D8
#define G9 D9
#define G10 D10
#define G11 D11
#define G12 D12
#define G13 D13
#define G14 D14
#define G15 D15
/* Prefetch interval */
#define A_PRE 0x400
#define B_PRE 0x100
#include "dtrsm_kernel_macro.S"
.macro ldrepl_macro start, end, stride
// Load Dx (x = 0...9) with elements of B
.if \start <= \end
GLDREPL xv, d, $xr\start, B0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
.endif
.endm
.macro nmsub_macro start0, end0, start1, reg
// Ux -= reg * Uy
.if \start0 <= \end0
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.endif
.endm
.macro A_st_macro start, end, stride, N
// Store Ux(x = 0...15)
.if \start <= \end
.if \N == 4
xvst $xr\start, A0, \stride * 0x20
.elseif \N == 2
vst $vr\start, A0, \stride * 0x10
.elseif \N == 1
fst.d $f\start, A0, \stride * 0x08
.endif
A_st_macro %start + 1, \end, %stride + 1, \N
.endif
.endm
.macro dsolve_16x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1 2 3
// 5 6 7
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
ldrepl_macro 20, 22, 5
nmsub_macro 4, 7, 0, D1
ldrepl_macro 23, 24, 10
GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7
ldrepl_macro 25, 25, 15
nmsub_macro 8, 11, 0, D2
nmsub_macro 8, 11, 4, D5
GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11
nmsub_macro 12, 15, 0, D3
nmsub_macro 12, 15, 4, D6
nmsub_macro 12, 15, 8, D8
GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
// Store A
A_st_macro 0, 15, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \
U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \
U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
.endm
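dsolve_16x4 above is a right-side forward substitution: B is a 4x4 upper-triangular tile (row-major offsets 0..3, 5..7, 10..11, 15, diagonal reciprocals pre-packed), and each "column" of the solve is a 16-row slab of C held in four registers (U0..U3 for column 0, and so on). A concrete scalar sketch of the recurrence, under those layout assumptions:

    /* Solve X * B = C for upper-triangular 4x4 B; col[j] is the j-th
     * mslab-row slab of C, bt the dense row-major tile with the
     * diagonal pre-inverted. Illustration only. */
    static void rsolve_upper4(int mslab, double *col[4], const double *bt)
    {
        for (int j = 0; j < 4; j++) {
            double inv = bt[j * 4 + j];              /* 1/B(j,j) */
            for (int r = 0; r < mslab; r++)
                col[j][r] *= inv;                    /* GMUL by the diagonal */
            for (int jj = j + 1; jj < 4; jj++) {
                double bjj = bt[j * 4 + jj];         /* B(j,jj), upper part */
                for (int r = 0; r < mslab; r++)
                    col[jj][r] -= bjj * col[j][r];   /* xvfnmsub.d */
            }
        }
    }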
.macro dsolve_16x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
ldrepl_macro 18, 18, 3
nmsub_macro 4, 7, 0, D1
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
// Store A
A_st_macro 0, 7, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
.endm
.macro dsolve_8x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1 2 3
// 5 6 7
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
GMUL xvf, d, U0, D0, U0, U1, D0, U1
ldrepl_macro 20, 22, 5
nmsub_macro 2, 3, 0, D1
ldrepl_macro 23, 24, 10
GMUL xvf, d, U2, D4, U2, U3, D4, U3
ldrepl_macro 25, 25, 15
nmsub_macro 4, 5, 0, D2
nmsub_macro 4, 5, 2, D5
GMUL xvf, d, U4, D7, U4, U5, D7, U5
nmsub_macro 6, 7, 0, D3
nmsub_macro 6, 7, 2, D6
nmsub_macro 6, 7, 4, D8
GMUL xvf, d, U6, D9, U6, U7, D9, U7
// Store A
A_st_macro 0, 7, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20, \
U4, C2, 0x00, U5, C2, 0x20, \
U6, C3, 0x00, U7, C3, 0x20
.endm
.macro dsolve_8x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
GMUL xvf, d, U0, D0, U0, U1, D0, U1
ldrepl_macro 18, 18, 3
nmsub_macro 2, 3, 0, D1
GMUL xvf, d, U2, D2, U2, U3, D2, U3
// Store A
A_st_macro 0, 3, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20
.endm
.macro dsolve_4x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1 2 3
// 5 6 7
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
GMUL xvf, d, U1, D4, U1
ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
.endm
.macro dsolve_4x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00
.endm
.macro dsolve_2x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1 2 3
// 5 6 7
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
GMUL xvf, d, U1, D4, U1
ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 2
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
.endm
.macro dsolve_2x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 2
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
.endm
.macro dsolve_1x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1 2 3
// 5 6 7
// 10 11
// 15
// Sequentially extract data from B in row order
ldrepl_macro 16, 19, 0
GMUL xvf, d, U0, D0, U0
ldrepl_macro 20, 22, 5
nmsub_macro 1, 1, 0, D1
ldrepl_macro 23, 24, 10
GMUL xvf, d, U1, D4, U1
ldrepl_macro 25, 25, 15
nmsub_macro 2, 2, 0, D2
nmsub_macro 2, 2, 1, D5
GMUL xvf, d, U2, D7, U2
nmsub_macro 3, 3, 0, D3
nmsub_macro 3, 3, 1, D6
nmsub_macro 3, 3, 2, D8
GMUL xvf, d, U3, D9, U3
// Store A
A_st_macro 0, 3, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00
.endm
.macro dsolve_1x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0 1
// 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 17, 0
GMUL xvf, d, U0, D0, U0
ldrepl_macro 18, 18, 3
nmsub_macro 1, 1, 0, D1
GMUL xvf, d, U1, D2, U1
// Store A
A_st_macro 0, 1, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
.endm
.macro dgemm_dsolve_16x4
bge ZERO, L, .L_dsolve_16x4_load
dgemm_16x4
b .L_dsolve_16x4
.L_dsolve_16x4_load:
// Load C
GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
/********************** solver ******************/
.L_dsolve_16x4:
dsolve_16x4
.endm
.macro dgemm_dsolve_8x4
bge ZERO, L, .L_dsolve_8x4_load
dgemm_8x4
b .L_dsolve_8x4
.L_dsolve_8x4_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
/* Load C2 */
xvld U4, C2, 0x00
xvld U5, C2, 0x20
/* Load C3 */
xvld U6, C3, 0x00
xvld U7, C3, 0x20
/********* solver *********/
.L_dsolve_8x4:
dsolve_8x4
.endm
.macro dgemm_dsolve_4x4
bge ZERO, L, .L_dsolve_4x4_load
dgemm_4x4
b .L_dsolve_4x4
.L_dsolve_4x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
/************** solver *****************/
.L_dsolve_4x4:
dsolve_4x4
.endm
.macro dgemm_dsolve_2x4
bge ZERO, L, .L_dsolve_2x4_load
dgemm_2x4
xvpermi.q U2, U0, 0x01
xvpermi.q U3, U1, 0x01
b .L_dsolve_2x4
.L_dsolve_2x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
/********************** solver ******************/
.L_dsolve_2x4:
dsolve_2x4
.endm
.macro dgemm_dsolve_1x4
bge ZERO, L, .L_dsolve_1x4_load
dgemm_1x4
xvpackod.d U1, U0, U0
xvpermi.q U2, U0, 0x01
xvpermi.q U3, U1, 0x01
b .L_dsolve_1x4
.L_dsolve_1x4_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
fld.d $f2, C2, 0x00
fld.d $f3, C3, 0x00
.L_dsolve_1x4:
dsolve_1x4
.endm
.macro dgemm_dsolve_16x2
bge ZERO, L, .L_dsolve_16x2_load
dgemm_16x2
b .L_dsolve_16x2
.L_dsolve_16x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
/* Load C1 */
xvld U4, C1, 0x00
xvld U5, C1, 0x20
xvld U6, C1, 0x40
xvld U7, C1, 0x60
.L_dsolve_16x2:
dsolve_16x2
.endm
.macro dgemm_dsolve_8x2
bge ZERO, L, .L_dsolve_8x2_load
dgemm_8x2
b .L_dsolve_8x2
.L_dsolve_8x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
.L_dsolve_8x2:
dsolve_8x2
.endm
.macro dgemm_dsolve_4x2
bge ZERO, L, .L_dsolve_4x2_load
dgemm_4x2
b .L_dsolve_4x2
.L_dsolve_4x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_4x2:
dsolve_4x2
.endm
.macro dgemm_dsolve_2x2
bge ZERO, L, .L_dsolve_2x2_load
dgemm_2x2
b .L_dsolve_2x2
.L_dsolve_2x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_2x2:
dsolve_2x2
.endm
.macro dgemm_dsolve_1x2
bge ZERO, L, .L_dsolve_1x2_load
dgemm_1x2
xvpackod.d U1, U0, U0
b .L_dsolve_1x2
.L_dsolve_1x2_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
.L_dsolve_1x2:
dsolve_1x2
.endm
.macro dgemm_dsolve_16x1
bge ZERO, L, .L_dsolve_16x1_load
dgemm_16x1
b .L_dsolve_16x1
.L_dsolve_16x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
.L_dsolve_16x1:
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 3, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
.endm
.macro dgemm_dsolve_8x1
bge ZERO, L, .L_dsolve_8x1_load
dgemm_8x1
b .L_dsolve_8x1
.L_dsolve_8x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
.L_dsolve_8x1:
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 1, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20
.endm
.macro dgemm_dsolve_4x1
bge ZERO, L, .L_dsolve_4x1_load
dgemm_4x1
b .L_dsolve_4x1
.L_dsolve_4x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_4x1:
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 4
// Store C
GST xv, , U0, C0, 0x00
.endm
.macro dgemm_dsolve_2x1
bge ZERO, L, .L_dsolve_2x1_load
dgemm_2x1
b .L_dsolve_2x1
.L_dsolve_2x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_2x1:
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 2
// Store C
GST v, , $vr0, C0, 0x00
.endm
.macro dgemm_dsolve_1x1
bge ZERO, L, .L_dsolve_1x1_load
dgemm_1x1
b .L_dsolve_1x1
.L_dsolve_1x1_load:
// Load C
fld.d $f0, C0, 0x00
.L_dsolve_1x1:
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 1
// Store C
GST f, d, $f0, C0, 0x00
.endm
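One driver difference from the LT file is worth flagging before the PROLOGUE: here KK starts at -OFFSET and is bumped only when a whole column panel finishes (kk += 4 at .L_M0, kk += 2 at .L_N2_M0), and every tile passes L = KK unchanged, so the gemm update length depends only on how many panels are already solved, not on the position within M. A sketch of the panel loop (tile loops elided, names hypothetical):

    static void rn_panel_sketch(long n, long offset)
    {
        long kk = -offset;
        for (long j = 0; j < (n >> 2); j++) {
            /* every M tile in this panel: gemm update over kk columns,
               then the 4x4 upper-triangular solve against B's tile */
            kk += 4;      /* PTR_ADDI KK, KK, 4 at .L_M0 */
        }
    }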
PROLOGUE
push_if_used 26, 32
PTR_SLLI LDC, LDC, 3
PTR_SUB KK, ZERO, OFFSET
/* if (!(N >> 2)) goto L_N3 */
PTR_SRAI J, N, 2 /* J = bn >> 2 */
andi N, N, 0x03
beq ZERO, J, .L_N3
.align 5
.L_J1:
PTR_ADDI J, J, -1
move AA, A
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_M15
.align 4
.L_I1:
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_16x4
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_I1
.L_M15:
andi I, M, 8
beqz I, .L_M7
.L_M8:
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_8x4
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_M7:
andi I, M, 4
beqz I, .L_M3
.L_M4:
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_4x4
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_M3:
andi I, M, 2
beqz I, .L_M1
.L_M2:
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_2x4
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_M1:
andi I, M, 1
beqz I, .L_M0
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_1x4
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_M0:
PTR_SLLI T0, K, 5
PTR_SLLI T1, LDC, 2
PTR_ADD B, B, T0 // b += 4 * k
PTR_ADD C, C, T1 // c += 4 * ldc
PTR_ADDI KK, KK, 4 // kk += 4
bnez J, .L_J1
.L_N3:
andi J, N, 2
beq ZERO, J, .L_N1
.L_N2:
move AA, A
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_N2_M15
.align 4
.L_N2_I1:
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_16x2
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_N2_I1
.L_N2_M15:
andi I, M, 8
beqz I, .L_N2_M7
.L_N2_M8:
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_8x2
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_N2_M7:
andi I, M, 4
beqz I, .L_N2_M3
.L_N2_M4:
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_4x2
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_N2_M3:
andi I, M, 2
beqz I, .L_N2_M1
.L_N2_M2:
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_2x2
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_N2_M1:
andi I, M, 1
beqz I, .L_N2_M0
GADD , d, C0, CC, ZERO, C1, C0, LDC
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_1x2
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N2_M0:
PTR_SLLI T0, K, 4
PTR_SLLI T1, LDC, 1
PTR_ADD B, B, T0 // b += 2 * k
PTR_ADD C, C, T1 // c += 2 * ldc
PTR_ADDI KK, KK, 2 // kk += 2
.L_N1:
andi J, N, 1
beq ZERO, J, .L_N0
move AA, A
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_N1_M15
.align 4
.L_N1_I1:
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_16x1
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_N1_I1
.L_N1_M15:
andi I, M, 8
beqz I, .L_N1_M7
.L_N1_M8:
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_8x1
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_N1_M7:
andi I, M, 4
beqz I, .L_N1_M3
.L_N1_M4:
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_4x1
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_N1_M3:
andi I, M, 2
beqz I, .L_N1_M1
.L_N1_M2:
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_2x1
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_N1_M1:
andi I, M, 1
beqz I, .L_N1_M0
GADD , d, C0, CC, ZERO
move A0, AA
move B0, B
move L, KK
dgemm_dsolve_1x1
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
jirl $r0, $r1, 0x0
EPILOGUE


@@ -0,0 +1,953 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/09/26 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
* FLOAT *c, BLASLONG ldc, BLASLONG offset)
*/
#define M $r4 // param 1: bm
#define N $r5 // param 2: bn
#define K $r6 // param 3: bk
#define A $r7 // param 5: ba
#define B $r8 // param 6: bb
#define C $r9 // param 7: bc
#define LDC $r10 // param 8: ldc
#define OFFSET $r11 // param 9: offset
/* Loop control parameters */
#define I $r13
#define J $r14
#define L $r15
#define TL $r16
/* Matrix address */
#define A0 $r17
#define B0 $r18
#define C0 $r19
#define C1 $r20
#define C2 $r23
#define C3 $r24
#define T0 $r25
#define T1 $r26
#define T2 $r27
#define KK $r28
#define AA $r29
#define CC $r30
#define BB $r31
#undef ZERO
#define ZERO $r0
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define U8 $xr8
#define U9 $xr9
#define U10 $xr10
#define U11 $xr11
#define U12 $xr12
#define U13 $xr13
#define U14 $xr14
#define U15 $xr15
#define D0 $xr16
#define D1 $xr17
#define D2 $xr18
#define D3 $xr19
#define D4 $xr20
#define D5 $xr21
#define D6 $xr22
#define D7 $xr23
#define D8 $xr24
#define D9 $xr25
#define D10 $xr26
#define D11 $xr27
#define D12 $xr28
#define D13 $xr29
#define D14 $xr30
#define D15 $xr31
/* Prefetch interval */
#define A_PRE 0x400
#define B_PRE 0x100
#include "dtrsm_kernel_macro.S"
.macro ldrepl_macro start, end, stride
// Load Dx (x = 0...9) with elements of B
.if \start <= \end
GLDREPL xv, d, $xr\start, B0, \stride * 8
ldrepl_macro %start + 1, \end, %stride + 1
.endif
.endm
.macro nmsub_macro start0, end0, start1, reg
// Ux -= reg * Uy
.if \start0 <= \end0
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
.endif
.endm
.macro A_st_macro start, end, stride, N
// Store Ux(x = 0...15)
.if \start <= \end
.if \N == 4
xvst $xr\start, A0, \stride * 0x20
.elseif \N == 2
vst $vr\start, A0, \stride * 0x10
.elseif \N == 1
fst.d $f\start, A0, \stride * 0x08
.endif
A_st_macro %start + 1, \end, %stride + 1, \N
.endif
.endm
.macro dsolve_16x2
// We are going to process matrix B with a size of 2x2,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
nmsub_macro 0, 3, 4, D1
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 7, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
.endm
.macro dsolve_8x2
// We are going to process matrix B with a size of 2x2,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
GMUL xvf, d, U2, D2, U2, U3, D2, U3
nmsub_macro 0, 1, 2, D1
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 3, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20
.endm
.macro dsolve_4x2
// We are going to process matrix B with a size of 2x2,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00
.endm
.macro dsolve_2x2
// We are going to process matrix B with a size of 2x2,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 2
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
.endm
.macro dsolve_1x2
// We are going to process matrix B with a size of 2x2,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16, 0
ldrepl_macro 17, 18, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 1, D1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 1, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
.endm
.macro dsolve_16x4
// We are going to process matrix B with a size of 4x4,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
ldrepl_macro 19, 21, 8
nmsub_macro 8, 11, 12, D8
ldrepl_macro 17, 18, 4
GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11
ldrepl_macro 16, 16, 0
nmsub_macro 4, 7, 12, D7
nmsub_macro 4, 7, 8, D4
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
nmsub_macro 0, 3, 12, D6
nmsub_macro 0, 3, 8, D3
nmsub_macro 0, 3, 4, D1
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 15, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \
U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \
U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
.endm
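In this file the 4x4 tile of B is stored lower-triangular (row-major offsets 0; 4,5; 8,9,10; 12..15) and dsolve_16x4 runs backward substitution: the last slab of C is finished first (row 3 of B is loaded first, with D9 holding the pre-inverted B(3,3)) and then eliminated from the earlier slabs. A concrete scalar sketch under the same layout assumptions:

    /* Backward right-solve against lower-triangular 4x4 B; col[j] is the
     * j-th mslab-row slab of C, bt the dense row-major tile with the
     * diagonal pre-inverted. Illustration only. */
    static void rsolve_lower4_backward(int mslab, double *col[4], const double *bt)
    {
        for (int j = 3; j >= 0; j--) {
            double inv = bt[j * 4 + j];              /* 1/B(j,j) */
            for (int r = 0; r < mslab; r++)
                col[j][r] *= inv;
            for (int jj = 0; jj < j; jj++) {
                double bjj = bt[j * 4 + jj];         /* B(j,jj), lower part */
                for (int r = 0; r < mslab; r++)
                    col[jj][r] -= bjj * col[j][r];   /* xvfnmsub.d */
            }
        }
    }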
.macro dsolve_8x4
// We are going to process matrix B with a size of 4x4,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
GMUL xvf, d, U6, D9, U6, U7, D9, U7
ldrepl_macro 19, 21, 8
nmsub_macro 4, 5, 6, D8
ldrepl_macro 17, 18, 4
GMUL xvf, d, U4, D5, U4, U5, D5, U5
ldrepl_macro 16, 16, 0
nmsub_macro 2, 3, 6, D7
nmsub_macro 2, 3, 4, D4
GMUL xvf, d, U2, D2, U2, U3, D2, U3
nmsub_macro 0, 1, 6, D6
nmsub_macro 0, 1, 4, D3
nmsub_macro 0, 1, 2, D1
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 7, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20, \
U4, C2, 0x00, U5, C2, 0x20, \
U6, C3, 0x00, U7, C3, 0x20
.endm
.macro dsolve_4x4
// We are going to process matrix B with a size of 4x4,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
.endm
.macro dsolve_2x4
// We are going to process matrix B with a size of 4x4,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 2
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
.endm
.macro dsolve_1x4
// We are going to process matrix B with a size of 4x4,
// using only the lower triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 22, 25, 12
GMUL xvf, d, U3, D9, U3
ldrepl_macro 19, 21, 8
nmsub_macro 2, 2, 3, D8
ldrepl_macro 17, 18, 4
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16, 0
nmsub_macro 1, 1, 3, D7
nmsub_macro 1, 1, 2, D4
GMUL xvf, d, U1, D2, U1
nmsub_macro 0, 0, 3, D6
nmsub_macro 0, 0, 2, D3
nmsub_macro 0, 0, 1, D1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 3, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00
.endm
.macro dgemm_dsolve_16x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x1_load
dgemm_16x1
b .L_dsolve_16x1
.L_dsolve_16x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
.L_dsolve_16x1:
PTR_ADDI A0, T1, -16 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 0, 3, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
.endm
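The `or T1, A0, A0` / `or T2, B0, B0` pair at the top of each wrapper in this file snapshots the entry pointers; after the optional gemm pass, the solve rewinds from those snapshots by exactly one packed block (16x1 doubles of A and 1x1 of B in the case above), because in this backward-marching variant the triangular block sits immediately before the region the gemm consumed. In C terms, with pointer arithmetic in doubles (sketch only):

    /* rewind sketch for the 16x1 wrapper; t1/t2 are the saved entry pointers */
    static void rewind_16x1(double *t1, double *t2,
                            double **a_solve, double **b_solve)
    {
        *a_solve = t1 - 16 * 1;   /* PTR_ADDI A0, T1, -16 * 8 (bytes) */
        *b_solve = t2 - 1 * 1;    /* PTR_ADDI B0, T2, -1 * 8  (bytes) */
    }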
.macro dgemm_dsolve_8x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x1_load
dgemm_8x1
b .L_dsolve_8x1
.L_dsolve_8x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
.L_dsolve_8x1:
PTR_ADDI A0, T1, -8 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 0, 1, 0, 4
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20
.endm
.macro dgemm_dsolve_4x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x1_load
dgemm_4x1
b .L_dsolve_4x1
.L_dsolve_4x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_4x1:
PTR_ADDI A0, T1, -4 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 4
// Store C
GST xv, , U0, C0, 0x00
.endm
.macro dgemm_dsolve_2x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x1_load
dgemm_2x1
b .L_dsolve_2x1
.L_dsolve_2x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_2x1:
PTR_ADDI A0, T1, -2 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 2
// Store C
GST v, , $vr0, C0, 0x00
.endm
.macro dgemm_dsolve_1x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_1x1_load
dgemm_1x1
b .L_dsolve_1x1
.L_dsolve_1x1_load:
// Load C
fld.d $f0, C0, 0x00
.L_dsolve_1x1:
PTR_ADDI A0, T1, -1 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16, 0
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 0, 0, 0, 1
// Store C
GST f, d, $f0, C0, 0x00
.endm
.macro dgemm_dsolve_16x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x2_load
dgemm_16x2
b .L_dsolve_16x2
.L_dsolve_16x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
/* Load C1 */
xvld U4, C1, 0x00
xvld U5, C1, 0x20
xvld U6, C1, 0x40
xvld U7, C1, 0x60
.L_dsolve_16x2:
PTR_ADDI A0, T1, -(16 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_16x2
.endm
.macro dgemm_dsolve_8x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x2_load
dgemm_8x2
b .L_dsolve_8x2
.L_dsolve_8x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
.L_dsolve_8x2:
PTR_ADDI A0, T1, -(8 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_8x2
.endm
.macro dgemm_dsolve_4x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x2_load
dgemm_4x2
b .L_dsolve_4x2
.L_dsolve_4x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_4x2:
PTR_ADDI A0, T1, -(4 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_4x2
.endm
.macro dgemm_dsolve_2x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x2_load
dgemm_2x2
b .L_dsolve_2x2
.L_dsolve_2x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_2x2:
PTR_ADDI A0, T1, -(2 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_2x2
.endm
.macro dgemm_dsolve_1x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_1x2_load
dgemm_1x2
xvpackod.d U1, U0, U0
b .L_dsolve_1x2
.L_dsolve_1x2_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
.L_dsolve_1x2:
PTR_ADDI A0, T1, -(1 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_1x2
.endm
.macro dgemm_dsolve_16x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x4_load
dgemm_16x4
b .L_dsolve_16x4
.L_dsolve_16x4_load:
// Load C
GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
/********************** solver ******************/
.L_dsolve_16x4:
PTR_ADDI A0, T1, -(16 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_16x4
.endm
.macro dgemm_dsolve_8x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x4_load
dgemm_8x4
b .L_dsolve_8x4
.L_dsolve_8x4_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
/* Load C2 */
xvld U4, C2, 0x00
xvld U5, C2, 0x20
/* Load C3 */
xvld U6, C3, 0x00
xvld U7, C3, 0x20
/********* solver *********/
.L_dsolve_8x4:
PTR_ADDI A0, T1, -(8 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_8x4
.endm
.macro dgemm_dsolve_4x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x4_load
dgemm_4x4
b .L_dsolve_4x4
.L_dsolve_4x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
/************** solver *****************/
.L_dsolve_4x4:
PTR_ADDI A0, T1, -(4 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_4x4
.endm
.macro dgemm_dsolve_2x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x4_load
dgemm_2x4
xvpermi.q U2, U0, 0x01
xvpermi.q U3, U1, 0x01
b .L_dsolve_2x4
.L_dsolve_2x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
/********************** solver ******************/
.L_dsolve_2x4:
PTR_ADDI A0, T1, -(2 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_2x4
.endm
.macro dgemm_dsolve_1x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_1x4_load
dgemm_1x4
xvpackod.d U1, U0, U0
xvpermi.q U2, U0, 0x01
xvpermi.q U3, U1, 0x01
b .L_dsolve_1x4
.L_dsolve_1x4_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
fld.d $f2, C2, 0x00
fld.d $f3, C3, 0x00
.L_dsolve_1x4:
PTR_ADDI A0, T1, -(1 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_1x4
.endm
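Because this variant marches from the last panel backward, the entry code below seeks past the end of B and C before any work is done, KK starts at N - OFFSET, and the 1-wide and 2-wide N tails are peeled before the 4-wide panels. An orientation sketch of the panel traversal (pointer arithmetic in doubles, tile loops and tails elided):

    static void rt_driver_sketch(long n, long k, long ldc,
                                 long offset, double *b, double *c)
    {
        long kk = n - offset;
        c += n * ldc;                   /* seek past the last column panel */
        b += n * k;                     /* seek past the packed B panels */
        for (long j = 0; j < (n >> 2); j++) {
            b -= 4 * k;  c -= 4 * ldc;  /* step one 4-wide panel back */
            /* ...solve every M tile of this panel with L = K - KK... */
            kk -= 4;                    /* PTR_ADDI KK, KK, -4 at .L_M0 */
        }
    }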
PROLOGUE
push_if_used 26, 32
PTR_SLLI LDC, LDC, 3
PTR_SUB KK, N, OFFSET
PTR_MUL T0, N, LDC
PTR_MUL T1, N, K
PTR_ADD C, C, T0 // c += n * ldc
PTR_SLLI T1, T1, 3
PTR_ADD B, B, T1
andi J, N, 1
beqz J, .L_N2
.L_N1:
move AA, A
PTR_SUB C, C, LDC // c -= ldc
PTR_SLLI T0, K, 3
PTR_SLLI T1, KK, 3
PTR_SUB B, B, T0 // b -= k
PTR_ADD BB, B, T1 // bb = b + kk
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_N1_M15
.align 4
.L_N1_I1:
PTR_SLLI T1, KK, 7
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_16x1
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_N1_I1
.L_N1_M15:
andi I, M, 8
beqz I, .L_N1_M7
.L_N1_M8:
PTR_SLLI T1, KK, 6
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_8x1
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_N1_M7:
andi I, M, 4
beqz I, .L_N1_M3
.L_N1_M4:
PTR_SLLI T1, KK, 5
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_4x1
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_N1_M3:
andi I, M, 2
beqz I, .L_N1_M1
.L_N1_M2:
PTR_SLLI T1, KK, 4
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_2x1
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_N1_M1:
andi I, M, 1
beqz I, .L_N1_M0
PTR_SLLI T1, KK, 3
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_1x1
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N1_M0:
PTR_ADDI KK, KK, -1
.L_N2:
andi J, N, 2
beq ZERO, J, .L_N4
move AA, A
PTR_SLLI T0, LDC, 1
PTR_SLLI T1, K, 4
PTR_SLLI T2, KK, 4
PTR_SUB B, B, T1
PTR_SUB C, C, T0
PTR_ADD BB, B, T2
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_N2_M15
.align 4
.L_N2_I1:
PTR_SLLI T1, KK, 7
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_16x2
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_N2_I1
.L_N2_M15:
andi I, M, 8
beqz I, .L_N2_M7
.L_N2_M8:
PTR_SLLI T1, KK, 6
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_8x2
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_N2_M7:
andi I, M, 4
beqz I, .L_N2_M3
.L_N2_M4:
PTR_SLLI T1, KK, 5
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_4x2
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_N2_M3:
andi I, M, 2
beqz I, .L_N2_M1
.L_N2_M2:
PTR_SLLI T1, KK, 4
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_2x2
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_N2_M1:
andi I, M, 1
beqz I, .L_N2_M0
PTR_SLLI T1, KK, 3
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_1x2
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N2_M0:
PTR_ADDI KK, KK, -2
.L_N4:
PTR_SRAI J, N, 2 /* J = bn >> 2 */
beq ZERO, J, .L_N0
.align 5
.L_J1:
PTR_ADDI J, J, -1
move AA, A
PTR_SLLI T0, LDC, 2
PTR_SLLI T1, K, 5
PTR_SLLI T2, KK, 5
PTR_SUB B, B, T1
PTR_SUB C, C, T0
PTR_ADD BB, B, T2
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_M15
.align 4
.L_I1:
PTR_SLLI T1, KK, 7
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_16x4
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_I1
.L_M15:
andi I, M, 8
beqz I, .L_M7
.L_M8:
PTR_SLLI T1, KK, 6
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_8x4
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_M7:
andi I, M, 4
beqz I, .L_M3
.L_M4:
PTR_SLLI T1, KK, 5
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_4x4
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_M3:
andi I, M, 2
beqz I, .L_M1
.L_M2:
PTR_SLLI T1, KK, 4
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_2x4
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_M1:
andi I, M, 1
beqz I, .L_M0
PTR_SLLI T1, KK, 3
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_1x4
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_M0:
PTR_ADDI KK, KK, -4
bnez J, .L_J1
.L_N0:
pop_if_used 26, 32
jirl $r0, $r1, 0x0
EPILOGUE

File diff suppressed because it is too large