diff --git a/Makefile.system b/Makefile.system index 902b53605..eae1376e1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -955,12 +955,18 @@ endif ifeq ($(ARCH), loongarch64) LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) +LA64_ARCH=$(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64) ifneq ($(LA64_ABI), lp64d) LA64_ABI=lp64 endif +ifneq ($(LA64_ARCH), loongarch64) +CCOMMON_OPT += -mabi=$(LA64_ABI) +FCOMMON_OPT += -mabi=$(LA64_ABI) +else CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) endif +endif endif diff --git a/c_check b/c_check index 59ab9bb13..114eed60c 100755 --- a/c_check +++ b/c_check @@ -197,10 +197,22 @@ fi no_lsx=0 no_lasx=0 if [ "$architecture" = "loongarch64" ]; then + lasx_flags='-march=loongarch64' + lsx_flags='-march=loongarch64' + tmpd="$(mktemp -d)" + tmparch="$tmpd/arch.c" + printf "void main(void){ }\n" >> "$tmparch" + args="-march=loongarch64 -o $tmparch.o $tmparch" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + lasx_flags='' + lsx_flags='' + } + tmplsx="$tmpd/lsx.c" codelsx='"vadd.b $vr0, $vr0, $vr0"' - lsx_flags='-march=loongarch64' printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" args="$lsx_flags -o $tmplsx.o $tmplsx" { @@ -211,7 +223,6 @@ if [ "$architecture" = "loongarch64" ]; then tmplasx="$tmpd/lasx.c" codelasx='"xvadd.b $xr0, $xr0, $xr0"' - lasx_flags='-march=loongarch64' printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" args="$lasx_flags -o $tmplasx.o $tmplasx" { diff --git a/kernel/loongarch64/cgemv_n_4_lsx.S b/kernel/loongarch64/cgemv_n_4_lsx.S index cf8273797..a3626191b 100644 --- a/kernel/loongarch64/cgemv_n_4_lsx.S +++ b/kernel/loongarch64/cgemv_n_4_lsx.S @@ -279,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 31 + push_if_used 7, 7 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -318,6 +318,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 31 + pop_if_used 7, 7 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S index ba38a9573..44e59d0a7 100644 --- a/kernel/loongarch64/cgemv_n_8_lasx.S +++ b/kernel/loongarch64/cgemv_n_8_lasx.S @@ -336,7 +336,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 31 + push_if_used 7, 7 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -378,6 +378,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 31 + pop_if_used 7, 7 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_4_lsx.S b/kernel/loongarch64/cgemv_t_4_lsx.S index ada349364..6acf8c63b 100644 --- a/kernel/loongarch64/cgemv_t_4_lsx.S +++ b/kernel/loongarch64/cgemv_t_4_lsx.S @@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
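# Illustration of the compile-probe pattern introduced in the Makefile.system and
# c_check hunks above: build a trivial translation unit with -march=loongarch64 and
# keep the flag only if the compiler accepts it (clang may not), otherwise fall back
# to passing -mabi= alone. A minimal standalone sketch, not part of the patch; CC
# and the temporary paths are placeholders.
tmpd="$(mktemp -d)"
printf 'int main(void){ return 0; }\n' > "$tmpd/arch.c"
if ${CC:-cc} -march=loongarch64 -c "$tmpd/arch.c" -o "$tmpd/arch.o" >/dev/null 2>&1; then
    arch_flags='-march=loongarch64'   # compiler understands the flag
else
    arch_flags=''                     # e.g. clang without the flag: rely on -mabi=lp64d/lp64 only
fi
rm -rf "$tmpd"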
PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 30 + push_if_used 8, 6 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -285,6 +285,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1: /* if (incx != 1) */ CGEMV_T_LSX GAP_1, X4_GAP .L_END: - pop_if_used 17 + 8, 30 + pop_if_used 8, 6 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_8_lasx.S b/kernel/loongarch64/cgemv_t_8_lasx.S index 94e4bd2eb..f8a0ad124 100644 --- a/kernel/loongarch64/cgemv_t_8_lasx.S +++ b/kernel/loongarch64/cgemv_t_8_lasx.S @@ -304,7 +304,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 30 + push_if_used 8, 6 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -337,6 +337,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1: /* if (incx != 1) */ CGEMV_T_LASX GAP_1, X8_GAP .L_END: - pop_if_used 17 + 8, 30 + pop_if_used 8, 6 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S index 30bebe8df..203c3eb27 100644 --- a/kernel/loongarch64/dgemm_ncopy_8_lsx.S +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define D7 $vr15 PROLOGUE - push_if_used 26, 32 + push_if_used 0, 0 move TD, DST move TS, SRC slli.d TL, LDA, 0x03 @@ -278,6 +278,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d M, M, -1 blt ZERO, M, .L_M1 .L_N0: - pop_if_used 26, 32 + pop_if_used 0, 0 jirl $r0, $r1, 0x00 EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S index 134066471..d9a442e57 100644 --- a/kernel/loongarch64/dgemm_tcopy_4_lsx.S +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define U7 $vr7 PROLOGUE - push_if_used 18, 8 + push_if_used 1, 0 move S0, SRC move P0, DST @@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fst.d F0, P3, 0x00 .L_M0: - pop_if_used 18, 8 + pop_if_used 1, 0 jirl $r0, $r1, 0x00 EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S index a7e3ef69c..b4106e6a9 100644 --- a/kernel/loongarch64/dgemm_tcopy_8_lsx.S +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define U7 $vr7 PROLOGUE - push_if_used 24, 8 + push_if_used 7, 0 move S0, SRC move P0, DST @@ -592,6 +592,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d S1, S1, 0x08 addi.d P4, P4, 0x08 .L_M0: - pop_if_used 24, 8 + pop_if_used 7, 0 jirl $r0, $r1, 0x00 EPILOGUE diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S index a49bf9bb1..9fe4bfddd 100644 --- a/kernel/loongarch64/dgemv_n_8_lasx.S +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -509,7 +509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 24 + 4 + push_if_used 7, 4 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K PTR_SUB J, INC_Y, K @@ -549,6 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
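// How the push_if_used/pop_if_used arguments change in the kernel prologues above
// and below: the old macro took the total number of integer/FP registers a kernel
// touches and subtracted MAX_INT_CALLER_SAVED (17) / MAX_FP_CALLER_SAVED (24)
// itself, while the new macro (see loongarch64_asm.S further down in this patch)
// is passed the number of callee-saved $s0-$s8 / $fs0-$fs7 registers to spill
// directly. A sketch of the equivalence:
//
//   old: push_if_used 17 + 7, 31    // saves $s0-$s6 and $fs0-$fs6
//   new: push_if_used 7, 7          // saves the same $s0-$s6 and $fs0-$fs6
//
// so each call site now passes only the excess over 17/24, and a kernel that
// clobbers no callee-saved registers passes 0, 0 (which expands to nothing).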
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 .L_END: - pop_if_used 17 + 7, 24 + 4 + pop_if_used 7, 4 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S index 71f942b0f..2c29bebec 100644 --- a/kernel/loongarch64/dgemv_t_8_lasx.S +++ b/kernel/loongarch64/dgemv_t_8_lasx.S @@ -445,7 +445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 24 + 3 + push_if_used 8, 3 PTR_ADDI K, $r0, 0x01 PTR_SUB I, INC_X, K maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ @@ -476,6 +476,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_GAP_1: /* if (incx != 1) */ DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP .L_END: - pop_if_used 17 + 8, 24 + 3 + pop_if_used 8, 3 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S index 3315daccb..e71fa7d30 100644 --- a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S +++ b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S @@ -1029,7 +1029,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 PTR_SLLI LDC, LDC, 3 /* if (!(N >> 2)) goto L_N3 */ PTR_SRAI J, N, 2 /* J = bn >> 2 */ @@ -1361,6 +1361,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO, I, .L_N1_I1 .L_N1_M0: .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S index 0e2cacccf..7fc62857a 100644 --- a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S +++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S @@ -128,31 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dtrsm_kernel_macro.S" -.macro ldrepl_macro start, end, stride +.macro ldrepl_macro stride:req, index:req, more:vararg // Load Ux (x = 0...15) -.if \start <= \end - GLDREPL xv, d, $xr\start, A0, \stride * 8 - ldrepl_macro %start + 1, \end, %stride + 1 + GLDREPL xv, d, $xr\index, A0, \index * 8 - \stride * 8 +.ifnb \more + ldrepl_macro \stride, \more .endif .endm -.macro nmsub_macro start0, end0, start1, reg +.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg // Gx -= reg * Ux -.if \start0 <= \end0 xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 - nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.ifnb \more + nmsub_macro \reg, \more .endif .endm -.macro B_st_macro start, end, stride, N +.macro B_st_macro N:req, stride:req, start:req, more:vararg // Store Gx(x = 16...31) -.if \start <= \end .if \N == 4 - xvst $xr\start, B0, \stride * 0x20 + xvst $xr\start, B0, \start * 0x20 - \stride * 0x20 .elseif \N == 2 - vst $vr\start, B0, \stride * 0x10 + vst $vr\start, B0, \start * 0x10 - \stride * 0x10 .elseif \N == 1 - fst.d $f\start, B0, \stride * 0x08 + fst.d $f\start, B0, \start * 0x08 - \stride * 0x08 .endif - B_st_macro %start + 1, \end, %stride + 1, \N +.ifnb \more + B_st_macro \N, \stride, \more .endif .endm @@ -194,86 +194,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
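// Shape of the macro rewrite above, using ldrepl_macro as the example: the old
// (start, end, stride) form generated its register range by recursing with
// .altmacro %expr arithmetic, which the __clang__-guarded code added later in
// this patch suggests clang's integrated assembler does not expand; the new form
// takes a base index plus an explicit register list and only recurses over the
// \more varargs with .ifnb. A sketch of one equivalent pair of calls:
//
//   old: ldrepl_macro 2, 4, 0
//   new: ldrepl_macro 2, 2, 3, 4
//
// both replicate doubles from A0 into $xr2, $xr3, $xr4 at byte offsets 0, 8, 16,
// because the new form computes each offset as (index - base) * 8.
// nmsub_macro and B_st_macro follow the same convention: the fixed operand
// (the multiplier register, or N plus the base index) moves to the front and the
// remaining arguments become an explicit list instead of a start/end range.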
// 255 // Sequentially extract data from A in row order // Load 0 - ldrepl_macro 0, 15, 0 + ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G0, G0, U0 - nmsub_macro 17, 31, 1, G0 + nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \ + 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 1 - ldrepl_macro 1, 15, 0 + ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G1, G1, U1 - nmsub_macro 18, 31, 2, G1 + nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \ + 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 2 - ldrepl_macro 2, 15, 0 + ldrepl_macro 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G2, G2, U2 - nmsub_macro 19, 31, 3, G2 + nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, \ + 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 3 - ldrepl_macro 3, 15, 0 + ldrepl_macro 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G3, G3, U3 - nmsub_macro 20, 31, 4, G3 + nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, \ + 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 4 - ldrepl_macro 4, 15, 0 + ldrepl_macro 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G4, G4, U4 - nmsub_macro 21, 31, 5, G4 + nmsub_macro G4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, \ + 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 5 - ldrepl_macro 5, 15, 0 + ldrepl_macro 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G5, G5, U5 - nmsub_macro 22, 31, 6, G5 + nmsub_macro G5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, \ + 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 6 - ldrepl_macro 6, 15, 0 + ldrepl_macro 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G6, G6, U6 - nmsub_macro 23, 31, 7, G6 + nmsub_macro G6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, \ + 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 7 - ldrepl_macro 7, 15, 0 + ldrepl_macro 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G7, G7, U7 - nmsub_macro 24, 31, 8, G7 + nmsub_macro G7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 8 - ldrepl_macro 8, 15, 0 + ldrepl_macro 8, 8, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G8, G8, U8 - nmsub_macro 25, 31, 9, G8 + nmsub_macro G8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 9 - ldrepl_macro 9, 15, 0 + ldrepl_macro 9, 9, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G9, G9, U9 - nmsub_macro 26, 31, 10, G9 + nmsub_macro G9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 10 - ldrepl_macro 10, 15, 0 + ldrepl_macro 10, 10, 11, 12, 13, 14, 15 GMUL xvf, d, G10, G10, U10 - nmsub_macro 27, 31, 11, G10 + nmsub_macro G10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 11 - ldrepl_macro 11, 15, 0 + ldrepl_macro 11, 11, 12, 13, 14, 15 GMUL xvf, d, G11, G11, U11 - nmsub_macro 28, 31, 12, G11 + nmsub_macro G11, 28, 12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 12 - ldrepl_macro 12, 15, 0 + ldrepl_macro 12, 12, 13, 14, 15 GMUL xvf, d, G12, G12, U12 - nmsub_macro 29, 31, 13, G12 + nmsub_macro G12, 29, 13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 13 - ldrepl_macro 13, 15, 0 + ldrepl_macro 13, 13, 14, 15 GMUL xvf, d, G13, G13, U13 - nmsub_macro 30, 31, 14, G13 + nmsub_macro G13, 30, 14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 14 - ldrepl_macro 14, 15, 
0 + ldrepl_macro 14, 14, 15 GMUL xvf, d, G14, G14, U14 - nmsub_macro 31, 31, 15, G14 + nmsub_macro G14, 31, 15 PTR_ADDI A0, A0, 17 * 8 // Load 15 - ldrepl_macro 15, 15, 0 + ldrepl_macro 15, 15 GMUL xvf, d, G15, G15, U15 // Finally, We can store the result. // For B, stored sequentially, and C, first transpose and then store - B_st_macro 16, 31, 0, \N + B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 @@ -334,46 +341,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 63 // Sequentially extract data from A in row order // Load 0 - ldrepl_macro 0, 7, 0 + ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7 GMUL xvf, d, G0, G0, U0 - nmsub_macro 17, 23, 1, G0 + nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 1 - ldrepl_macro 1, 7, 0 + ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7 GMUL xvf, d, G1, G1, U1 - nmsub_macro 18, 23, 2, G1 + nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 2 - ldrepl_macro 2, 7, 0 + ldrepl_macro 2, 2, 3, 4, 5, 6, 7 GMUL xvf, d, G2, G2, U2 - nmsub_macro 19, 23, 3, G2 + nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 3 - ldrepl_macro 3, 7, 0 + ldrepl_macro 3, 3, 4, 5, 6, 7 GMUL xvf, d, G3, G3, U3 - nmsub_macro 20, 23, 4, G3 + nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 4 - ldrepl_macro 4, 7, 0 + ldrepl_macro 4, 4, 5, 6, 7 GMUL xvf, d, G4, G4, U4 - nmsub_macro 21, 23, 5, G4 + nmsub_macro G4, 21, 5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 5 - ldrepl_macro 5, 7, 0 + ldrepl_macro 5, 5, 6, 7 GMUL xvf, d, G5, G5, U5 - nmsub_macro 22, 23, 6, G5 + nmsub_macro G5, 22, 6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 6 - ldrepl_macro 6, 7, 0 + ldrepl_macro 6, 6, 7 GMUL xvf, d, G6, G6, U6 - nmsub_macro 23, 23, 7, G6 + nmsub_macro G6, 23, 7 PTR_ADDI A0, A0, 9 * 8 // Load 7 - ldrepl_macro 7, 7, 0 + ldrepl_macro 7, 7 GMUL xvf, d, G7, G7, U7 // Finally, We can store the result. // For B, stored sequentially, and C, first transpose and then store - B_st_macro 16, 23, 0, \N + B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 .if \N == 4 @@ -437,26 +444,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 15 // Sequentially extract data from A in row order // Load 0 - ldrepl_macro 0, 3, 0 + ldrepl_macro 0, 0, 1, 2, 3 GMUL xvf, d, G0, G0, U0 - nmsub_macro 17, 19, 1, G0 + nmsub_macro G0, 17, 1, 18, 2, 19, 3 PTR_ADDI A0, A0, 5 * 8 // Load 1 - ldrepl_macro 1, 3, 0 + ldrepl_macro 1, 1, 2, 3 GMUL xvf, d, G1, G1, U1 - nmsub_macro 18, 19, 2, G1 + nmsub_macro G1, 18, 2, 19, 3 PTR_ADDI A0, A0, 5 * 8 // Load 2 - ldrepl_macro 2, 3, 0 + ldrepl_macro 2, 2, 3 GMUL xvf, d, G2, G2, U2 - nmsub_macro 19, 19, 3, G2 + nmsub_macro G2, 19, 3 PTR_ADDI A0, A0, 5 * 8 // Load 3 - ldrepl_macro 3, 3, 0 + ldrepl_macro 3, 3 GMUL xvf, d, G3, G3, U3 // Finally, We can store the result. // For B, stored sequentially, and C, first transpose and then store - B_st_macro 16, 19, 0, \N + B_st_macro \N, 16, 16, 17, 18, 19 GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 .if \N == 4 GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 @@ -501,16 +508,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 3 // Sequentially extract data from A in row order // Load 0 - ldrepl_macro 0, 1, 0 + ldrepl_macro 0, 0, 1 GMUL xvf, d, G0, G0, U0 - nmsub_macro 17, 17, 1, G0 + nmsub_macro G0, 17, 1 PTR_ADDI A0, A0, 3 * 8 // Load 1 - ldrepl_macro 1, 1, 0 + ldrepl_macro 1, 1 GMUL xvf, d, G1, G1, U1 // Finally, We can store the result. // For B, stored sequentially, and C, first transpose and then store - B_st_macro 16, 17, 0, \N + B_st_macro \N, 16, 16, 17 GSBUTTERFLY xv, d, U0, U1, G1, G0 .if \N == 4 vst $vr0, C0, 0x00 @@ -717,7 +724,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 PTR_SLLI LDC, LDC, 3 /* if (!(N >> 2)) goto L_N3 */ PTR_SRAI J, N, 2 /* J = bn >> 2 */ @@ -954,6 +961,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADD AA, AA, T0 // aa += 1 * k .L_N1_M0: .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S index 421339736..be378631b 100644 --- a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S +++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S @@ -128,33 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dtrsm_kernel_macro.S" -.macro ldrepl_macro start, end, stride +.macro ldrepl_macro stride:req, index:req, more:vararg // Load Ux (x = 0...15) -.if \start <= \end - GLDREPL xv, d, $xr\start, B0, \stride * 8 - ldrepl_macro %start + 1, \end, %stride + 1 + GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8 +.ifnb \more + ldrepl_macro \stride, \more .endif .endm - -.macro nmsub_macro start0, end0, start1, reg -// Ux -= reg * Dx -.if \start0 <= \end0 +.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg +// Gx -= reg * Ux xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 - nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.ifnb \more + nmsub_macro \reg, \more .endif .endm - -.macro A_st_macro start, end, stride, N -// Store Ux(x = 0...15) -.if \start <= \end +.macro A_st_macro N:req, stride:req, start:req, more:vararg +// Store Gx(x = 16...31) .if \N == 4 - xvst $xr\start, A0, \stride * 0x20 + xvst $xr\start, A0, \start * 0x20 - \stride * 0x20 .elseif \N == 2 - vst $vr\start, A0, \stride * 0x10 + vst $vr\start, A0, \start * 0x10 - \stride * 0x10 .elseif \N == 1 - fst.d $f\start, A0, \stride * 0x08 + fst.d $f\start, A0, \start * 0x08 - \stride * 0x08 .endif - A_st_macro %start + 1, \end, %stride + 1, \N +.ifnb \more + A_st_macro \N, \stride, \more .endif .endm @@ -167,22 +165,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
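// The RN/RT kernels rewrite nmsub_macro the same way: the multiplier register now
// leads, followed by explicit (dst, src) xr-index pairs instead of a start/end
// range. One pair of equivalent calls from the hunk that follows:
//
//   old: nmsub_macro 4, 7, 0, D1
//   new: nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
//
// both expand to
//   xvfnmsub.d $xr4, D1, $xr0, $xr4
//   xvfnmsub.d $xr5, D1, $xr1, $xr5
//   xvfnmsub.d $xr6, D1, $xr2, $xr6
//   xvfnmsub.d $xr7, D1, $xr3, $xr7
//
// and a call such as "ldrepl_macro 15, 20, 21, 22" keeps the old
// "ldrepl_macro 20, 22, 5" offsets, since each offset is computed as
// (index - 15) * 8, i.e. doublewords 5, 6, 7 from B0.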
// 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 - ldrepl_macro 20, 22, 5 - nmsub_macro 4, 7, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + + nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 - ldrepl_macro 25, 25, 15 - nmsub_macro 8, 11, 0, D2 - nmsub_macro 8, 11, 4, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 8, 0, 9, 1, 10, 2, 11, 3 + nmsub_macro D5, 8, 4, 9, 5, 10, 6, 11, 7 GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 - nmsub_macro 12, 15, 0, D3 - nmsub_macro 12, 15, 4, D6 - nmsub_macro 12, 15, 8, D8 + nmsub_macro D3, 12, 0, 13, 1, 14, 2, 15, 3 + nmsub_macro D6, 12, 4, 13, 5, 14, 6, 15, 7 + nmsub_macro D8, 12, 8, 13, 9, 14, 10, 15, 11 GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 // Store A - A_st_macro 0, 15, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ @@ -197,13 +196,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 - ldrepl_macro 18, 18, 3 - nmsub_macro 4, 7, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 // Store A - A_st_macro 0, 7, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 @@ -218,22 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0, U1, D0, U1 - ldrepl_macro 20, 22, 5 - nmsub_macro 2, 3, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + nmsub_macro D1, 2, 0, 3, 1 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U2, D4, U2, U3, D4, U3 - ldrepl_macro 25, 25, 15 - nmsub_macro 4, 5, 0, D2 - nmsub_macro 4, 5, 2, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 4, 0, 5, 1 + nmsub_macro D5, 4, 2, 5, 3 GMUL xvf, d, U4, D7, U4, U5, D7, U5 - nmsub_macro 6, 7, 0, D3 - nmsub_macro 6, 7, 2, D6 - nmsub_macro 6, 7, 4, D8 + nmsub_macro D3, 6, 0, 7, 1 + nmsub_macro D6, 6, 2, 7, 3 + nmsub_macro D8, 6, 4, 7, 5 GMUL xvf, d, U6, D9, U6, U7, D9, U7 // Store A - A_st_macro 0, 7, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ U2, C1, 0x00, U3, C1, 0x20, \ @@ -248,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0, U1, D0, U1 - ldrepl_macro 18, 18, 3 - nmsub_macro 2, 3, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 2, 0, 3, 1 GMUL xvf, d, U2, D2, U2, U3, D2, U3 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ U2, C1, 0x00, U3, C1, 0x20 @@ -269,22 +268,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 20, 22, 5 - nmsub_macro 1, 1, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + nmsub_macro D1, 1, 0 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U1, D4, U1 - ldrepl_macro 25, 25, 15 - nmsub_macro 2, 2, 0, D2 - nmsub_macro 2, 2, 1, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 2, 0 + nmsub_macro D5, 2, 1 GMUL xvf, d, U2, D7, U2 - nmsub_macro 3, 3, 0, D3 - nmsub_macro 3, 3, 1, D6 - nmsub_macro 3, 3, 2, D8 + nmsub_macro D3, 3, 0 + nmsub_macro D6, 3, 1 + nmsub_macro D8, 3, 2 GMUL xvf, d, U3, D9, U3 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Store C GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 .endm @@ -296,13 +295,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 18, 18, 3 - nmsub_macro 1, 1, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 1, 0 GMUL xvf, d, U1, D2, U1 // Store A - A_st_macro 0, 1, 0, 4 + A_st_macro 4, 0, 0, 1 // Store C GST xv, , U0, C0, 0x00, U1, C1, 0x00 .endm @@ -316,23 +315,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 20, 22, 5 - nmsub_macro 1, 1, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + nmsub_macro D1, 1, 0 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U1, D4, U1 - ldrepl_macro 25, 25, 15 - nmsub_macro 2, 2, 0, D2 - nmsub_macro 2, 2, 1, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 2, 0 + nmsub_macro D5, 2, 1 GMUL xvf, d, U2, D7, U2 - nmsub_macro 3, 3, 0, D3 - nmsub_macro 3, 3, 1, D6 - nmsub_macro 3, 3, 2, D8 + nmsub_macro D3, 3, 0 + nmsub_macro D6, 3, 1 + nmsub_macro D8, 3, 2 GMUL xvf, d, U3, D9, U3 // Store A - A_st_macro 0, 3, 0, 2 + A_st_macro 2, 0, 0, 1, 2, 3 // Store C GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, .endm @@ -344,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 18, 18, 3 - nmsub_macro 1, 1, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 1, 0 GMUL xvf, d, U1, D2, U1 // Store A - A_st_macro 0, 1, 0, 2 + A_st_macro 2, 0, 0, 1 // Store C GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 .endm @@ -364,23 +363,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 10 11 // 15 // Sequentially extract data from B in row order - ldrepl_macro 16, 19, 0 + ldrepl_macro 16, 16, 17, 18, 19 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 20, 22, 5 - nmsub_macro 1, 1, 0, D1 - ldrepl_macro 23, 24, 10 + ldrepl_macro 15, 20, 21, 22 + nmsub_macro D1, 1, 0 + ldrepl_macro 13, 23, 24 GMUL xvf, d, U1, D4, U1 - ldrepl_macro 25, 25, 15 - nmsub_macro 2, 2, 0, D2 - nmsub_macro 2, 2, 1, D5 + ldrepl_macro 10, 25 + nmsub_macro D2, 2, 0 + nmsub_macro D5, 2, 1 GMUL xvf, d, U2, D7, U2 - nmsub_macro 3, 3, 0, D3 - nmsub_macro 3, 3, 1, D6 - nmsub_macro 3, 3, 2, D8 + nmsub_macro D3, 3, 0 + nmsub_macro D6, 3, 1 + nmsub_macro D8, 3, 2 GMUL xvf, d, U3, D9, U3 // Store A - A_st_macro 0, 3, 0, 1 + A_st_macro 1, 0, 0, 1, 2, 3 // Store C GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, .endm @@ -392,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 1 // 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 17, 0 + ldrepl_macro 16, 16, 17 GMUL xvf, d, U0, D0, U0 - ldrepl_macro 18, 18, 3 - nmsub_macro 1, 1, 0, D1 + ldrepl_macro 15, 18 + nmsub_macro D1, 1, 0 GMUL xvf, d, U1, D2, U1 // Store A - A_st_macro 0, 1, 0, 1 + A_st_macro 1, 0, 0, 1 // Store C GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 .endm @@ -582,10 +581,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvld U2, C0, 0x40 xvld U3, C0, 0x60 .L_dsolve_16x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Strore C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 .endm @@ -599,10 +598,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvld U0, C0, 0x00 xvld U1, C0, 0x20 .L_dsolve_8x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0, U1, D0, U1 // Store A - A_st_macro 0, 1, 0, 4 + A_st_macro 4, 0, 0, 1 // Strore C GST xv, , U0, C0, 0x00, U1, C0, 0x20 .endm @@ -615,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Load C0 */ xvld U0, C0, 0x00 .L_dsolve_4x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 4 + A_st_macro 4, 0, 0 // Strore C GST xv, , U0, C0, 0x00 .endm @@ -631,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Load C0 */ xvld U0, C0, 0x00 .L_dsolve_2x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 2 + A_st_macro 2, 0, 0 // Strore C GST v, , $vr0, C0, 0x00 .endm @@ -647,16 +646,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Load C fld.d $f0, C0, 0x00 .L_dsolve_1x1: - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 1 + A_st_macro 1, 0, 0 // Strore C GST f, d, $f0, C0, 0x00 .endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 PTR_SLLI LDC, LDC, 3 PTR_SUB KK, ZERO, OFFSET /* if (!(N >> 2)) goto L_N3 */ @@ -877,6 +876,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PTR_ADD AA, AA, T0 // aa += 1 * k .L_N1_M0: .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S index 5f86d75b5..fb0877523 100644 --- a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S +++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S @@ -111,33 +111,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dtrsm_kernel_macro.S" -.macro ldrepl_macro start, end, stride +.macro ldrepl_macro stride:req, index:req, more:vararg // Load Ux (x = 0...15) -.if \start <= \end - GLDREPL xv, d, $xr\start, B0, \stride * 8 - ldrepl_macro %start + 1, \end, %stride + 1 + GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8 +.ifnb \more + ldrepl_macro \stride, \more .endif .endm - -.macro nmsub_macro start0, end0, start1, reg -// Ux -= reg * Dx -.if \start0 <= \end0 +.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg +// Gx -= reg * Ux xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 - nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.ifnb \more + nmsub_macro \reg, \more .endif .endm - -.macro A_st_macro start, end, stride, N -// Store Ux(x = 0...15) -.if \start <= \end +.macro A_st_macro N:req, stride:req, start:req, more:vararg +// Store Gx(x = 16...31) .if \N == 4 - xvst $xr\start, A0, \stride * 0x20 + xvst $xr\start, A0, \start * 0x20 - \stride * 0x20 .elseif \N == 2 - vst $vr\start, A0, \stride * 0x10 + vst $vr\start, A0, \start * 0x10 - \stride * 0x10 .elseif \N == 1 - fst.d $f\start, A0, \stride * 0x08 + fst.d $f\start, A0, \start * 0x08 - \stride * 0x08 .endif - A_st_macro %start + 1, \end, %stride + 1, \N +.ifnb \more + A_st_macro \N, \stride, \more .endif .endm @@ -148,13 +146,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 - nmsub_macro 0, 3, 4, D1 + nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 // Store A - A_st_macro 0, 7, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 @@ -167,13 +165,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U2, D2, U2, U3, D2, U3 - nmsub_macro 0, 1, 2, D1 + nmsub_macro D1, 0, 2, 1, 3 GMUL xvf, d, U0, D0, U0, U1, D0, U1 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ U2, C1, 0x00, U3, C1, 0x20 @@ -186,13 +184,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 1, 0, 4 + A_st_macro 4, 0, 0, 1 // Store C GST xv, , U0, C0, 0x00, U1, C1, 0x00 .endm @@ -204,13 +202,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 1, 0, 2 + A_st_macro 2, 0, 0, 1 // Store C GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 .endm @@ -222,13 +220,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //0 //2 3 // Sequentially extract data from B in row order - ldrepl_macro 16, 16, 0 - ldrepl_macro 17, 18, 2 + ldrepl_macro 16, 16 + ldrepl_macro 15, 17, 18 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 1, 0, 1 + A_st_macro 1, 0, 0, 1 // Store C GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 .endm @@ -242,22 +240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 - ldrepl_macro 19, 21, 8 - nmsub_macro 8, 11, 12, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 8, 12, 9, 13, 10, 14, 11, 15 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 - ldrepl_macro 16, 16, 0 - nmsub_macro 4, 7, 12, D7 - nmsub_macro 4, 7, 8, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 4, 12, 5, 13, 6, 14, 7, 15 + nmsub_macro D4, 4, 8, 5, 9, 6, 10, 7, 11 GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 - nmsub_macro 0, 3, 12, D6 - nmsub_macro 0, 3, 8, D3 - nmsub_macro 0, 3, 4, D1 + nmsub_macro D6, 0, 12, 1, 13, 2, 14, 3, 15 + nmsub_macro D3, 0, 8, 1, 9, 2, 10, 3, 11 + nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 // Store A - A_st_macro 0, 15, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ @@ -274,22 +272,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U6, D9, U6, U7, D9, U7 - ldrepl_macro 19, 21, 8 - nmsub_macro 4, 5, 6, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 4, 6, 5, 7 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U4, D5, U4, U5, D5, U5 - ldrepl_macro 16, 16, 0 - nmsub_macro 2, 3, 6, D7 - nmsub_macro 2, 3, 4, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 2, 6, 3, 7 + nmsub_macro D4, 2, 4, 3, 5 GMUL xvf, d, U2, D2, U2, U3, D2, U3 - nmsub_macro 0, 1, 6, D6 - nmsub_macro 0, 1, 4, D3 - nmsub_macro 0, 1, 2, D1 + nmsub_macro D6, 0, 6, 1, 7 + nmsub_macro D3, 0, 4, 1, 5 + nmsub_macro D1, 0, 2, 1, 3 GMUL xvf, d, U0, D0, U0, U1, D0, U1 // Store A - A_st_macro 0, 7, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 // Store C GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ U2, C1, 0x00, U3, C1, 0x20, \ @@ -306,22 +304,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U3, D9, U3 - ldrepl_macro 19, 21, 8 - nmsub_macro 2, 2, 3, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 2, 3 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U2, D5, U2 - ldrepl_macro 16, 16, 0 - nmsub_macro 1, 1, 3, D7 - nmsub_macro 1, 1, 2, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 1, 3 + nmsub_macro D4, 1, 2 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 3, D6 - nmsub_macro 0, 0, 2, D3 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D6, 0, 3 + nmsub_macro D3, 0, 2 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Store C GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 .endm @@ -335,22 +333,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U3, D9, U3 - ldrepl_macro 19, 21, 8 - nmsub_macro 2, 2, 3, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 2, 3 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U2, D5, U2 - ldrepl_macro 16, 16, 0 - nmsub_macro 1, 1, 3, D7 - nmsub_macro 1, 1, 2, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 1, 3 + nmsub_macro D4, 1, 2 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 3, D6 - nmsub_macro 0, 0, 2, D3 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D6, 0, 3 + nmsub_macro D3, 0, 2 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 3, 0, 2 + A_st_macro 2, 0, 0, 1, 2, 3 // Store C GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 .endm @@ -364,22 +362,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //8 9 10 //12 13 14 15 // Sequentially extract data from B in row order - ldrepl_macro 22, 25, 12 + ldrepl_macro 10, 22, 23, 24, 25 GMUL xvf, d, U3, D9, U3 - ldrepl_macro 19, 21, 8 - nmsub_macro 2, 2, 3, D8 - ldrepl_macro 17, 18, 4 + ldrepl_macro 11, 19, 20, 21 + nmsub_macro D8, 2, 3 + ldrepl_macro 13, 17, 18 GMUL xvf, d, U2, D5, U2 - ldrepl_macro 16, 16, 0 - nmsub_macro 1, 1, 3, D7 - nmsub_macro 1, 1, 2, D4 + ldrepl_macro 16, 16 + nmsub_macro D7, 1, 3 + nmsub_macro D4, 1, 2 GMUL xvf, d, U1, D2, U1 - nmsub_macro 0, 0, 3, D6 - nmsub_macro 0, 0, 2, D3 - nmsub_macro 0, 0, 1, D1 + nmsub_macro D6, 0, 3 + nmsub_macro D3, 0, 2 + nmsub_macro D1, 0, 1 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 3, 0, 1 + A_st_macro 1, 0, 0, 1, 2, 3 // Store C GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, .endm @@ -399,10 +397,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_dsolve_16x1: PTR_ADDI A0, T1, -16 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 // Store A - A_st_macro 0, 3, 0, 4 + A_st_macro 4, 0, 0, 1, 2, 3 // Strore C GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 .endm @@ -420,10 +418,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_dsolve_8x1: PTR_ADDI A0, T1, -8 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0, U1, D0, U1 // Store A - A_st_macro 0, 1, 0, 4 + A_st_macro 4, 0, 0, 1 // Strore C GST xv, , U0, C0, 0x00, U1, C0, 0x20 .endm @@ -440,10 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L_dsolve_4x1: PTR_ADDI A0, T1, -4 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 4 + A_st_macro 4, 0, 0 // Strore C GST xv, , U0, C0, 0x00 .endm @@ -460,10 +458,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_dsolve_2x1: PTR_ADDI A0, T1, -2 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 2 + A_st_macro 2, 0, 0 // Strore C GST v, , $vr0, C0, 0x00 .endm @@ -480,10 +478,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L_dsolve_1x1: PTR_ADDI A0, T1, -1 * 8 PTR_ADDI B0, T2, -1 * 8 - ldrepl_macro 16, 16, 0 + ldrepl_macro 16, 16 GMUL xvf, d, U0, D0, U0 // Store A - A_st_macro 0, 0, 0, 1 + A_st_macro 1, 0, 0 // Strore C GST f, d, $f0, C0, 0x00 .endm @@ -697,7 +695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm PROLOGUE - push_if_used 26, 32 + push_if_used 9, 8 PTR_SLLI LDC, LDC, 3 PTR_SUB KK, N, OFFSET PTR_MUL T0, N, LDC @@ -948,6 +946,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_ADDI KK, KK, -4 bnez J, .L_J1 .L_N0: - pop_if_used 26, 32 + pop_if_used 9, 8 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index d097b3045..a2221491b 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -90,57 +90,175 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PTR_FST fst.d #endif -// The max registers available to the user which -// do not need to be preserved across calls. -// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html -#define MAX_INT_CALLER_SAVED 17 -#define MAX_FP_CALLER_SAVED 24 - .altmacro // Enable alternate macro mode +/* + * Pushing and popping static registers into/from the stack. 
+ * regs : number of static general-purpose registers, greater than or equal to 0, less than or equal to 9 + * fregs: number of static floating-point registers, greater than or equal to 0, less than or equal to 8 + */ .macro push_if_used regs, fregs -.if \regs > MAX_INT_CALLER_SAVED - PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) - push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 +.if \regs > 0 + PTR_ADDI $sp, $sp, -(\regs << REG_LOG) + push_regs 0, \regs - 1 .endif -.if \fregs > MAX_FP_CALLER_SAVED - PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) - push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 +.if \fregs > 0 + PTR_ADDI $sp, $sp, -(\fregs << FREG_LOG) + push_fregs 0, \fregs - 1 .endif .endm // End push_if_used + .macro pop_if_used regs, fregs -.if \fregs > MAX_FP_CALLER_SAVED - pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 - PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG +.if \fregs > 0 + pop_fregs 0, \fregs - 1 + PTR_ADDI $sp, $sp, \fregs << FREG_LOG .endif -.if \regs > MAX_INT_CALLER_SAVED - pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 - PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG +.if \regs > 0 + pop_regs 0, \regs - 1 + PTR_ADDI $sp, $sp, \regs << REG_LOG .endif .endm // End pop_if_used + .macro push_regs from, to - PTR_ST $s\()\from, $sp, \from << REG_LOG +#ifdef __clang__ +.if \to >= 0 + PTR_ST $s0, $sp, 0 << REG_LOG +.endif +.if \to >= 1 + PTR_ST $s1, $sp, 1 << REG_LOG +.endif +.if \to >= 2 + PTR_ST $s2, $sp, 2 << REG_LOG +.endif +.if \to >= 3 + PTR_ST $s3, $sp, 3 << REG_LOG +.endif +.if \to >= 4 + PTR_ST $s4, $sp, 4 << REG_LOG +.endif +.if \to >= 5 + PTR_ST $s5, $sp, 5 << REG_LOG +.endif +.if \to >= 6 + PTR_ST $s6, $sp, 6 << REG_LOG +.endif +.if \to >= 7 + PTR_ST $s7, $sp, 7 << REG_LOG +.endif +.if \to >= 8 + PTR_ST $s8, $sp, 8 << REG_LOG +.endif +#else + PTR_ST $s\()\from, $sp, \from << REG_LOG .if \to - \from push_regs %from + 1, \to .endif +#endif .endm // End push_regs + .macro pop_regs from, to +#ifdef __clang__ +.if \to >= 0 + PTR_LD $s0, $sp, 0 << REG_LOG +.endif +.if \to >= 1 + PTR_LD $s1, $sp, 1 << REG_LOG +.endif +.if \to >= 2 + PTR_LD $s2, $sp, 2 << REG_LOG +.endif +.if \to >= 3 + PTR_LD $s3, $sp, 3 << REG_LOG +.endif +.if \to >= 4 + PTR_LD $s4, $sp, 4 << REG_LOG +.endif +.if \to >= 5 + PTR_LD $s5, $sp, 5 << REG_LOG +.endif +.if \to >= 6 + PTR_LD $s6, $sp, 6 << REG_LOG +.endif +.if \to >= 7 + PTR_LD $s7, $sp, 7 << REG_LOG +.endif +.if \to >= 8 + PTR_LD $s8, $sp, 8 << REG_LOG +.endif +#else PTR_LD $s\()\from, $sp, \from << REG_LOG .if \to - \from pop_regs %from + 1, \to .endif +#endif .endm // End pop_regs + .macro push_fregs from, to +#ifdef __clang__ +.if \to >= 0 + PTR_FST $fs0, $sp, 0 << FREG_LOG +.endif +.if \to >= 1 + PTR_FST $fs1, $sp, 1 << FREG_LOG +.endif +.if \to >= 2 + PTR_FST $fs2, $sp, 2 << FREG_LOG +.endif +.if \to >= 3 + PTR_FST $fs3, $sp, 3 << FREG_LOG +.endif +.if \to >= 4 + PTR_FST $fs4, $sp, 4 << FREG_LOG +.endif +.if \to >= 5 + PTR_FST $fs5, $sp, 5 << FREG_LOG +.endif +.if \to >= 6 + PTR_FST $fs6, $sp, 6 << FREG_LOG +.endif +.if \to >= 7 + PTR_FST $fs7, $sp, 7 << FREG_LOG +.endif +#else PTR_FST $fs\()\from, $sp, \from << FREG_LOG .if \to - \from push_fregs %from + 1, \to .endif +#endif .endm // End push_fregs + .macro pop_fregs from, to +#ifdef __clang__ +.if \to >= 0 + PTR_FLD $fs0, $sp, 0 << FREG_LOG +.endif +.if \to >= 1 + PTR_FLD $fs1, $sp, 1 << FREG_LOG +.endif +.if \to >= 2 + PTR_FLD $fs2, $sp, 2 << FREG_LOG +.endif +.if \to >= 3 + PTR_FLD $fs3, $sp, 3 << FREG_LOG +.endif +.if \to 
>= 4 + PTR_FLD $fs4, $sp, 4 << FREG_LOG +.endif +.if \to >= 5 + PTR_FLD $fs5, $sp, 5 << FREG_LOG +.endif +.if \to >= 6 + PTR_FLD $fs6, $sp, 6 << FREG_LOG +.endif +.if \to >= 7 + PTR_FLD $fs7, $sp, 7 << FREG_LOG +.endif +#else PTR_FLD $fs\()\from, $sp, \from << FREG_LOG .if \to - \from pop_fregs %from + 1, \to .endif +#endif .endm // End pop_fregs // @@ -275,7 +393,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // GXOR // .macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg - \pre_op\()xor.\suf_op \out, \in0, \in1 +.ifnb \pre_op + \pre_op\()xor.v \out, \in0, \in1 +.else + xor.\suf_op \out, \in0, \in1 +.endif .ifnb \more GXOR \pre_op, \suf_op, \more .endif @@ -307,6 +429,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GPRELD \more .endif .endm +// +// GPACKEV +// +.macro GPACKEV pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()packev.\suf_op \out, \in0, \in1 +.ifnb \more + GPACKEV \pre_op, \suf_op, \more +.endif +.endm +// +// GPACKOD +// +.macro GPACKOD pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()packod.\suf_op \out, \in0, \in1 +.ifnb \more + GPACKOD \pre_op, \suf_op, \more +.endif +.endm +// +// GSHUF4I +// +.macro GSHUF4I pre_op:req, suf_op:req, out:req, in0:req, in1:req /* imm */, more:vararg + \pre_op\()shuf4i.\suf_op \out, \in0, \in1 +.ifnb \more + GSHUF4I \pre_op, \suf_op, \more +.endif +.endm + +.macro TRANSF2G name, pre_op:req, suf_op:req, more:vararg +.ifeqs "\pre_op\()\suf_op", "vfs" + \name v, w, \more +.endif +.ifeqs "\pre_op\()\suf_op", "vfd" + \name v, d, \more +.endif +.ifeqs "\pre_op\()\suf_op", "xvfs" + \name xv, w, \more +.endif +.ifeqs "\pre_op\()\suf_op", "xvfd" + \name xv, d, \more +.endif +.endm // // Compound instructions @@ -314,61 +478,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
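// TRANSF2G (added above) maps a floating-point (pre_op, suf_op) pair onto the
// integer-vector prefix/width expected by GXOR/GPACKEV/GPACKOD/GSHUF4I, which is
// what lets GCOMPLEXMUL and GCOMPLEXMADD below collapse their duplicated xvf/vf
// branches. A sketch of one expansion chain (the register numbers are only
// placeholders):
//
//   TRANSF2G GSHUF4I, xvf, s, $xr2, $xr1, 0xb1
//     -> GSHUF4I xv, w, $xr2, $xr1, 0xb1
//     -> xvshuf4i.w $xr2, $xr1, 0xb1
//
// GXOR likewise now emits the width-agnostic ".v" form (vxor.v / xvxor.v)
// whenever a vector prefix is present and only falls back to xor.\suf_op
// otherwise, so the GCOMPLEXMUL/GCOMPLEXMADD rewrites can clear \tmp1 with
// "TRANSF2G GXOR, \pre_op, s, ..." regardless of the element width.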
// GACC: Accumulate the values of vector registers // .macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg -.ifeqs "\pre_op", "xvf" +.ifeqs "\pre_op\()\suf_op", "xvfd" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "xvfs" xvpermi.q \out, \in, 0x01 \pre_op\()add.\suf_op \in, \out, \in xvpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.ifeqs "\suf_op", "s" xvpackod.w \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in .endif -.endif - -.ifeqs "\pre_op", "vf" +.ifeqs "\pre_op\()\suf_op", "vfd" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "vfs" vpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.ifeqs "\suf_op", "s" vpackod.w \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in .endif -.endif -.ifeqs "\pre_op", "xv" +.ifeqs "\pre_op\()\suf_op", "xvd" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "xvw" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "xvh" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in + xvpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "xvb" xvpermi.q \out, \in, 0x01 \pre_op\()add.\suf_op \in, \out, \in xvpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "d" xvpackod.w \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "w" xvpackod.h \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "h" xvpackod.b \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in .endif -.endif -.endif -.endif -.ifeqs "\pre_op", "v" +.ifeqs "\pre_op\()\suf_op", "vd" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "vw" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "vh" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in + vpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.ifeqs "\pre_op\()\suf_op", "vb" vpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "d" vpackod.w \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "w" vpackod.h \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "h" vpackod.b \in, \out, \out \pre_op\()add.\suf_op \out, \out, \in .endif -.endif -.endif -.endif .ifnb \more GACC \pre_op, \suf_op, \more @@ -391,26 +590,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Note: When "pre_op = xvf && suf_op = s", in will be modified. 
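// The GACC rewrite above trades the nested .ifeqs/.ifnc chains for one flat
// .ifeqs branch per (pre_op, suf_op) combination; every branch still emits the
// original reduction sequence. For example, GACC xvf, d, $xr1, $xr2 expands to
// (register numbers are placeholders):
//
//   xvpermi.q  $xr1, $xr2, 0x01     // bring the upper 128-bit lane of \in down
//   xvfadd.d   $xr2, $xr1, $xr2     // add the two lanes
//   xvpackod.d $xr1, $xr2, $xr2     // pair the remaining odd/even doubles
//   xvfadd.d   $xr1, $xr1, $xr2     // element 0 of \out now holds the reduced sum
//
// which matches instruction-for-instruction what the old "xvf"/"d" path generated.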
// .macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg -.ifeqs "\pre_op", "xvf" +.ifeqs "\pre_op\()\suf_op", "xvfd" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \out, \out, \in +.endif + +.ifeqs "\pre_op\()\suf_op", "xvfs" xvpermi.q \out, \in, 0x01 -.ifeqs "\suf_op", "s" \pre_op\()add.\suf_op \in, \out, \in xvpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in -.else - \pre_op\()add.\suf_op \out, \out, \in -.endif .endif -.ifeqs "\pre_op", "vf" -.ifeqs "\suf_op", "s" - vpackod.d \out, \in, \in - \pre_op\()add.\suf_op \out, \out, \in -.else +.ifeqs "\pre_op\()\suf_op", "vfd" vor.v \out, \in, \in .endif -.endif +.ifeqs "\pre_op\()\suf_op", "vfs" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif .ifnb \more GCOMPLEXACC \pre_op, \suf_op, \more @@ -430,56 +629,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // suf_op: s or d, differentiate between single precision or double precision complex numbers // .macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg -.ifeqs "\pre_op", "xvf" - xvxor.v \tmp1, \tmp1, \tmp1 -.ifeqs "\suf_op", "s" - xvpackev.w \tmp0, \in0, \in0 -.else - xvpackev.d \tmp0, \in0, \in0 -.endif -.else - vxor.v \tmp1, \tmp1, \tmp1 -.ifeqs "\suf_op", "s" - vpackev.w \tmp0, \in0, \in0 -.else - vpackev.d \tmp0, \in0, \in0 -.endif -.endif + TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1 + TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0 \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 -.ifeqs "\pre_op", "xvf" +.ifeqs "\xconj", "0" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 +.else + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 +.endif + .ifeqs "\suf_op", "s" -.ifeqs "\xconj", "0" - xvpackod.w \tmp1, \in0, \tmp1 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .else - xvpackod.w \tmp1, \tmp1, \in0 -.endif - xvshuf4i.w \tmp2, \in1, 0xb1 -.else -.ifeqs "\xconj", "0" - xvpackod.d \tmp1, \in0, \tmp1 -.else - xvpackod.d \tmp1, \tmp1, \in0 -.endif - xvshuf4i.d \tmp2, \in1, 0x0b -.endif -.else -.ifeqs "\suf_op", "s" -.ifeqs "\xconj", "0" - vpackod.w \tmp1, \in0, \tmp1 -.else - vpackod.w \tmp1, \tmp1, \in0 -.endif - vshuf4i.w \tmp2, \in1, 0xb1 -.else -.ifeqs "\xconj", "0" - vpackod.d \tmp1, \in0, \tmp1 -.else - vpackod.d \tmp1, \tmp1, \in0 -.endif - vshuf4i.d \tmp2, \in1, 0x0b -.endif + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b .endif \pre_op\()mul.\suf_op \out, \tmp0, \in1 @@ -512,112 +676,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// suf_op: s or d, differentiate between single precision or double precision complex numbers // .macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg -.ifeqs "\pre_op", "xvf" - xvxor.v \tmp1, \tmp1, \tmp1 -.ifeqs "\suf_op", "s" - xvpackev.w \tmp0, \in0, \in0 -.else - xvpackev.d \tmp0, \in0, \in0 -.endif -.else - vxor.v \tmp1, \tmp1, \tmp1 -.ifeqs "\suf_op", "s" - vpackev.w \tmp0, \in0, \in0 -.else - vpackev.d \tmp0, \in0, \in0 -.endif -.endif + TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1 + TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0 \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 -.ifeqs "\conj", "1" + +.ifeqs "\conj\()\suf_op", "1s" \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 -.ifeqs "\pre_op", "xvf" -.ifeqs "\suf_op", "s" - xvshuf4i.w \tmp0, \tmp0, 0xb1 - xvpackev.w \out, \tmp0, \tmp2 -.else - xvshuf4i.d \tmp0, \tmp0, 0x0b - xvpackev.d \out, \tmp0, \tmp2 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0xb1 + TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2 .endif -.else -.ifeqs "\suf_op", "s" - vshuf4i.w \tmp0, \tmp0, 0xb1 - vpackev.w \out, \tmp0, \tmp2 -.else - vshuf4i.d \tmp0, \tmp0, 0x0b - vpackev.d \out, \tmp0, \tmp2 +.ifeqs "\conj\()\suf_op", "1d" + \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0x0b + TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2 .endif -.endif /* pre_op = xvf */ -.else +.ifeqs "\conj", "0" \pre_op\()add.\suf_op \out, \tmp2, \tmp1 -.endif /* conj = 1 */ +.endif \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 -.ifeqs "\pre_op", "xvf" -.ifeqs "\suf_op", "s" -.ifeqs "\conj", "0" -.ifeqs "\xconj", "0" - xvpackod.w \tmp1, \in0, \tmp1 -.else - xvpackod.w \tmp1, \tmp1, \in0 +.ifeqs "\xconj\()\conj\()\suf_op", "00s" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .endif -.else -.ifeqs "\xconj", "0" - xvpackod.w \tmp1, \in0, \in0 -.else - xvpackod.w \tmp1, \tmp1, \tmp1 +.ifeqs "\xconj\()\conj\()\suf_op", "10s" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .endif +.ifeqs "\xconj\()\conj\()\suf_op", "01s" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .endif - xvshuf4i.w \tmp2, \in1, 0xb1 -.else -.ifeqs "\conj", "0" -.ifeqs "\xconj", "0" - xvpackod.d \tmp1, \in0, \tmp1 -.else - xvpackod.d \tmp1, \tmp1, \in0 +.ifeqs "\xconj\()\conj\()\suf_op", "11s" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 .endif -.else -.ifeqs "\xconj", "0" - xvpackod.d \tmp1, \in0, \in0 -.else - xvpackod.d \tmp1, \tmp1, \tmp1 +.ifeqs "\xconj\()\conj\()\suf_op", "00d" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b .endif +.ifeqs "\xconj\()\conj\()\suf_op", "10d" + TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 + TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b .endif - xvshuf4i.d \tmp2, \in1, 0x0b -.endif -.else -.ifeqs "\suf_op", "s" -.ifeqs "\conj", "0" -.ifeqs "\xconj", "0" - vpackod.w \tmp1, \in0, \tmp1 -.else - vpackod.w \tmp1, \tmp1, \in0 -.endif -.else -.ifeqs "\xconj", "0" - vpackod.w \tmp1, \in0, \in0 -.else - vpackod.w \tmp1, \tmp1, \tmp1 -.endif -.endif - vshuf4i.w \tmp2, \in1, 0xb1 -.else -.ifeqs "\conj", "0" -.ifeqs "\xconj", "0" - vpackod.d \tmp1, \in0, \tmp1 -.else - vpackod.d \tmp1, 
\tmp1, \in0
-.endif
-.else
-.ifeqs "\xconj", "0"
-    vpackod.d \tmp1, \in0, \in0
-.else
-    vpackod.d \tmp1, \tmp1, \tmp1
-.endif
-.endif
-    vshuf4i.d \tmp2, \in1, 0x0b
+.ifeqs "\xconj\()\conj\()\suf_op", "01d"
+    TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
+    TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
 .endif
+.ifeqs "\xconj\()\conj\()\suf_op", "11d"
+    TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
+    TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
 .endif
     \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
index bd609394e..c6d1aeaef 100644
--- a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
+++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
@@ -837,7 +837,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 PROLOGUE
-    push_if_used 26, 32
+    push_if_used 9, 8
     xvreplve0.w VALPHA, $xr0
 #if defined (TRMMKERNEL) && !defined(LEFT)
     PTR_SUB OFF, ZERO, OFFSET
@@ -2343,6 +2343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif // #if defined(TRMMKERNEL)
 .L_N1_M0:
 .L_N0:
-    pop_if_used 26, 32
+    pop_if_used 9, 8
     jirl $r0, $r1, 0x0
     EPILOGUE
diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S
index 266c07c5c..1a81ce601 100644
--- a/kernel/loongarch64/sgemm_ncopy_16_lasx.S
+++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S
@@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //.L_N0
 PROLOGUE
-    push_if_used 26, 32
+    push_if_used 9, 8
     move TD, DST
     move TS, SRC
@@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PTR_ADDI M, M, -1
     blt ZERO, M, .L_N1_M1
 .L_N0:
-    pop_if_used 26, 32
+    pop_if_used 9, 8
     jirl $r0, $r1, 0x0
     EPILOGUE
diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S
index 5c173568b..db36827d5 100644
--- a/kernel/loongarch64/sgemm_ncopy_8_lasx.S
+++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S
@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //.L_N0
 PROLOGUE
-    push_if_used 17, 20
+    push_if_used 0, 0
     move TD, DST
     move TS, SRC
@@ -293,6 +293,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PTR_ADDI M, M, -1
     blt ZERO, M, .L_N1_M1
 .L_N0:
-    pop_if_used 17, 20
+    pop_if_used 0, 0
     jirl $r0, $r1, 0x0
     EPILOGUE
diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S
index d9789bdcd..fc42ae8c2 100644
--- a/kernel/loongarch64/sgemm_tcopy_16_lasx.S
+++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S
@@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //.L_M0
 PROLOGUE
-    push_if_used 24, 8
+    push_if_used 7, 0
     move S0, SRC
     move P0, DST
@@ -521,6 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PTR_ADDI S1, S1, 0x04
     PTR_ADDI P5, P5, 0x04
 .L_M0:
-    pop_if_used 24, 8
+    pop_if_used 7, 0
     jirl $r0, $r1, 0x00
     EPILOGUE
diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S
index 725a47a60..73d08fb8b 100644
--- a/kernel/loongarch64/sgemm_tcopy_8_lasx.S
+++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S
@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //.L_M0
 PROLOGUE
-    push_if_used 23, 8
+    push_if_used 6, 0
     move S0, SRC
     move P0, DST
@@ -401,6 +401,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PTR_ADDI S1, S1, 0x04
     PTR_ADDI P4, P4, 0x04
 .L_M0:
-    pop_if_used 23, 8
+    pop_if_used 6, 0
     jirl $r0, $r1, 0x00
     EPILOGUE
diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S
index 52ffc320e..8648c2212 100644
--- a/kernel/loongarch64/sgemv_n_8_lasx.S
+++ b/kernel/loongarch64/sgemv_n_8_lasx.S
@@ -418,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 7, 19
+    push_if_used 7, 0
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     PTR_SUB J, INC_Y, K
@@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
     SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
 .L_END:
-    pop_if_used 17 + 7, 19
+    pop_if_used 7, 0
     jirl $r0, $r1, 0x0
     EPILOGUE
diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S
index f4bfffb42..1f843cadb 100644
--- a/kernel/loongarch64/sgemv_t_8_lasx.S
+++ b/kernel/loongarch64/sgemv_t_8_lasx.S
@@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 8, 18
+    push_if_used 8, 0
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -400,6 +400,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
     SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
 .L_END:
-    pop_if_used 17 + 8, 18
+    pop_if_used 8, 0
     jirl $r0, $r1, 0x0
     EPILOGUE
diff --git a/kernel/loongarch64/zgemv_n_2_lsx.S b/kernel/loongarch64/zgemv_n_2_lsx.S
index efb376118..d68154008 100644
--- a/kernel/loongarch64/zgemv_n_2_lsx.S
+++ b/kernel/loongarch64/zgemv_n_2_lsx.S
@@ -253,7 +253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 7, 31
+    push_if_used 7, 7
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     PTR_SUB J, INC_Y, K
@@ -291,6 +291,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
     ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
 .L_END:
-    pop_if_used 17 + 7, 31
+    pop_if_used 7, 7
     jirl $r0, $r1, 0x0
     EPILOGUE
diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S
index 26edf1ed7..2e0e0a06d 100644
--- a/kernel/loongarch64/zgemv_n_4_lasx.S
+++ b/kernel/loongarch64/zgemv_n_4_lasx.S
@@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 7, 31
+    push_if_used 7, 7
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     PTR_SUB J, INC_Y, K
@@ -337,7 +337,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
     ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
 .L_END:
-    pop_if_used 17 + 7, 31
+    pop_if_used 7, 7
     jirl $r0, $r1, 0x0
     EPILOGUE
diff --git a/kernel/loongarch64/zgemv_t_2_lsx.S b/kernel/loongarch64/zgemv_t_2_lsx.S
index 2a0fc172e..cae2a0ce4 100644
--- a/kernel/loongarch64/zgemv_t_2_lsx.S
+++ b/kernel/loongarch64/zgemv_t_2_lsx.S
@@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 8, 30
+    push_if_used 8, 6
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -263,6 +263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
     ZGEMV_T_LSX GAP_1, X2_GAP
 .L_END:
-    pop_if_used 17 + 8, 30
+    pop_if_used 8, 6
     jirl $r0, $r1, 0x0
     EPILOGUE
diff --git a/kernel/loongarch64/zgemv_t_4_lasx.S b/kernel/loongarch64/zgemv_t_4_lasx.S
index 4d33b8f96..50dd73adc 100644
--- a/kernel/loongarch64/zgemv_t_4_lasx.S
+++ b/kernel/loongarch64/zgemv_t_4_lasx.S
@@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 8, 30
+    push_if_used 8, 6
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -294,6 +294,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
     ZGEMV_T_LASX GAP_1, X4_GAP
 .L_END:
-    pop_if_used 17 + 8, 30
+    pop_if_used 8, 6
     jirl $r0, $r1, 0x0
     EPILOGUE
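Usage sketch (illustrative, not part of the patch): across these hunks the first argument of push_if_used/pop_if_used now appears to count callee-saved general-purpose registers and the second callee-saved floating-point registers, replacing the old totals such as "17 + 8, 30". A minimal kernel skeleton under that assumption, with hypothetical counts of 8 and 6 and an elided body:

    PROLOGUE
    // Assumed semantics: save 8 callee-saved GPRs and 6 callee-saved FPRs
    // (e.g. $s0-$s7 and $fs0-$fs5) before the body clobbers them.
    push_if_used 8, 6
    // ... kernel body ...
    pop_if_used  8, 6
    jirl $r0, $r1, 0x0
    EPILOGUE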