diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 9d858584c..67d1fd11c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -11,9 +11,24 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMVNKERNEL = dgemv_n_8_lasx.S DGEMVTKERNEL = dgemv_t_8_lasx.S + +SGEMMKERNEL = sgemm_kernel_16x8_lasx.S +SGEMMINCOPY = sgemm_ncopy_16_lasx.S +SGEMMITCOPY = sgemm_tcopy_16_lasx.S +SGEMMONCOPY = sgemm_ncopy_8_lasx.S +SGEMMOTCOPY = sgemm_tcopy_8_lasx.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) endif DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index 8876cbed9..89243c620 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -36,6 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PTR_ST st.d #define PTR_SLLI slli.d #define PTR_SRLI srli.d +#define PTR_SRAI srai.d +#define PTR_MUL mul.d #define PTR_ALSL alsl.d #else #define LA_REG int32_t @@ -48,6 +50,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PTR_ST st.w #define PTR_SLLI slli.w #define PTR_SRLI srli.w +#define PTR_SRAI srai.w +#define PTR_MUL mul.w #define PTR_ALSL alsl.w #endif @@ -218,6 +222,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endif .endm // +// GSUB +// +.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()sub.\suf_op \out, \in0, \in1 +.ifnb \more + GSUB \pre_op, \suf_op, \more +.endif +.endm +// // GSLLI // .macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg @@ -244,6 +257,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GXOR \pre_op, \suf_op, \more .endif .endm +// +// GPERMI +// +.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()permi.\suf_op \out, \in0, \in1 +.ifnb \more + GPERMI \pre_op, \suf_op, \more +.endif +.endm +// +// GNMSUB +// +.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GNMSUB \pre_op, \suf_op, \more +.endif +.endm +// +// GPRELD +// +.macro GPRELD in0:req, in1:req, in2:req, more:vararg + preld \in0, \in1, \in2 +.ifnb \more + GPRELD \more +.endif +.endm // // Compound instructions @@ -311,3 +351,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
GACC \pre_op, \suf_op, \more .endif .endm +// +// GMOV +// +.macro GMOV pre_op:req, out:req, in:req, more:vararg + \pre_op\()or.v \out, \in, \in +.ifnb \more + GMOV \pre_op, \more +.endif +.endm + +// +// Media Related Macros +// +.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()ilvl.\suf_op \out0, \in0, \in1 + \pre_op\()ilvh.\suf_op \out1, \in0, \in1 +.endm +.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()pickev.\suf_op \out0, \in0, \in1 + \pre_op\()pickod.\suf_op \out1, \in0, \in1 +.endm + +// +// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, +// has no pre_op param. 128-bit vector instructions are not supported. +// +.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ + vt0, vt1 + GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 + GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 + GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 + GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 +.endm + +.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ + in0, in1, in2, in3, in4, in5, in6, in7, \ + tmp0, tmp1, tmp2, tmp3 + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 + GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 + + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 + GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 + + GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 + + GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ + \out2, \out6, 0x02, \out3, \out7, 0x02, \ + \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ + \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 +.endm diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S new file mode 100644 index 000000000..254dbe052 --- /dev/null +++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S @@ -0,0 +1,2325 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2023/08/23 guxiwei +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 8 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 256 +* SGEMM_DEFAULT_Q 256 +* SGEMM_DEFAULT_R 1024 +* A_PRE 1024 +* B_PRE 256 // Enabling prefetching for B results in a performance decrease, so it is temporarily disabled. +* +* +* Performance on Loongson 3A5000 2.5GHz with 5000x5000x5000: +* 1 thread: 71.7 GFLOPS +* 2 threads: 142.6 GFLOPS +* 3 threads: 211.5 GFLOPS +* 4 threads: 265.0 GFLOPS +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define C4 $r25 +#define C5 $r26 +#define C6 $r27 +#define C7 $r28 +#define T0 $r29 +#define T1 $r30 +#undef ZERO +#define ZERO $r0 + +/* LASX Vectors + * Store 16 sets of 32-bit data from A using U0 and U1, with each register holding 8 elements. + * Use X0 through X7 to store 8 sets of 32-bit data from B, with each register holding the broadcast value of a single element. + * Use D0 to D15 to store intermediate values of the computation. + * Use VALPHA to store the broadcast value of alpha. + */ +#define U0 $xr0 +#define U1 $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define D0 $xr10 +#define D1 $xr11 +#define D2 $xr12 +#define D3 $xr13 +#define D4 $xr14 +#define D5 $xr15 +#define D6 $xr16 +#define D7 $xr17 +#define D8 $xr18 +#define D9 $xr19 +#define D10 $xr20 +#define D11 $xr21 +#define D12 $xr22 +#define D13 $xr23 +#define D14 $xr24 +#define D15 $xr25 +#define VALPHA $xr26 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100
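+ +// For reference, one k step of the 16x8 micro-kernel below is roughly the following scalar loop, +// where a and b are the packed panels produced by the copy kernels (16 and 8 floats per k step): +// for (int j = 0; j < 8; j++) +// for (int i = 0; i < 16; i++) +// d[j][i] += a[i] * b[j]; // d is kept in D0-D15; U0/U1 hold a[0:8]/a[8:16], X0-X7 broadcast b[j] +// The SAVE16x8 epilogue then applies c[j][i] = alpha * d[j][i] + c[j][i] (alpha * d[j][i] only for TRMM).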
+ +// Loops outline: +// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */ +// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; otherwise, enter .L_M16. */ +// | | .L_M16_TL1 | | +// | | .L_M16_L7 | The entire core loop of the function, KERNEL16x8 | +// | | .L_M16_L71 | | +// | | .L_M16_L0 ---------------- | +// | .L_M8 | +// | | .L_M8_TL1 | | +// | | .L_M8_L7 | KERNEL8x8 | +// | | .L_M8_L71 | | +// | | .L_M8_L0 | | +// | .L_M4 | +// | | .L_M4_TL1 | | +// | | .L_M4_L7 | KERNEL4x8 | +// | | .L_M4_L71 | | +// | | .L_M4_L0 | | +// | .L_M2 | +// | | .L_M2_TL1 | | +// | | .L_M2_L7 | KERNEL2x8 | +// | | .L_M2_L71 | | +// | | .L_M2_L0 | | +// | .L_M1 | +// | | .L_M1_TL1 | | +// | | .L_M1_L7 | KERNEL1x8 | +// | | .L_M1_L71 | | +// | | .L_M1_L0 | | +// | .L_M0------------------------------------------------------------------------------------------ +// .L_N7 /* if N & 4 == 0, goto .L_N3; else, enter .L_N4 */ +// .L_N4 +// | .L_N4_M16 <--------------------- +// | | .L_N4_M16_TL1 | +// | | .L_N4_M16_L7 | KERNEL16x4 +// | | .L_N4_M16_L71 | +// | | .L_N4_M16_L0 ---------------- +// | .L_N4_M8 +// | | .L_N4_M8_TL1 | +// | | .L_N4_M8_L7 | KERNEL8x4 +// | | .L_N4_M8_L71 | +// | | .L_N4_M8_L0 | +// | .L_N4_M4 +// | | .L_N4_M4_TL1 | +// | | .L_N4_M4_L7 | KERNEL4x4 +// | | .L_N4_M4_L71 | +// | | .L_N4_M4_L0 | +// | .L_N4_M2 +// | | .L_N4_M2_TL1 | +// | | .L_N4_M2_L7 | KERNEL2x4 +// | | .L_N4_M2_L71 | +// | | .L_N4_M2_L0 | +// | .L_N4_M1 +// | | .L_N4_M1_TL1 | +// | | .L_N4_M1_L7 | KERNEL1x4 +// | | .L_N4_M1_L71 | +// | | .L_N4_M1_L0 | +// | .L_N4_M0 +// .L_N3 /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */ +// .L_N2 +// | .L_N2_M16 <--------------------- +// | | .L_N2_M16_TL1 | +// | | .L_N2_M16_L7 | KERNEL16x2 +// | | .L_N2_M16_L71 | +// | | .L_N2_M16_L0 ---------------- +// | .L_N2_M8 +// | | .L_N2_M8_TL1 | +// | | .L_N2_M8_L7 | KERNEL8x2 +// | | .L_N2_M8_L71 | +// | | .L_N2_M8_L0 | +// | .L_N2_M4 +// | | .L_N2_M4_TL1 | +// | | .L_N2_M4_L7 | KERNEL4x2 +// | | .L_N2_M4_L71 | +// | | .L_N2_M4_L0 | +// | .L_N2_M2 +// | | .L_N2_M2_TL1 | +// | | .L_N2_M2_L7 | KERNEL2x2 +// | | .L_N2_M2_L71 | +// | | .L_N2_M2_L0 | +// | .L_N2_M1 +// | | .L_N2_M1_TL1 | +// | | .L_N2_M1_L7 | KERNEL1x2 +// | | .L_N2_M1_L71 | +// | | .L_N2_M1_L0 | +// | .L_N2_M0 +// .L_N1 +// | .L_N1_M16 <--------------------- +// | | .L_N1_M16_TL1 | +// | | .L_N1_M16_L7 | KERNEL16x1 +// | | .L_N1_M16_L71 | +// | | .L_N1_M16_L0 ---------------- +// | .L_N1_M8 +// | | .L_N1_M8_TL1 | +// | | .L_N1_M8_L7 | KERNEL8x1 +// | | .L_N1_M8_L71 | +// | | .L_N1_M8_L0 | +// | .L_N1_M4 +// | | .L_N1_M4_TL1 | +// | | .L_N1_M4_L7 | KERNEL4x1 +// | | .L_N1_M4_L71 | +// | | .L_N1_M4_L0 | +// | .L_N1_M2 +// | | .L_N1_M2_TL1 | +// | | .L_N1_M2_L7 | KERNEL2x1 +// | | .L_N1_M2_L71 | +// | | .L_N1_M2_L0 | +// | .L_N1_M1 +// | | .L_N1_M1_TL1 | +// | | .L_N1_M1_L7 | KERNEL1x1 +// | | .L_N1_M1_L71 | +// | | .L_N1_M1_L0 | +// | .L_N1_M0 +// .L_N0 + +/*************** sgemm_kernel_macros ***************/ +.macro KERNEL1x16x8_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + preld 0, C0, 0x00 + GMUL xvf, s, D2, U0, X1, D3, U1, X1 + preld 0, C1, 0x00 + GMUL xvf, s, D4, U0, X2, D5, U1, X2 + preld 0, C2, 0x00 + GMUL xvf, s, D6, U0, X3, D7, U1, X3 + preld 0, C3, 0x00 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D9, U1, X4 + preld 0, C4, 0x00 + GMUL xvf, s, D10, U0, X5, D11, U1, X5 + preld 0, C5, 0x00 + GMUL xvf, s, D12, U0, X6, D13, U1, X6 + preld 0, C6, 0x00 + GMUL xvf, s, D14, U0, X7, D15, U1, X7 + preld 0, C7, 0x00 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20
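+ // A0 and B0 now point at the next k slice. D0-D15 hold the first rank-1 update: for each column + // j = 0..7 of the packed B panel, D(2*j) = U0 * X(j) covers rows 0-7 and D(2*j+1) = U1 * X(j) covers + // rows 8-15. The KERNEL1x16x8 steps that follow accumulate into the same registers via GMADD (xvfmadd.s).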
+.endm + +.macro KERNEL1x16x8 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + preld 0, A0, A_PRE + GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \ + D10, U0, X5, D10, D11, U1, X5, D11 + //preld 0, B0, B_PRE + GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \ + D14, U0, X7, D14, D15, U1, X7 D15 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8x16x8 +.rept 8 + KERNEL1x16x8 +.endr +.endm + +.macro SAVE16x8 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 + /* Load C4 */ + GLD xv, , X0, C4, 0x00, X1, C4, 0x20 + GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1 + /* Load C5 */ + GLD xv, , X2, C5, 0x00, X3, C5, 0x20 + GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3 + /* Load C6 */ + GLD xv, , X4, C6, 0x00, X5, C6, 0x20 + GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5 + /* Load C7 */ + GLD xv, , X6, C7, 0x00, X7, C7, 0x20 + GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20, \ + D8, C4, 0x00, D9, C4, 0x20, \ + D10, C5, 0x00, D11, C5, 0x20, \ + D12, C6, 0x00, D13, C6, 0x20, \ + D14, C7, 0x00, D15, C7, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx8_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D10, U0, X5, \ + D12, U0, X6, D14, U0, X7 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL1xMx8 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + GLDREPL xv, w, X4, B0, 
0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \ + D12, U0, X6, D12, D14, U0, X7, D14 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8xMx8 m, stride +.rept 8 + KERNEL1xMx8 \m, \stride +.endr +.endm + +.macro SAVEMx8 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA, \ + D8, D8, VALPHA, D10, D10, VALPHA, \ + D12, D12, VALPHA, D14, D14, VALPHA +#else + /* Load C0, C1, C2, C3, C4, C5, C6, C7 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +.if \m == 8 + GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00 +.elseif \m == 4 + GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.endif + GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \ + D12, D12, VALPHA, X4, D14, D14, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00, \ + D8, C4, 0x00, D10, C5, 0x00, \ + D12, C6, 0x00, D14, C7, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00, \ + $vr18, C4, 0x00, $vr20, C5, 0x00, \ + $vr22, C6, 0x00, $vr24, C7, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#endif +.endm + +.macro KERNEL1x16x4_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1, \ + D4, U0, X2, D5, U1, X2, \ + D6, U0, X3, D7, U1, X3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1x16x4 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3, \ + D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8x16x4 +.rept 8 + KERNEL1x16x4 +.endr +.endm + +.macro SAVE16x4 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* 
Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx4_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1xMx4 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8xMx4 m, stride +.rept 8 + KERNEL1xMx4 \m, \stride +.endr +.endm + +.macro SAVEMx4 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA +#else + /* Load C0, C1, C2, C3 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#endif +.endm + +.macro KERNEL1x16x2_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1x16x2 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8x16x2 +.rept 8 + KERNEL1x16x2 +.endr +.endm + +.macro SAVE16x2 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, 
D3, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx2_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D2, U0, X1 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1xMx2 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8xMx2 m, stride +.rept 8 + KERNEL1xMx2 \m, \stride +.endr +.endm + +.macro SAVEMx2 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride +#endif +.endm + +.macro KERNEL1x16x1_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1x16x1 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8x16x1 +.rept 8 + KERNEL1x16x1 +.endr +.endm + +.macro SAVE16x1 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40 +#else + GADDI , w, C0, C0, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx1_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1xMx1 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, 
A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8xMx1 m, stride +.rept 8 + KERNEL1xMx1 \m, \stride +.endr +.endm + +.macro SAVEMx1 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride +#else + GADDI , w, C0, C0, \stride +#endif +.endm + + PROLOGUE + push_if_used 26, 32 + xvreplve0.w VALPHA, $xr0 +#if defined (TRMMKERNEL) && !defined(LEFT) + PTR_SUB OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + /* if (!(N >> 3)) goto L_N7 */ + PTR_SRAI J, N, 3 /* J = bn >> 3 */ + andi N, N, 0x07 + beq ZERO, J, .L_N7 +.L_N8: /* J -- */ + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADDI J, J, -1 /* J-- */ +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#else + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 +.align 5 +.L_M16: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x8_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M16_L7 */ + beq ZERO,TL, .L_M16_L7 +.align 5 +.L_M16_TL1: + KERNEL8x16x8 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M16_TL1 +.L_M16_L7: + andi TL, L, 7 + beq TL, ZERO,.L_M16_L0 +.align 5 +.L_M16_L71: + KERNEL1x16x8 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M16_L71 +.L_M16_L0: + SAVE16x8 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -16 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 /* number of values in A */ +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_M16 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, 
B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + KERNEL1xMx8_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 +.align 5 +.L_M8_TL1: + KERNEL8xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 +.align 5 +.L_M8_L71: + KERNEL1xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M8_L71 +.L_M8_L0: + SAVEMx8 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -8 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 +.align 5 +.L_M4_TL1: + KERNEL8xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 +.L_M4_L71: + KERNEL1xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_L71 +.L_M4_L0: + SAVEMx8 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -4 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 2, 0x08 + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) 
>> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 +.align 5 +.L_M2_TL1: + KERNEL8xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 +.align 5 +.L_M2_L71: + KERNEL1xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M2_L71 +.L_M2_L0: + SAVEMx8 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -2 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 +.align 5 +.L_M1_TL1: + KERNEL8xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 +.align 5 +.L_M1_L71: + KERNEL1xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M1_L71 +.L_M1_L0: + SAVEMx8 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -1 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 5 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x08 /* number of values in B */ +#endif + blt ZERO, J, .L_N8 + +.L_N7: + andi J, N, 4 + beq ZERO, J, .L_N3 +.L_N4: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#else + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N4_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N4_M8 +.align 5 +.L_N4_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 /* B0 += 4 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B 
*/ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x4_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_L7 */ + beq ZERO,TL, .L_N4_M16_L7 +.align 5 +.L_N4_M16_TL1: /* TL-- */ + KERNEL8x16x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M16_TL1 +.L_N4_M16_L7: + /* if (!(L & 7)) goto L_N4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M16_L0 +.align 5 +.L_N4_M16_L71: + KERNEL1x16x4 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M16_L71 +.L_N4_M16_L0: + SAVE16x4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N4_M16 +.L_N4_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N4_M0 + + andi I, M, 8 + beq ZERO,I, .L_N4_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M8_L7 */ + beq ZERO,TL, .L_N4_M8_L7 +.align 5 +.L_N4_M8_TL1: /* TL-- */ + KERNEL8xMx4 8, 0x20 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M8_TL1 +.L_N4_M8_L7: + /* if (!(L & 7)) goto L_N4_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M8_L0 +.align 5 +.L_N4_M8_L71: + KERNEL1xMx4 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M8_L71 +.L_N4_M8_L0: + SAVEMx4 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M4: + andi I, M, 4 + beq ZERO,I, .L_N4_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M4_L7 */ + beq ZERO,TL, .L_N4_M4_L7 +.align 5 +.L_N4_M4_TL1: /* TL-- */ + KERNEL8xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M4_TL1 +.L_N4_M4_L7: + /* if (!(L & 7)) goto L_N4_M4_L0 */ + 
andi TL, L, 7 + beq TL, ZERO,.L_N4_M4_L0 +.align 5 +.L_N4_M4_L71: + KERNEL1xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M4_L71 +.L_N4_M4_L0: + SAVEMx4 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M2: + andi I, M, 2 + beq ZERO,I, .L_N4_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M2_L7 */ + beq ZERO,TL, .L_N4_M2_L7 +.align 5 +.L_N4_M2_TL1: /* TL-- */ + KERNEL8xMx4 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M2_TL1 +.L_N4_M2_L7: + /* if (!(L & 7)) goto L_N4_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M2_L0 +.align 5 +.L_N4_M2_L71: + KERNEL1xMx4 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M2_L71 +.L_N4_M2_L0: + SAVEMx4 2, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M1: + andi I, M, 1 + beq ZERO,I, .L_N4_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M1_L7 */ + beq ZERO,TL, .L_N4_M1_L7 +.align 5 +.L_N4_M1_TL1: /* TL-- */ + KERNEL8xMx4 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M1_TL1 +.L_N4_M1_L7: + /* if (!(L & 7)) goto L_N4_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M1_L0 +.align 5 +.L_N4_M1_L71: + KERNEL1xMx4 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M1_L71 +.L_N4_M1_L0: + SAVEMx4 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M0: + /* Add stride for B and C 
+ * B += 4 * K + * C += 4 * LDC + */ + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 4 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x04 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +.L_N2: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADD C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N2_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N2_M8 +.align 5 +.L_N2_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x2_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M16_L7 */ + beq ZERO,TL, .L_N2_M16_L7 +.align 5 +.L_N2_M16_TL1: /* TL-- */ + KERNEL8x16x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M16_TL1 +.L_N2_M16_L7: + /* if (!(L & 7)) goto L_N2_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M16_L0 +.align 5 +.L_N2_M16_L71: + KERNEL1x16x2 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M16_L71 +.L_N2_M16_L0: + SAVE16x2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N2_M16 +.L_N2_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N2_M0 + + andi I, M, 8 + beq ZERO,I, .L_N2_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M8_L7 */ + beq ZERO,TL, .L_N2_M8_L7 +.align 5 +.L_N2_M8_TL1: /* TL-- */ + KERNEL8xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M8_TL1 +.L_N2_M8_L7: + /* if (!(L & 7)) goto L_N2_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M8_L0 +.align 5 +.L_N2_M8_L71: + KERNEL1xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M8_L71 +.L_N2_M8_L0: + SAVEMx2 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + 
PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M4: + andi I, M, 4 + beq ZERO,I, .L_N2_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M4_L7 */ + beq ZERO,TL, .L_N2_M4_L7 +.align 5 +.L_N2_M4_TL1: /* TL-- */ + KERNEL8xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M4_TL1 +.L_N2_M4_L7: + /* if (!(L & 7)) goto L_N2_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M4_L0 +.align 5 +.L_N2_M4_L71: + KERNEL1xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M4_L71 +.L_N2_M4_L0: + SAVEMx2 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M2: + andi I, M, 2 + beq ZERO,I, .L_N2_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M2_L7 */ + beq ZERO,TL, .L_N2_M2_L7 +.align 5 +.L_N2_M2_TL1: /* TL-- */ + KERNEL8xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M2_TL1 +.L_N2_M2_L7: + /* if (!(L & 7)) goto L_N2_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M2_L0 +.align 5 +.L_N2_M2_L71: + KERNEL1xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M2_L71 +.L_N2_M2_L0: + SAVEMx2 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M1: + andi I, M, 1 + beq ZERO,I, .L_N2_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move 
L, K /* L = bk */ +#endif + KERNEL1xMx2_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M1_L7 */ + beq ZERO,TL, .L_N2_M1_L7 +.align 5 +.L_N2_M1_TL1: /* TL-- */ + KERNEL8xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M1_TL1 +.L_N2_M1_L7: + /* if (!(L & 7)) goto L_N2_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M1_L0 +.align 5 +.L_N2_M1_L71: + KERNEL1xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M1_L71 +.L_N2_M1_L0: + SAVEMx2 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M0: + /* Add stride for B and C + * B += 2 * K + * C += 2 * LDC + */ + PTR_SLLI T0, K, 3 + PTR_SLLI T1, LDC, 3 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x02 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_N1_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 +.L_N1_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x1_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M16_L7 */ + beq ZERO,TL, .L_N1_M16_L7 +.align 5 +.L_N1_M16_TL1: /* TL-- */ + KERNEL8x16x1 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M16_TL1 +.L_N1_M16_L7: + /* if (!(L & 7)) goto L_N1_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M16_L0 +.align 5 +.L_N1_M16_L71: + KERNEL1x16x1 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M16_L71 +.L_N1_M16_L0: + SAVE16x1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_M16 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if 
!defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 +.align 5 +.L_N1_M8_TL1: /* TL-- */ + KERNEL8xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 +.align 5 +.L_N1_M8_L71: + KERNEL1xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 +.L_N1_M8_L0: + SAVEMx1 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 +.align 5 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8xMx1 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 +.align 5 +.L_N1_M4_L71: + KERNEL1xMx1 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 +.L_N1_M4_L0: + SAVEMx1 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 +.align 5 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8xMx1 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 +.align 5 +.L_N1_M2_L71: + KERNEL1xMx1 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 +.L_N1_M2_L0: + SAVEMx1 2, 0x08 +#if 
defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 +.align 5 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8xMx1 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 +.align 5 +.L_N1_M1_L71: + KERNEL1xMx1 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 +.L_N1_M1_L0: + SAVEMx1 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S new file mode 100644 index 000000000..266c07c5c --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +// Loops outline +//.L_N16 <------------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0: --------------- +//.L_N15: +//.L_N8: +//| .L_N8_M8: +//| .L_N8_M7: +//| .L_N8_M1: +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 26, 32 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x04 + beq J, ZERO, .L_N15 +.align 5 +.L_N16: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S3, TL + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S9, S7, T0 + PTR_ADD S10, S8, T0 + PTR_ADD S11, S9, T0 + PTR_ADD S12, S10, T0 + PTR_ADD S13, S11, T0 + PTR_ADD S14, S12, T0 + PTR_ADD S15, S13, T0 + PTR_ADD S16, S14, T0 + PTR_ADD TS, S15, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, 
D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ + U8, U9, U10, U11, U12, U13, U14, U15, \ + U0, U1, U2, U3 // As tmp + GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ + D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ + D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI S9, S9, 0x20 + PTR_ADDI S10, S10, 0x20 + PTR_ADDI S11, S11, 0x20 + PTR_ADDI S12, S12, 0x20 + PTR_ADDI S13, S13, 0x20 + PTR_ADDI S14, S14, 0x20 + PTR_ADDI S15, S15, 0x20 + PTR_ADDI S16, S16, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI TD, TD, 0x20 + + fld.s F0, S9, 0x00 + fld.s F1, S10, 0x00 + fld.s F2, S11, 0x00 + fld.s F3, S12, 0x00 + fld.s F4, S13, 0x00 + fld.s F5, S14, 0x00 + fld.s F6, S15, 0x00 + fld.s F7, S16, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S9, S9, 0x04 + PTR_ADDI S10, S10, 0x04 + PTR_ADDI S11, S11, 0x04 + PTR_ADDI S12, S12, 0x04 + PTR_ADDI S13, S13, 0x04 + PTR_ADDI S14, S14, 0x04 + PTR_ADDI S15, S15, 0x04 + PTR_ADDI S16, S16, 0x04 + PTR_ADDI TD, TD, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N16 +.L_N15: + andi J, N, 0x0f + beq ZERO, J, .L_N0 + + andi J, N, 0x08 + beq ZERO, J, .L_N7 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_N8_M7 +.align 5 +.L_N8_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M8 +.L_N8_M7: + andi I, M, 0x07 + beq I, ZERO, .L_N7 +.align 5 +.L_N8_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + 
PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M1 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..5c173568b --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S @@ -0,0 +1,298 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 +#define D10 $xr17 +#define D12 $xr18 +#define D14 $xr19 + +// Loops outline +//.L_N8: <---------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0:-------------- +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 17, 20 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x03 + beq J, ZERO, .L_N7 +.align 5 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + 
fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N8 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 17, 20 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..d9789bdcd --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S @@ -0,0 +1,526 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N16: | +//| .L_N15: | +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N16: +//| .L_M4_N15: +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N16: +//| .L_M2_N15: +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N16: +//| .L_M1_N15: +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T0 + PTR_MUL P3, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_SLLI P3, P3, 0x02 + PTR_ADD P2, DST, P2 + PTR_ADD P3, DST, P3 + + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P4, M, T0 + PTR_MUL P5, M, T1 + PTR_SLLI P4, P4, 0x02 + PTR_SLLI P5, P5, 0x02 + PTR_ADD P4, DST, P4 + PTR_ADD P5, DST, P5 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x06 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x200 + + PTR_SRAI I, N, 0x04 + 
PTR_ADDI J, J, -1 + beq ZERO, I, .L_N15 +.L_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI S5, S5, 0x40 + PTR_ADDI S6, S6, 0x40 + PTR_ADDI S7, S7, 0x40 + PTR_ADDI S8, S8, 0x40 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N16 +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ + U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI P2, P2, 0x100 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ + $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P3, P3, 0x80 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ + $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P4, P4, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ + $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P5, P5, 0x20 +.L_N0: + blt ZERO, J, .L_M8 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M4_N15 +.align 5 +.L_M4_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, 
P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N16 +.L_M4_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M4_N7 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI P2, P2, 0x80 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P3, P3, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P4, P4, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P5, P5, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M2_N15 +.align 5 +.L_M2_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N16 +.L_M2_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M2_N7 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI P2, P2, 0x40 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P3, P3, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P4, P4, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P5, P5, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M1_N15 +.align 5 +.L_M1_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N16 +.L_M1_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M1_N7 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P2, 0x00 + + 
PTR_ADDI S1, S1, 0x20 + PTR_ADDI P2, P2, 0x20 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P3, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P3, P3, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P4, P4, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P5, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P5, P5, 0x04 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..725a47a60 --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S @@ -0,0 +1,406 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define T0 $r27 +#define T1 $r28 +#define TL $r7 +#undef ZERO +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 23, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_ADD P2, DST, P2 + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P3, M, T0 + PTR_MUL P4, M, T1 + PTR_SLLI P3, P3, 0x02 + PTR_SLLI P4, P4, 0x02 + PTR_ADD P3, DST, P3 + PTR_ADD P4, DST, P4 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x05 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x03 + PTR_ADDI J, J, -1 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ + U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N8 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ + $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P2, P2, 0x80 +.L_N3: + 
andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ + $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P3, P3, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ + $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P4, P4, 0x20 +.L_N0: + blt ZERO, J, .L_M8 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M4_N7 +.align 5 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N8 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P2, P2, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P3, P3, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P4, P4, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M2_N7 +.align 5 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N8 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P2, P2, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P3, P3, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + 
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P4, P4, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x20 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M1_N7 +.align 5 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P1, 0x00 + + PTR_ADDI S1, S1, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N8 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P2, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P2, P2, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P3, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P3, P3, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P4, P4, 0x04 +.L_M0: + pop_if_used 23, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/param.h b/param.h index 547463b2f..03bf3624f 100644 --- a/param.h +++ b/param.h @@ -2848,34 +2848,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NO_LASX) #define DGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 2 #else #define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #endif -#define SGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 -#define SGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 -#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_P 256 #define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_R 12288 +#define SGEMM_DEFAULT_R 1024 #define DGEMM_DEFAULT_R 858 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#define SGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 152 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128
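---

Note (illustrative, not part of the patch): the param.h hunk above sets SGEMM_DEFAULT_UNROLL_M=16 and SGEMM_DEFAULT_UNROLL_N=8 to match the new sgemm_kernel_16x8_lasx.S micro-kernel, while the sgemm_ncopy/tcopy routines pack A into contiguous 16-row panels and B into contiguous 8-column panels. The sketch below is a scalar reference model of what one full 16x8 tile update computes from those packed panels; the function name, argument names, and signature are assumptions for illustration only — the real kernel also handles edge tiles, the TRMM offset logic, and uses LASX vectors throughout.

```c
/* Conceptual scalar model of one 16x8 SGEMM tile, assuming:
 * packed_a holds K contiguous 16-element column slices of A
 *   (as produced by the 16-wide ncopy/tcopy routines),
 * packed_b holds K contiguous 8-element row slices of B
 *   (as produced by the 8-wide ncopy/tcopy routines),
 * C is column-major with leading dimension ldc, and the kernel
 * semantics are C += alpha * A * B for the non-TRMM path. */
#include <stddef.h>

void sgemm_tile_16x8_ref(size_t K, float alpha,
                         const float *packed_a,   /* 16 * K floats */
                         const float *packed_b,   /* K * 8 floats  */
                         float *C, size_t ldc)
{
    float acc[8][16] = {{0.0f}};          /* accumulators, zero-initialized */

    for (size_t k = 0; k < K; k++) {
        const float *a = packed_a + 16 * k;  /* one 16-element slice of A */
        const float *b = packed_b + 8 * k;   /* one 8-element slice of B  */
        for (size_t j = 0; j < 8; j++)
            for (size_t i = 0; i < 16; i++)
                acc[j][i] += a[i] * b[j];    /* rank-1 update of the tile */
    }

    for (size_t j = 0; j < 8; j++)
        for (size_t i = 0; i < 16; i++)
            C[j * ldc + i] += alpha * acc[j][i];
}
```

The SGEMM_DEFAULT_P/Q/R values in the same hunk control how many such packed panels the generic level-3 driver processes per blocking pass; that driver loop lives in the shared OpenBLAS code, not in this patch.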