/* OpenBLAS/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
 * LoongArch64 assembly using the LASX (256-bit) vector extension.
 */

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/08/23 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
* 2023/08/23 guxiwei
* Parameter:
* SGEMM_DEFAULT_UNROLL_N 8
* SGEMM_DEFAULT_UNROLL_M 16
* SGEMM_DEFAULT_P 256
* SGEMM_DEFAULT_Q 256
* SGEMM_DEFAULT_R 1024
* A_PRE 1024
* B_PRE 256 // Enabling prefetching for B results in a performance decrease, so it is temporarily disabled.
*
*
* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000:
* 1 thread: 71.7 GFLOPS
* 2 threads: 142.6 GFLOPS
* 3 threads: 211.5 GFLOPS
* 4 threads: 265.0 GFLOPS
*********************************************************************/
/* Function parameters */
#define M $r4 // param 1: bm
#define N $r5 // param 2: bn
#define K $r6 // param 3: bk
#define ALPHA $f0 // param 4: alpha
#define A $r7 // param 5: ba
#define B $r8 // param 6: bb
#define C $r9 // param 7: bc
#define LDC $r10 // param 8: ldc
#ifdef TRMMKERNEL
#define OFFSET $r11 // param 9: offset
#endif
#define OFF $r12
/* Cycle control parameters */
#define I $r13
#define J $r14
#define L $r15
#define TL $r16
/* Matrix address */
#define A0 $r17
#define B0 $r18
#define C0 $r19
#define C1 $r20
#define C2 $r23
#define C3 $r24
#define C4 $r25
#define C5 $r26
#define C6 $r27
#define C7 $r28
#define T0 $r29
#define T1 $r30
#undef ZERO
#define ZERO $r0
/* LASX Vectors
* Store 16 sets of 32-bit data in A using U0 and U1, with each register holding 8 data.
* Use X0 through X7 to store 8 sets of 32-bit data in B, with each register holding a broadcast value of a single data.
* Use D0 to D15 to store intermediate values of the computation.
* Use VALPHA to store the broadcast value of alpha
*/
#define U0 $xr0
#define U1 $xr1
#define X0 $xr2
#define X1 $xr3
#define X2 $xr4
#define X3 $xr5
#define X4 $xr6
#define X5 $xr7
#define X6 $xr8
#define X7 $xr9
#define D0 $xr10
#define D1 $xr11
#define D2 $xr12
#define D3 $xr13
#define D4 $xr14
#define D5 $xr15
#define D6 $xr16
#define D7 $xr17
#define D8 $xr18
#define D9 $xr19
#define D10 $xr20
#define D11 $xr21
#define D12 $xr22
#define D13 $xr23
#define D14 $xr24
#define D15 $xr25
#define VALPHA $xr26
/* Prefetch interval */
#define A_PRE 0x400
#define B_PRE 0x100
// Loops outline:
// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */
// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16. */
// | | .L_M16_TL1 | |
// | | .L_M16_L7 | The entire core loop of the function, KERNEL16x8 |
// | | .L_M16_L71 | |
// | | .L_M16_L0 ---------------- |
// | .L_M8 |
// | | .L_M8_TL1 | |
// | | .L_M8_L7 | KERNEL8x8 |
// | | .L_M8_L71 | |
// | | .L_M8_L0 | |
// | .L_M4 |
// | | .L_M4_TL1 | |
// | | .L_M4_L7 | KERNEL4x8 |
// | | .L_M4_L71 | |
// | | .L_M4_L0 | |
// | .L_M2 |
// | | .L_M2_TL1 | |
// | | .L_M2_L7 | KERNEL2x8 |
// | | .L_M2_L71 | |
// | | .L_M2_L0 | |
// | .L_M1 |
// | | .L_M1_TL1 | |
// | | .L_M1_L7 | KERNEL1x8 |
// | | .L_M1_L71 | |
// | | .L_M1_L0 | |
// | .L_M0------------------------------------------------------------------------------------------
// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */
// .L_N4
// | .L_N4_M16 <---------------------
// | | .L_N4_M16_TL1 |
// | | .L_N4_M16_L7 | KERNEL16x4
// | | .L_N4_M16_L71 |
// | | .L_N4_M16_L0 ----------------
// | .L_N4_M8
// | | .L_N4_M8_TL1 |
// | | .L_N4_M8_L7 | KERNEL8x4
// | | .L_N4_M8_L71 |
// | | .L_N4_M8_L0 |
// | .L_N4_M4
// | | .L_N4_M4_TL1 |
// | | .L_N4_M4_L7 | KERNEL4x4
// | | .L_N4_M4_L71 |
// | | .L_N4_M4_L0 |
// | .L_N4_M2
// | | .L_N4_M2_TL1 |
// | | .L_N4_M2_L7 | KERNEL2x4
// | | .L_N4_M2_L71 |
// | | .L_N4_M2_L0 |
// | .L_N4_M1
// | | .L_N4_M1_TL1 |
// | | .L_N4_M1_L7 | KERNEL1x4
// | | .L_N4_M1_L71 |
// | | .L_N4_M1_L0 |
// | .L_N4_M0
// .L_N3 /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */
// .L_N2
// | .L_N2_M16 <---------------------
// | | .L_N2_M16_TL1 |
// | | .L_N2_M16_L7 | KERNEL16x2
// | | .L_N2_M16_L71 |
// | | .L_N2_M16_L0 ----------------
// | .L_N2_M8
// | | .L_N2_M8_TL1 |
// | | .L_N2_M8_L7 | KERNEL8x2
// | | .L_N2_M8_L71 |
// | | .L_N2_M8_L0 |
// | .L_N2_M4
// | | .L_N2_M4_TL1 |
// | | .L_N2_M4_L7 | KERNEL4x2
// | | .L_N2_M4_L71 |
// | | .L_N2_M4_L0 |
// | .L_N2_M2
// | | .L_N2_M2_TL1 |
// | | .L_N2_M2_L7 | KERNEL2x2
// | | .L_N2_M2_L71 |
// | | .L_N2_M2_L0 |
// | .L_N2_M1
// | | .L_N2_M1_TL1 |
// | | .L_N2_M1_L7 | KERNEL1x2
// | | .L_N2_M1_L71 |
// | | .L_N2_M1_L0 |
// | .L_N2_M0
// .L_N1
// | .L_N1_M16 <---------------------
// | | .L_N1_M16_TL1 |
// | | .L_N1_M16_L7 | KERNEL16x1
// | | .L_N1_M16_L71 |
// | | .L_N1_M16_L0 ----------------
// | .L_N1_M8
// | | .L_N1_M8_TL1 |
// | | .L_N1_M8_L7 | KERNEL8x1
// | | .L_N1_M8_L71 |
// | | .L_N1_M8_L0 |
// | .L_N1_M4
// | | .L_N1_M4_TL1 |
// | | .L_N1_M4_L7 | KERNEL4x1
// | | .L_N1_M4_L71 |
// | | .L_N1_M4_L0 |
// | .L_N1_M2
// | | .L_N1_M2_TL1 |
// | | .L_N1_M2_L7 | KERNEL2x1
// | | .L_N1_M2_L71 |
// | | .L_N1_M2_L0 |
// | .L_N1_M1
// | | .L_N1_M1_TL1 |
// | | .L_N1_M1_L7 | KERNEL1x1
// | | .L_N1_M1_L71 |
// | | .L_N1_M1_L0 |
// | .L_N1_M0
// .L_N0
/*************** sgemm_kernel_macros ***************/
// First K-iteration of a 16x8 tile. GMUL overwrites the accumulators
// D0..D15 with A(16 floats) * B(8 broadcast floats), so no prior zeroing
// is needed. Interleaves preld hints for the eight C rows that SAVE16x8
// will later read/write. Advances A0 by 0x40 (16 floats) and B0 by 0x20
// (8 floats).
.macro KERNEL1x16x8_START
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMUL xvf, s, D0, U0, X0, D1, U1, X0
preld 0, C0, 0x00
GMUL xvf, s, D2, U0, X1, D3, U1, X1
preld 0, C1, 0x00
GMUL xvf, s, D4, U0, X2, D5, U1, X2
preld 0, C2, 0x00
GMUL xvf, s, D6, U0, X3, D7, U1, X3
preld 0, C3, 0x00
GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
GMUL xvf, s, D8, U0, X4, D9, U1, X4
preld 0, C4, 0x00
GMUL xvf, s, D10, U0, X5, D11, U1, X5
preld 0, C5, 0x00
GMUL xvf, s, D12, U0, X6, D13, U1, X6
preld 0, C6, 0x00
GMUL xvf, s, D14, U0, X7, D15, U1, X7
preld 0, C7, 0x00
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x20
.endm
// One K-iteration of the 16x8 micro-kernel (the hot path of this file):
//   U0:U1   <- 16 floats of packed A
//   X0..X7  <- 8 broadcast floats of packed B
//   D0..D15 += U{0,1} * X{0..7}  (single-precision fused multiply-add)
// Prefetches A one A_PRE (0x400) bytes ahead; the B prefetch is kept
// commented out because it measured slower (see file header note).
// Advances A0 by 0x40 and B0 by 0x20 bytes.
// Fix: added the missing commas between "X3, D7" and "X7, D15" in the
// GMADD argument lists — the old space-separated form only assembled
// because GAS also splits macro arguments on whitespace; the explicit
// commas match every sibling macro in this file.
.macro KERNEL1x16x8
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
D2, U0, X1, D2, D3, U1, X1, D3
preld 0, A0, A_PRE
GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \
D6, U0, X3, D6, D7, U1, X3, D7
GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \
D10, U0, X5, D10, D11, U1, X5, D11
//preld 0, B0, B_PRE
GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \
D14, U0, X7, D14, D15, U1, X7, D15
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x20
.endm
// Eight consecutive K-iterations of the 16x8 micro-kernel, written out
// explicitly (identical to the assembler's expansion of .rept 8) so the
// unrolled inner-loop body stays branch-free.
.macro KERNEL8x16x8
KERNEL1x16x8
KERNEL1x16x8
KERNEL1x16x8
KERNEL1x16x8
KERNEL1x16x8
KERNEL1x16x8
KERNEL1x16x8
KERNEL1x16x8
.endm
// Write back a finished 16x8 tile. Row j of C lives in Cj, two xv
// registers wide (D0/D1 -> C0, D2/D3 -> C1, ..., D14/D15 -> C7).
//   TRMMKERNEL:  C = alpha * AB       (accumulators scaled, C not read)
//   otherwise:   C = alpha * AB + C   (read-modify-write via GMADD)
// Afterwards every C pointer advances 0x40 bytes (16 floats) to the
// next tile in the row.
.macro SAVE16x8
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \
D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \
D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA
#else
/* Load C0 */
GLD xv, , X0, C0, 0x00, X1, C0, 0x20
GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
/* Load C1 */
GLD xv, , X2, C1, 0x00, X3, C1, 0x20
GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
/* Load C2 */
GLD xv, , X4, C2, 0x00, X5, C2, 0x20
GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5
/* Load C3 */
GLD xv, , X6, C3, 0x00, X7, C3, 0x20
GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7
/* Load C4 */
GLD xv, , X0, C4, 0x00, X1, C4, 0x20
GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1
/* Load C5 */
GLD xv, , X2, C5, 0x00, X3, C5, 0x20
GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3
/* Load C6 */
GLD xv, , X4, C6, 0x00, X5, C6, 0x20
GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5
/* Load C7 */
GLD xv, , X6, C7, 0x00, X7, C7, 0x20
GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7
#endif // #if defined(TRMMKERNEL)
GST xv, , D0, C0, 0x00, D1, C0, 0x20, \
D2, C1, 0x00, D3, C1, 0x20, \
D4, C2, 0x00, D5, C2, 0x20, \
D6, C3, 0x00, D7, C3, 0x20, \
D8, C4, 0x00, D9, C4, 0x20, \
D10, C5, 0x00, D11, C5, 0x20, \
D12, C6, 0x00, D13, C6, 0x20, \
D14, C7, 0x00, D15, C7, 0x20
// Pointer arithmetic width follows the GPR length (d = 64-bit, w = 32-bit).
#if __loongarch_grlen == 64
GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
#else
GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
#endif
.endm
// m = 8, 4, 2, 1
// stride = 0x20, 0x10, 0x08, 0x04
// First K-iteration of an Mx8 tile (m = 8/4/2/1; \stride = m*4 bytes,
// i.e. 0x20/0x10/0x08/0x04). Loads m floats of A into register 0 through
// the appropriately sized view (xv full / v low 128 / f64 / f32 — all
// alias $xr0 == U0) and initializes the even accumulators D0..D14 with
// GMUL (overwrite, no zeroing needed).
.macro KERNEL1xMx8_START m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m ==2
GLD f, d, $f0, A0, 0x00
.elseif \m ==1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMUL xvf, s, D0, U0, X0, D2, U0, X1, \
D4, U0, X2, D6, U0, X3
GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
GMUL xvf, s, D8, U0, X4, D10, U0, X5, \
D12, U0, X6, D14, U0, X7
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x20
.endm
// One K-iteration of an Mx8 tile (m = 8/4/2/1; \stride = m*4 bytes):
// reload m floats of A into U0 (size-matched view of $xr0), broadcast
// the 8 B values, and accumulate into the even registers D0..D14.
// The upper vector lanes beyond m carry garbage but are never stored.
.macro KERNEL1xMx8 m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m ==2
GLD f, d, $f0, A0, 0x00
.elseif \m ==1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \
D4, U0, X2, D4, D6, U0, X3, D6
GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \
D12, U0, X6, D12, D14, U0, X7, D14
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x20
.endm
// Eight unrolled K-iterations of the Mx8 micro-kernel — identical to
// the assembler's expansion of .rept 8, spelled out explicitly.
.macro KERNEL8xMx8 m, stride
KERNEL1xMx8 \m, \stride
KERNEL1xMx8 \m, \stride
KERNEL1xMx8 \m, \stride
KERNEL1xMx8 \m, \stride
KERNEL1xMx8 \m, \stride
KERNEL1xMx8 \m, \stride
KERNEL1xMx8 \m, \stride
KERNEL1xMx8 \m, \stride
.endm
// Write back an Mx8 tile held in D0,D2,...,D14 (m = 8/4/2/1), storing
// only the m-float view of each C row. TRMM: C = alpha*AB; otherwise
// C = alpha*AB + C. C0..C7 then advance by \stride (= m*4) bytes.
// NOTE: the $vrN/$fN aliases depend on the register numbering above:
// Xk = $xr(2+k) so X0/X2/X4/X6 -> $vr2/$vr4/$vr6/$vr8 ($f2/$f4/$f6/$f8),
// and Dk = $xr(10+k) so D0..D14 -> $vr10..$vr24 ($f10..$f24).
.macro SAVEMx8 m, stride
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \
D4, D4, VALPHA, D6, D6, VALPHA, \
D8, D8, VALPHA, D10, D10, VALPHA, \
D12, D12, VALPHA, D14, D14, VALPHA
#else
/* Load C0, C1, C2, C3, C4, C5, C6, C7 */
.if \m == 8
GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00
.elseif \m == 4
GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00
.elseif \m == 2
GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
.elseif \m == 1
GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
.endif
GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \
D4, D4, VALPHA, X4, D6, D6, VALPHA, X6
.if \m == 8
GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00
.elseif \m == 4
GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00
.elseif \m == 2
GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00
.elseif \m == 1
GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00
.endif
GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \
D12, D12, VALPHA, X4, D14, D14, VALPHA, X6
#endif // #if defined(TRMMKERNEL)
.if \m == 8
GST xv, , D0, C0, 0x00, D2, C1, 0x00, \
D4, C2, 0x00, D6, C3, 0x00, \
D8, C4, 0x00, D10, C5, 0x00, \
D12, C6, 0x00, D14, C7, 0x00
.elseif \m == 4
GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \
$vr14, C2, 0x00, $vr16, C3, 0x00, \
$vr18, C4, 0x00, $vr20, C5, 0x00, \
$vr22, C6, 0x00, $vr24, C7, 0x00
.elseif \m == 2
GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \
$f14, C2, 0x00, $f16, C3, 0x00, \
$f18, C4, 0x00, $f20, C5, 0x00, \
$f22, C6, 0x00, $f24, C7, 0x00
.elseif \m == 1
GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \
$f14, C2, 0x00, $f16, C3, 0x00, \
$f18, C4, 0x00, $f20, C5, 0x00, \
$f22, C6, 0x00, $f24, C7, 0x00
.endif
// Pointer arithmetic width follows the GPR length (d = 64-bit, w = 32-bit).
#if __loongarch_grlen == 64
GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
#else
GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
#endif
.endm
// First K-iteration of a 16x4 tile: initialize D0..D7 with
// A(16 floats) * B(4 broadcast floats) via GMUL (overwrite, no zeroing).
// A0 += 0x40 (16 floats), B0 += 0x10 (4 floats).
.macro KERNEL1x16x4_START
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMUL xvf, s, D0, U0, X0, D1, U1, X0, \
D2, U0, X1, D3, U1, X1, \
D4, U0, X2, D5, U1, X2, \
D6, U0, X3, D7, U1, X3
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x10
.endm
// One K-iteration of the 16x4 micro-kernel:
//   D0..D7 += A(16 floats in U0:U1) * B(4 broadcast floats in X0..X3).
// A0 += 0x40, B0 += 0x10.
// Fix: added the missing comma between "X3" and "D7" in the GMADD list —
// the old space-separated form only assembled because GAS also splits
// macro arguments on whitespace; the comma matches the sibling macros.
.macro KERNEL1x16x4
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
D2, U0, X1, D2, D3, U1, X1, D3, \
D4, U0, X2, D4, D5, U1, X2, D5, \
D6, U0, X3, D6, D7, U1, X3, D7
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x10
.endm
// Eight unrolled K-iterations of the 16x4 micro-kernel — explicit
// expansion of .rept 8, behaviorally identical.
.macro KERNEL8x16x4
KERNEL1x16x4
KERNEL1x16x4
KERNEL1x16x4
KERNEL1x16x4
KERNEL1x16x4
KERNEL1x16x4
KERNEL1x16x4
KERNEL1x16x4
.endm
// Write back a 16x4 tile (D0/D1 -> C0, D2/D3 -> C1, D4/D5 -> C2,
// D6/D7 -> C3). TRMM: C = alpha*AB; otherwise C = alpha*AB + C.
// Each C pointer then advances 0x40 bytes (16 floats).
.macro SAVE16x4
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA
#else
/* Load C0 */
GLD xv, , X0, C0, 0x00, X1, C0, 0x20
GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
/* Load C1 */
GLD xv, , X2, C1, 0x00, X3, C1, 0x20
GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
/* Load C2 */
GLD xv, , X4, C2, 0x00, X5, C2, 0x20
GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5
/* Load C3 */
GLD xv, , X6, C3, 0x00, X7, C3, 0x20
GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7
#endif // #if defined(TRMMKERNEL)
GST xv, , D0, C0, 0x00, D1, C0, 0x20, \
D2, C1, 0x00, D3, C1, 0x20, \
D4, C2, 0x00, D5, C2, 0x20, \
D6, C3, 0x00, D7, C3, 0x20
// Pointer arithmetic width follows the GPR length (d = 64-bit, w = 32-bit).
#if __loongarch_grlen == 64
GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
#else
GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
#endif
.endm
// m = 8, 4, 2, 1
// stride = 0x20, 0x10, 0x08, 0x04
// First K-iteration of an Mx4 tile (m = 8/4/2/1; \stride = m*4 bytes):
// load m floats of A into U0 (size-matched view of $xr0), broadcast the
// 4 B values, initialize D0/D2/D4/D6 with GMUL. B0 += 0x10.
.macro KERNEL1xMx4_START m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m ==2
GLD f, d, $f0, A0, 0x00
.elseif \m ==1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMUL xvf, s, D0, U0, X0, D2, U0, X1, \
D4, U0, X2, D6, U0, X3
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x10
.endm
// One K-iteration of an Mx4 tile (m = 8/4/2/1; \stride = m*4 bytes):
// D0/D2/D4/D6 += U0 * X0..X3. Lanes beyond m hold garbage but are
// never stored by SAVEMx4.
.macro KERNEL1xMx4 m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m ==2
GLD f, d, $f0, A0, 0x00
.elseif \m ==1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \
D4, U0, X2, D4, D6, U0, X3, D6
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x10
.endm
// Eight unrolled K-iterations of the Mx4 micro-kernel — explicit
// expansion of .rept 8, behaviorally identical.
.macro KERNEL8xMx4 m, stride
KERNEL1xMx4 \m, \stride
KERNEL1xMx4 \m, \stride
KERNEL1xMx4 \m, \stride
KERNEL1xMx4 \m, \stride
KERNEL1xMx4 \m, \stride
KERNEL1xMx4 \m, \stride
KERNEL1xMx4 \m, \stride
KERNEL1xMx4 \m, \stride
.endm
// Write back an Mx4 tile held in D0/D2/D4/D6 (m = 8/4/2/1), storing only
// the m-float view of each C row. TRMM: C = alpha*AB; otherwise
// C = alpha*AB + C. C0..C3 then advance by \stride (= m*4) bytes.
// NOTE: $vr/$f aliases rely on Xk = $xr(2+k) and Dk = $xr(10+k).
.macro SAVEMx4 m, stride
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \
D4, D4, VALPHA, D6, D6, VALPHA
#else
/* Load C0, C1, C2, C3 */
.if \m == 8
GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00
.elseif \m == 4
GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00
.elseif \m == 2
GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
.elseif \m == 1
GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
.endif
GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \
D4, D4, VALPHA, X4, D6, D6, VALPHA, X6
#endif // #if defined(TRMMKERNEL)
.if \m == 8
GST xv, , D0, C0, 0x00, D2, C1, 0x00, \
D4, C2, 0x00, D6, C3, 0x00
.elseif \m == 4
GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \
$vr14, C2, 0x00, $vr16, C3, 0x00
.elseif \m == 2
GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \
$f14, C2, 0x00, $f16, C3, 0x00
.elseif \m == 1
GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \
$f14, C2, 0x00, $f16, C3, 0x00
.endif
// Pointer arithmetic width follows the GPR length (d = 64-bit, w = 32-bit).
#if __loongarch_grlen == 64
GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
#else
GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
#endif
.endm
// First K-iteration of a 16x2 tile: initialize D0..D3 with
// A(16 floats) * B(2 broadcast floats) via GMUL.
// A0 += 0x40, B0 += 0x08 (2 floats).
.macro KERNEL1x16x2_START
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
GMUL xvf, s, D0, U0, X0, D1, U1, X0, \
D2, U0, X1, D3, U1, X1
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x08
.endm
// One K-iteration of the 16x2 micro-kernel:
// D0..D3 += A(16 floats in U0:U1) * B(2 broadcast floats in X0/X1).
// A0 += 0x40, B0 += 0x08.
.macro KERNEL1x16x2
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
D2, U0, X1, D2, D3, U1, X1, D3
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x08
.endm
// Eight unrolled K-iterations of the 16x2 micro-kernel — explicit
// expansion of .rept 8, behaviorally identical.
.macro KERNEL8x16x2
KERNEL1x16x2
KERNEL1x16x2
KERNEL1x16x2
KERNEL1x16x2
KERNEL1x16x2
KERNEL1x16x2
KERNEL1x16x2
KERNEL1x16x2
.endm
// Write back a 16x2 tile (D0/D1 -> C0, D2/D3 -> C1).
// TRMM: C = alpha*AB; otherwise C = alpha*AB + C.
// C0/C1 then advance 0x40 bytes (16 floats).
.macro SAVE16x2
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
#else
/* Load C0 */
GLD xv, , X0, C0, 0x00, X1, C0, 0x20
GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
/* Load C1 */
GLD xv, , X2, C1, 0x00, X3, C1, 0x20
GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
#endif // #if defined(TRMMKERNEL)
GST xv, , D0, C0, 0x00, D1, C0, 0x20, \
D2, C1, 0x00, D3, C1, 0x20
// Pointer arithmetic width follows the GPR length (d = 64-bit, w = 32-bit).
#if __loongarch_grlen == 64
GADDI , d, C0, C0, 0x40, C1, C1, 0x40
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, 0x40, C1, C1, 0x40
#else
GADDI , d, C0, C0, 0x40, C1, C1, 0x40
#endif
.endm
// m = 8, 4, 2, 1
// stride = 0x20, 0x10, 0x08, 0x04
// First K-iteration of an Mx2 tile (m = 8/4/2/1; \stride = m*4 bytes):
// load m floats of A into U0 (size-matched view of $xr0), broadcast the
// 2 B values, initialize D0/D2 with GMUL. B0 += 0x08.
.macro KERNEL1xMx2_START m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m ==2
GLD f, d, $f0, A0, 0x00
.elseif \m ==1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
GMUL xvf, s, D0, U0, X0, D2, U0, X1
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x08
.endm
// One K-iteration of an Mx2 tile (m = 8/4/2/1; \stride = m*4 bytes):
// D0/D2 += U0 * X0/X1. Lanes beyond m hold garbage but are never stored.
.macro KERNEL1xMx2 m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m ==2
GLD f, d, $f0, A0, 0x00
.elseif \m ==1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x08
.endm
// Eight unrolled K-iterations of the Mx2 micro-kernel — explicit
// expansion of .rept 8, behaviorally identical.
.macro KERNEL8xMx2 m, stride
KERNEL1xMx2 \m, \stride
KERNEL1xMx2 \m, \stride
KERNEL1xMx2 \m, \stride
KERNEL1xMx2 \m, \stride
KERNEL1xMx2 \m, \stride
KERNEL1xMx2 \m, \stride
KERNEL1xMx2 \m, \stride
KERNEL1xMx2 \m, \stride
.endm
// Write back an Mx2 tile held in D0/D2 (m = 8/4/2/1), storing only the
// m-float view of each C row. TRMM: C = alpha*AB; otherwise
// C = alpha*AB + C. C0/C1 then advance by \stride (= m*4) bytes.
// NOTE: $vr/$f aliases rely on Xk = $xr(2+k) and Dk = $xr(10+k).
.macro SAVEMx2 m, stride
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA
#else
/* Load C0, C1 */
.if \m == 8
GLD xv, , X0, C0, 0x00, X2, C1, 0x00
.elseif \m == 4
GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00
.elseif \m == 2
GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00
.elseif \m == 1
GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00
.endif
GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2
#endif // #if defined(TRMMKERNEL)
.if \m == 8
GST xv, , D0, C0, 0x00, D2, C1, 0x00
.elseif \m == 4
GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00
.elseif \m == 2
GST f, d, $f10, C0, 0x00, $f12, C1, 0x00
.elseif \m == 1
GST f, s, $f10, C0, 0x00, $f12, C1, 0x00
.endif
// Pointer arithmetic width follows the GPR length (d = 64-bit, w = 32-bit).
#if __loongarch_grlen == 64
GADDI , d, C0, C0, \stride, C1, C1, \stride
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, \stride, C1, C1, \stride
#else
GADDI , d, C0, C0, \stride, C1, C1, \stride
#endif
.endm
// First K-iteration of a 16x1 tile: initialize D0/D1 with
// A(16 floats) * B(1 broadcast float). A0 += 0x40, B0 += 0x04.
.macro KERNEL1x16x1_START
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00
GMUL xvf, s, D0, U0, X0, D1, U1, X0
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x04
.endm
// One K-iteration of the 16x1 micro-kernel:
// D0/D1 += A(16 floats in U0:U1) * B(1 broadcast float in X0).
// A0 += 0x40, B0 += 0x04.
.macro KERNEL1x16x1
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00
GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x04
.endm
// Eight unrolled K-iterations of the 16x1 micro-kernel — explicit
// expansion of .rept 8, behaviorally identical.
.macro KERNEL8x16x1
KERNEL1x16x1
KERNEL1x16x1
KERNEL1x16x1
KERNEL1x16x1
KERNEL1x16x1
KERNEL1x16x1
KERNEL1x16x1
KERNEL1x16x1
.endm
// Write back a 16x1 tile (D0/D1 -> C0). TRMM: C = alpha*AB; otherwise
// C = alpha*AB + C. C0 then advances 0x40 bytes (16 floats).
.macro SAVE16x1
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA
#else
/* Load C0 */
GLD xv, , X0, C0, 0x00, X1, C0, 0x20
GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
#endif // #if defined(TRMMKERNEL)
GST xv, , D0, C0, 0x00, D1, C0, 0x20
// Pointer arithmetic width follows the GPR length (d = 64-bit, w = 32-bit).
#if __loongarch_grlen == 64
GADDI , d, C0, C0, 0x40
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, 0x40
#else
GADDI , d, C0, C0, 0x40
#endif
.endm
// m = 8, 4, 2, 1
// stride = 0x20, 0x10, 0x08, 0x04
// First K-iteration of an Mx1 tile (m = 8/4/2/1; \stride = m*4 bytes):
// load m floats of A into U0 (size-matched view of $xr0), broadcast the
// single B value, initialize D0 with GMUL. B0 += 0x04.
.macro KERNEL1xMx1_START m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m ==2
GLD f, d, $f0, A0, 0x00
.elseif \m ==1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00
GMUL xvf, s, D0, U0, X0
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x04
.endm
// One K-iteration of an Mx1 tile (m = 8/4/2/1; \stride = m*4 bytes):
// D0 += U0 * X0. Lanes beyond m hold garbage but are never stored.
.macro KERNEL1xMx1 m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m ==2
GLD f, d, $f0, A0, 0x00
.elseif \m ==1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00
GMADD xvf, s, D0, U0, X0, D0
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x04
.endm
// Eight unrolled K-iterations of the Mx1 micro-kernel — explicit
// expansion of .rept 8, behaviorally identical.
.macro KERNEL8xMx1 m, stride
KERNEL1xMx1 \m, \stride
KERNEL1xMx1 \m, \stride
KERNEL1xMx1 \m, \stride
KERNEL1xMx1 \m, \stride
KERNEL1xMx1 \m, \stride
KERNEL1xMx1 \m, \stride
KERNEL1xMx1 \m, \stride
KERNEL1xMx1 \m, \stride
.endm
// Write back an Mx1 tile held in D0 (m = 8/4/2/1), storing only the
// m-float view of C0. TRMM: C = alpha*AB; otherwise C = alpha*AB + C.
// C0 then advances by \stride (= m*4) bytes.
// NOTE: $vr/$f aliases rely on X0 = $xr2 and D0 = $xr10.
.macro SAVEMx1 m, stride
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA
#else
/* Load C0 (only one C row exists for N = 1) */
.if \m == 8
GLD xv, , X0, C0, 0x00
.elseif \m == 4
GLD v, , $vr2, C0, 0x00
.elseif \m == 2
GLD f, d, $f2, C0, 0x00
.elseif \m == 1
GLD f, s, $f2, C0, 0x00
.endif
GMADD xvf, s, D0, D0, VALPHA, X0
#endif // #if defined(TRMMKERNEL)
.if \m == 8
GST xv, , D0, C0, 0x00
.elseif \m == 4
GST v, , $vr10, C0, 0x00
.elseif \m == 2
GST f, d, $f10, C0, 0x00
.elseif \m == 1
GST f, s, $f10, C0, 0x00
.endif
// Pointer arithmetic width follows the GPR length (d = 64-bit, w = 32-bit).
#if __loongarch_grlen == 64
GADDI , d, C0, C0, \stride
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, \stride
#else
GADDI , d, C0, C0, \stride
#endif
.endm
PROLOGUE
push_if_used 26, 32
xvreplve0.w VALPHA, $xr0
#if defined (TRMMKERNEL) && !defined(LEFT)
PTR_SUB OFF, ZERO, OFFSET
#else
xor OFF, OFF, OFF
#endif
/* if (!(N >> 3)) goto L_N7 */
PTR_SRAI J, N, 3 /* J = bn >> 3 */
andi N, N, 0x07
beq ZERO, J, .L_N7
.L_N8: /* J -- */
move C0, C
move A0, A
PTR_SLLI T0, LDC, 2
PTR_ADDI J, J, -1 /* J-- */
#if __loongarch_grlen == 64
GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
C6, C5, T0, C7, C6, T0
#elif __loongarch_grlen == 32
GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
C6, C5, T0, C7, C6, T0
#else
GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
C6, C5, T0, C7, C6, T0
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
move OFF, OFFSET
#endif
/* if (!(M >> 4)) goto L_M8 */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_M8
.align 5
.L_M16: /* I-- */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x06
PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 16
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1x16x8_START
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M16_L7 */
beq ZERO,TL, .L_M16_L7
.align 5
.L_M16_TL1:
KERNEL8x16x8
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_M16_TL1
.L_M16_L7:
andi TL, L, 7
beq TL, ZERO,.L_M16_L0
.align 5
.L_M16_L71:
KERNEL1x16x8
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_M16_L71
.L_M16_L0:
SAVE16x8
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -16
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x06
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x10 /* number of values in A */
#endif
#endif // #if defined(TRMMKERNEL)
PTR_ADDI I, I, -1 /* I-- */
blt ZERO,I, .L_M16
.L_M8:
/* We have done M & 16, considering M=8/4/2/1 */
andi I, M, 15
beq ZERO,I, .L_M0
andi I, M, 8
beq ZERO,I, .L_M4
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x05
PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 8
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif // #if defined(TRMMKERNEL)
KERNEL1xMx8_START 8, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M8_L7 */
beq ZERO,TL, .L_M8_L7
.align 5
.L_M8_TL1:
KERNEL8xMx8 8, 0x20
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_M8_TL1
.L_M8_L7:
/* if (!(L & 7)) goto L_M8_L0 */
andi TL, L, 7
beq TL, ZERO,.L_M8_L0
.align 5
.L_M8_L71:
KERNEL1xMx8 8, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_M8_L71
.L_M8_L0:
SAVEMx8 8, 0x20
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -8
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x05
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
/* number of values in A */
PTR_ADDI OFF, OFF, 0x08
#endif
#endif // #if defined(TRMMKERNEL)
.L_M4:
andi I, M, 4
beq ZERO,I, .L_M2
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x04
PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 4
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx8_START 4, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M4_L7 */
beq ZERO,TL, .L_M4_L7
.align 5
.L_M4_TL1:
KERNEL8xMx8 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_M4_TL1
.L_M4_L7:
/* if (!(L & 7)) goto L_M4_L0 */
andi TL, L, 7
beq TL, ZERO,.L_M4_L0
.L_M4_L71:
KERNEL1xMx8 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_M4_L71
.L_M4_L0:
SAVEMx8 4, 0x10
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -4
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x04
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
/* number of values in A */
PTR_ADDI OFF, OFF, 0x04
#endif
#endif // #if defined(TRMMKERNEL)
.L_M2:
andi I, M, 2
beq ZERO,I, .L_M1
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x03
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 2
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx8_START 2, 0x08
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M2_L7 */
beq ZERO,TL, .L_M2_L7
.align 5
.L_M2_TL1:
KERNEL8xMx8 2, 0x08
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_M2_TL1
.L_M2_L7:
/* if (!(L & 7)) goto L_M2_L0 */
andi TL, L, 7
beq TL, ZERO,.L_M2_L0
.align 5
.L_M2_L71:
KERNEL1xMx8 2, 0x08
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_M2_L71
.L_M2_L0:
SAVEMx8 2, 0x08
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -2
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x03
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
/* number of values in A */
PTR_ADDI OFF, OFF, 0x02
#endif
#endif // #if defined(TRMMKERNEL)
.L_M1:
andi I, M, 1
beq ZERO,I, .L_M0
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x02
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 1
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx8_START 1, 0x04
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M1_L7 */
beq ZERO,TL, .L_M1_L7
.align 5
.L_M1_TL1:
KERNEL8xMx8 1, 0x04
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_M1_TL1
.L_M1_L7:
/* if (!(L & 7)) goto L_M1_L0 */
andi TL, L, 7
beq TL, ZERO,.L_M1_L0
.align 5
.L_M1_L71:
KERNEL1xMx8 1, 0x04
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_M1_L71
.L_M1_L0:
SAVEMx8 1, 0x04
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -1
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x02
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
/* number of values in A */
PTR_ADDI OFF, OFF, 0x01
#endif
#endif // #if defined(TRMMKERNEL)
.L_M0:
/* Add stride for B and C
* B += (K * 32)
* C += (LDC * 32)
*/
PTR_SLLI T0, K, 5
PTR_SLLI T1, LDC, 5
PTR_ADD B, B, T0
PTR_ADD C, C, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
PTR_ADDI OFF, OFF, 0x08 /* number of values in B */
#endif
blt ZERO, J, .L_N8
.L_N7:
/* Handle the N%8 remainder, starting with a 4-column panel if N & 4. */
andi J, N, 4
beq ZERO, J, .L_N3
.L_N4:
move C0, C
move A0, A
PTR_SLLI T0, LDC, 2 /* T0 = LDC * 4 bytes = one column stride of C */
/* C1..C3 = successive columns of the 4-wide C panel. */
#if __loongarch_grlen == 64
GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
#elif __loongarch_grlen == 32
GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0
#else
GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
move OFF, OFFSET
#endif
/* if (!(M >> 4)) goto L_N4_M8 */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_N4_M8
.align 5
.L_N4_M16:
/* 16x4 tile. TRMM: position A0/B0 at this tile's triangular panel start
 * and compute L = number of K iterations for the tile. */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x06 /* OFF * 64 bytes = OFF * 16 floats */
PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */
PTR_SLLI T0, OFF, 0x04 /* OFF * 16 bytes = OFF * 4 floats */
PTR_ADD B0, B, T0 /* B0 += 4 * OFF */
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 16
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1x16x4_START /* first K iteration (also initializes accumulators), hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_L7 */
beq ZERO,TL, .L_N4_M16_L7
.align 5
.L_N4_M16_TL1: /* TL-- */
KERNEL8x16x4
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N4_M16_TL1
.L_N4_M16_L7:
/* if (!(L & 7)) goto L_N4_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N4_M16_L0
.align 5
.L_N4_M16_L71:
KERNEL1x16x4
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N4_M16_L71
.L_N4_M16_L0:
SAVE16x4 /* store the 16x4 tile to C0..C3 */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -16
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x06 /* L * 16 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04 /* L * 4 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x10
#endif
#endif // #if defined(TRMMKERNEL)
PTR_ADDI I, I, -1 /* I-- */
blt ZERO,I, .L_N4_M16
.L_N4_M8:
/* We have done M & 16, considering M=8/4/2/1 */
andi I, M, 15
beq ZERO,I, .L_N4_M0
andi I, M, 8
beq ZERO,I, .L_N4_M4
/* 8x4 tile. Generic KERNEL*xMx4 macros take: M, A-step in bytes (8 floats = 0x20). */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x05 /* OFF * 32 bytes = OFF * 8 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x04 /* OFF * 16 bytes = OFF * 4 floats */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 8
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx4_START 8, 0x20 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M8_L7 */
beq ZERO,TL, .L_N4_M8_L7
.align 5
.L_N4_M8_TL1: /* TL-- */
KERNEL8xMx4 8, 0x20
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N4_M8_TL1
.L_N4_M8_L7:
/* if (!(L & 7)) goto L_N4_M8_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N4_M8_L0
.align 5
.L_N4_M8_L71:
KERNEL1xMx4 8, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N4_M8_L71
.L_N4_M8_L0:
SAVEMx4 8, 0x20 /* store the 8x4 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -8
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x05 /* L * 8 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04 /* L * 4 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x08
#endif
#endif // #if defined(TRMMKERNEL)
.L_N4_M4:
/* 4x4 tile (A-step 0x10 = 4 floats). */
andi I, M, 4
beq ZERO,I, .L_N4_M2
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x04 /* OFF * 16 bytes = OFF * 4 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x04 /* same scale for B: 4 floats per K step */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 4
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx4_START 4, 0x10 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M4_L7 */
beq ZERO,TL, .L_N4_M4_L7
.align 5
.L_N4_M4_TL1: /* TL-- */
KERNEL8xMx4 4, 0x10
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N4_M4_TL1
.L_N4_M4_L7:
/* if (!(L & 7)) goto L_N4_M4_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N4_M4_L0
.align 5
.L_N4_M4_L71:
KERNEL1xMx4 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N4_M4_L71
.L_N4_M4_L0:
SAVEMx4 4, 0x10 /* store the 4x4 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -4
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x04 /* L * 4 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04 /* L * 4 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x04
#endif
#endif // #if defined(TRMMKERNEL)
.L_N4_M2:
/* 2x4 tile (A-step 0x08 = 2 floats). */
andi I, M, 2
beq ZERO,I, .L_N4_M1
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x03 /* OFF * 8 bytes = OFF * 2 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x04 /* OFF * 16 bytes = OFF * 4 floats */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 2
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx4_START 2, 0x08 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M2_L7 */
beq ZERO,TL, .L_N4_M2_L7
.align 5
.L_N4_M2_TL1: /* TL-- */
KERNEL8xMx4 2, 0x08
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N4_M2_TL1
.L_N4_M2_L7:
/* if (!(L & 7)) goto L_N4_M2_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N4_M2_L0
.align 5
.L_N4_M2_L71:
KERNEL1xMx4 2, 0x08
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N4_M2_L71
.L_N4_M2_L0:
SAVEMx4 2, 0x08 /* store the 2x4 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -2
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x03 /* L * 2 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04 /* L * 4 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x02
#endif
#endif // #if defined(TRMMKERNEL)
.L_N4_M1:
/* 1x4 tile (A-step 0x04 = 1 float). */
andi I, M, 1
beq ZERO,I, .L_N4_M0
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x02 /* OFF * 4 bytes = OFF * 1 float */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x04 /* OFF * 16 bytes = OFF * 4 floats */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 1
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx4_START 1, 0x04 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M1_L7 */
beq ZERO,TL, .L_N4_M1_L7
.align 5
.L_N4_M1_TL1: /* TL-- */
KERNEL8xMx4 1, 0x04
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N4_M1_TL1
.L_N4_M1_L7:
/* if (!(L & 7)) goto L_N4_M1_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N4_M1_L0
.align 5
.L_N4_M1_L71:
KERNEL1xMx4 1, 0x04
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N4_M1_L71
.L_N4_M1_L0:
SAVEMx4 1, 0x04 /* store the 1x4 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -1
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x02 /* L * 1 float */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04 /* L * 4 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x01
#endif
#endif // #if defined(TRMMKERNEL)
.L_N4_M0:
/* Add stride for B and C
 * B += 4 * K
 * C += 4 * LDC
 * (<<4 = 16 bytes = 4 floats: advance past the 4-column panel.)
 */
PTR_SLLI T0, K, 4
PTR_SLLI T1, LDC, 4
PTR_ADD B, B, T0
PTR_ADD C, C, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
PTR_ADDI OFF, OFF, 0x04
#endif
/* We must reinit I */
/* NOTE(review): I is also recomputed at the .L_N2/.L_N1 entries below,
 * so this reinit looks redundant; kept unchanged. */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
.L_N3:
/* 2-column panel if N & 2. */
andi J, N, 2
beq ZERO, J, .L_N1
.L_N2:
move C0, C
move A0, A
PTR_SLLI T0, LDC, 2 /* T0 = LDC * 4 bytes = one column stride of C */
PTR_ADD C1, C0, T0 /* C1 = second column of the panel */
#if defined(TRMMKERNEL) && defined(LEFT)
move OFF, OFFSET
#endif
/* if (!(M >> 4)) goto L_N2_M8 */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_N2_M8
.align 5
.L_N2_M16:
/* 16x2 tile. */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x06 /* OFF * 64 bytes = OFF * 16 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x03 /* OFF * 8 bytes = OFF * 2 floats */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 16
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1x16x2_START /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M16_L7 */
beq ZERO,TL, .L_N2_M16_L7
.align 5
.L_N2_M16_TL1: /* TL-- */
KERNEL8x16x2
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N2_M16_TL1
.L_N2_M16_L7:
/* if (!(L & 7)) goto L_N2_M16_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N2_M16_L0
.align 5
.L_N2_M16_L71:
KERNEL1x16x2
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N2_M16_L71
.L_N2_M16_L0:
SAVE16x2 /* store the 16x2 tile to C0/C1 */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -16
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x06 /* L * 16 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x03 /* L * 2 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x10
#endif
#endif // #if defined(TRMMKERNEL)
PTR_ADDI I, I, -1 /* I-- */
blt ZERO,I, .L_N2_M16
.L_N2_M8:
/* We have done M & 16, considering M=8/4/2/1 */
andi I, M, 15
beq ZERO,I, .L_N2_M0
andi I, M, 8
beq ZERO,I, .L_N2_M4
/* 8x2 tile (A-step 0x20 = 8 floats). */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x05 /* OFF * 32 bytes = OFF * 8 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x03 /* OFF * 8 bytes = OFF * 2 floats */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 8
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx2_START 8, 0x20 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M8_L7 */
beq ZERO,TL, .L_N2_M8_L7
.align 5
.L_N2_M8_TL1: /* TL-- */
KERNEL8xMx2 8, 0x20
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N2_M8_TL1
.L_N2_M8_L7:
/* if (!(L & 7)) goto L_N2_M8_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N2_M8_L0
.align 5
.L_N2_M8_L71:
KERNEL1xMx2 8, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N2_M8_L71
.L_N2_M8_L0:
SAVEMx2 8, 0x20 /* store the 8x2 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -8
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x05 /* L * 8 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x03 /* L * 2 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x08
#endif
#endif // #if defined(TRMMKERNEL)
.L_N2_M4:
/* 4x2 tile (A-step 0x10 = 4 floats). */
andi I, M, 4
beq ZERO,I, .L_N2_M2
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x04 /* OFF * 16 bytes = OFF * 4 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x03 /* OFF * 8 bytes = OFF * 2 floats */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 4
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx2_START 4, 0x10 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M4_L7 */
beq ZERO,TL, .L_N2_M4_L7
.align 5
.L_N2_M4_TL1: /* TL-- */
KERNEL8xMx2 4, 0x10
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N2_M4_TL1
.L_N2_M4_L7:
/* if (!(L & 7)) goto L_N2_M4_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N2_M4_L0
.align 5
.L_N2_M4_L71:
KERNEL1xMx2 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N2_M4_L71
.L_N2_M4_L0:
SAVEMx2 4, 0x10 /* store the 4x2 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -4
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x04 /* L * 4 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x03 /* L * 2 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x04
#endif
#endif // #if defined(TRMMKERNEL)
.L_N2_M2:
/* 2x2 tile (A-step 0x08 = 2 floats). A and B advance at the same
 * rate here, so T0 (OFF * 8 bytes) is reused for both pointers. */
andi I, M, 2
beq ZERO,I, .L_N2_M1
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x03 /* OFF * 8 bytes = OFF * 2 floats (A and B) */
PTR_ADD A0, A0, T0
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 2
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx2_START 2, 0x08 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M2_L7 */
beq ZERO,TL, .L_N2_M2_L7
.align 5
.L_N2_M2_TL1: /* TL-- */
KERNEL8xMx2 2, 0x08
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N2_M2_TL1
.L_N2_M2_L7:
/* if (!(L & 7)) goto L_N2_M2_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N2_M2_L0
.align 5
.L_N2_M2_L71:
KERNEL1xMx2 2, 0x08
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N2_M2_L71
.L_N2_M2_L0:
SAVEMx2 2, 0x08 /* store the 2x2 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -2
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x03 /* L * 2 floats (A and B) */
PTR_ADD A0, A0, T0
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x02
#endif
#endif // #if defined(TRMMKERNEL)
.L_N2_M1:
/* 1x2 tile (A-step 0x04 = 1 float). */
andi I, M, 1
beq ZERO,I, .L_N2_M0
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x02 /* OFF * 4 bytes = OFF * 1 float */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x03 /* OFF * 8 bytes = OFF * 2 floats */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 1
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx2_START 1, 0x04 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M1_L7 */
beq ZERO,TL, .L_N2_M1_L7
.align 5
.L_N2_M1_TL1: /* TL-- */
KERNEL8xMx2 1, 0x04
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N2_M1_TL1
.L_N2_M1_L7:
/* if (!(L & 7)) goto L_N2_M1_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N2_M1_L0
.align 5
.L_N2_M1_L71:
KERNEL1xMx2 1, 0x04
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N2_M1_L71
.L_N2_M1_L0:
SAVEMx2 1, 0x04 /* store the 1x2 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -1
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x02 /* L * 1 float */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x03 /* L * 2 floats */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x01
#endif
#endif // #if defined(TRMMKERNEL)
.L_N2_M0:
/* Add stride for B and C
 * B += 2 * K
 * C += 2 * LDC
 * (<<3 = 8 bytes = 2 floats: advance past the 2-column panel.)
 */
PTR_SLLI T0, K, 3
PTR_SLLI T1, LDC, 3
PTR_ADD B, B, T0
PTR_ADD C, C, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
PTR_ADDI OFF, OFF, 0x02
#endif
/* We must reinit I */
/* NOTE(review): I is recomputed again at .L_N1 entry; kept unchanged. */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
.L_N1:
/* Final single-column panel if N & 1. Only one C pointer (C0) is needed. */
andi J, N, 1
beq ZERO, J, .L_N0
move C0, C
move A0, A
#if defined(TRMMKERNEL) && defined(LEFT)
move OFF, OFFSET
#endif
/* if (!(M >> 4)) goto L_N1_M8 */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_N1_M8
.L_N1_M16:
/* 16x1 tile. */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x06 /* OFF * 64 bytes = OFF * 16 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x02 /* OFF * 4 bytes = OFF * 1 float */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 16
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1x16x1_START /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M16_L7 */
beq ZERO,TL, .L_N1_M16_L7
.align 5
.L_N1_M16_TL1: /* TL-- */
KERNEL8x16x1
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N1_M16_TL1
.L_N1_M16_L7:
/* if (!(L & 7)) goto L_N1_M16_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N1_M16_L0
.align 5
.L_N1_M16_L71:
KERNEL1x16x1
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N1_M16_L71
.L_N1_M16_L0:
SAVE16x1 /* store the 16x1 tile to C0 */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -16
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x06 /* L * 16 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x02 /* L * 1 float */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x10
#endif
#endif // #if defined(TRMMKERNEL)
PTR_ADDI I, I, -1 /* I-- */
blt ZERO,I, .L_N1_M16
.L_N1_M8:
/* We have done M & 16, considering M=8/4/2/1 */
andi I, M, 15
beq ZERO,I, .L_N1_M0
andi I, M, 8
beq ZERO,I, .L_N1_M4
/* 8x1 tile (A-step 0x20 = 8 floats). */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x05 /* OFF * 32 bytes = OFF * 8 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x02 /* OFF * 4 bytes = OFF * 1 float */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 8
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx1_START 8, 0x20 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M8_L7 */
beq ZERO,TL, .L_N1_M8_L7
.align 5
.L_N1_M8_TL1: /* TL-- */
KERNEL8xMx1 8, 0x20
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N1_M8_TL1
.L_N1_M8_L7:
/* if (!(L & 7)) goto L_N1_M8_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N1_M8_L0
.align 5
.L_N1_M8_L71:
KERNEL1xMx1 8, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N1_M8_L71
.L_N1_M8_L0:
SAVEMx1 8, 0x20 /* store the 8x1 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -8
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x05 /* L * 8 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x02 /* L * 1 float */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x08
#endif
#endif // #if defined(TRMMKERNEL)
.L_N1_M4:
/* 4x1 tile (A-step 0x10 = 4 floats). */
andi I, M, 4
beq ZERO,I, .L_N1_M2
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x04 /* OFF * 16 bytes = OFF * 4 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x02 /* OFF * 4 bytes = OFF * 1 float */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 4
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx1_START 4, 0x10 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M4_L7 */
beq ZERO,TL, .L_N1_M4_L7
.align 5
.L_N1_M4_TL1: /* TL-- */
KERNEL8xMx1 4, 0x10
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N1_M4_TL1
.L_N1_M4_L7:
/* if (!(L & 7)) goto L_N1_M4_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N1_M4_L0
.align 5
.L_N1_M4_L71:
KERNEL1xMx1 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N1_M4_L71
.L_N1_M4_L0:
SAVEMx1 4, 0x10 /* store the 4x1 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -4
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x04 /* L * 4 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x02 /* L * 1 float */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x04
#endif
#endif // #if defined(TRMMKERNEL)
.L_N1_M2:
/* 2x1 tile (A-step 0x08 = 2 floats). */
andi I, M, 2
beq ZERO,I, .L_N1_M1
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x03 /* OFF * 8 bytes = OFF * 2 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x02 /* OFF * 4 bytes = OFF * 1 float */
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 2
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx1_START 2, 0x08 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M2_L7 */
beq ZERO,TL, .L_N1_M2_L7
.align 5
.L_N1_M2_TL1: /* TL-- */
KERNEL8xMx1 2, 0x08
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N1_M2_TL1
.L_N1_M2_L7:
/* if (!(L & 7)) goto L_N1_M2_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N1_M2_L0
.align 5
.L_N1_M2_L71:
KERNEL1xMx1 2, 0x08
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N1_M2_L71
.L_N1_M2_L0:
SAVEMx1 2, 0x08 /* store the 2x1 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -2
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x03 /* L * 2 floats */
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x02 /* L * 1 float */
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x02
#endif
#endif // #if defined(TRMMKERNEL)
.L_N1_M1:
/* 1x1 tile (A-step 0x04 = 1 float). A and B advance at the same rate,
 * so T0 (OFF * 4 bytes) is reused for both pointers. */
andi I, M, 1
beq ZERO,I, .L_N1_M0
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x02 /* OFF * 4 bytes = OFF * 1 float (A and B) */
PTR_ADD A0, A0, T0
PTR_ADD B0, B, T0
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 1
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx1_START 1, 0x04 /* first K iteration, hence L-1 below */
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M1_L7 */
beq ZERO,TL, .L_N1_M1_L7
.align 5
.L_N1_M1_TL1: /* TL-- */
KERNEL8xMx1 1, 0x04
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_N1_M1_TL1
.L_N1_M1_L7:
/* if (!(L & 7)) goto L_N1_M1_L0 */
andi TL, L, 7
beq TL, ZERO,.L_N1_M1_L0
.align 5
.L_N1_M1_L71:
KERNEL1xMx1 1, 0x04
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_N1_M1_L71
.L_N1_M1_L0:
SAVEMx1 1, 0x04 /* store the 1x1 tile */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* Advance A0/B0 past the unconsumed part of the K panel. */
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -1
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x02 /* L * 1 float (A and B) */
PTR_ADD A0, A0, T0
PTR_ADD B0, B0, T0
#endif
#ifdef LEFT
PTR_ADDI OFF, OFF, 0x01
#endif
#endif // #if defined(TRMMKERNEL)
.L_N1_M0:
.L_N0:
/* All N panels done: restore the registers saved by the matching
 * push in the prologue (not visible in this chunk) and return. */
pop_if_used 26, 32
jirl $r0, $r1, 0x0 /* return via $ra ($r1) */
EPILOGUE