/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/*********************************************************************
* 2023/08/23 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
* 2023/08/23 guxiwei
* Parameter:
* SGEMM_DEFAULT_UNROLL_N 8
* SGEMM_DEFAULT_UNROLL_M 16
* SGEMM_DEFAULT_P 256
* SGEMM_DEFAULT_Q 256
* SGEMM_DEFAULT_R 1024
* A_PRE 1024
* B_PRE 256 // Enabling prefetching for B results in a performance decrease; temporarily disabled.
*
*
* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000:
* 1 thread: 71.7 GFLOPS
* 2 threads: 142.6 GFLOPS
* 3 threads: 211.5 GFLOPS
* 4 threads: 265.0 GFLOPS
*********************************************************************/
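
/* What this kernel computes, as an illustrative (non-authoritative) C sketch.
 * It is not part of the build; the function name and the exact packed layout
 * below are assumptions made for the example, mirroring the parameter list
 * defined next:
 *
 *   // C = alpha * A * B + C on packed panels; ba/bb walk K-slices of the
 *   // packed A (m-wide) and B (n-wide) blocks, bc is column-major with ldc.
 *   void sgemm_kernel_ref(long bm, long bn, long bk, float alpha,
 *                         const float *ba, const float *bb,
 *                         float *bc, long ldc)
 *   {
 *       for (long j = 0; j < bn; j++)
 *           for (long i = 0; i < bm; i++) {
 *               float acc = 0.0f;
 *               for (long l = 0; l < bk; l++)
 *                   acc += ba[l * bm + i] * bb[l * bn + j];
 *               bc[j * ldc + i] += alpha * acc;
 *           }
 *   }
 */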
/* Function parameters */
#define M $r4 // param 1: bm
#define N $r5 // param 2: bn
#define K $r6 // param 3: bk
#define ALPHA $f0 // param 4: alpha
#define A $r7 // param 5: ba
#define B $r8 // param 6: bb
#define C $r9 // param 7: bc
#define LDC $r10 // param 8: ldc

#ifdef TRMMKERNEL
#define OFFSET $r11 // param 9: offset
#endif
#define OFF $r12

/* Cycle control parameters */
#define I $r13
#define J $r14
#define L $r15
#define TL $r16
/* Matrix address */
#define A0 $r17
#define B0 $r18
#define C0 $r19
#define C1 $r20
#define C2 $r23
#define C3 $r24
#define C4 $r25
#define C5 $r26
#define C6 $r27
#define C7 $r28
#define T0 $r29
#define T1 $r30
#undef ZERO
#define ZERO $r0

/* LASX Vectors
* Store 16 sets of 32-bit data from A in U0 and U1, with each register holding 8 elements.
* Use X0 through X7 to store 8 sets of 32-bit data from B, with each register holding the broadcast value of a single element.
* Use D0 to D15 to store intermediate values of the computation.
* Use VALPHA to store the broadcast value of alpha.
*/
#define U0 $xr0
#define U1 $xr1
#define X0 $xr2
#define X1 $xr3
#define X2 $xr4
#define X3 $xr5
#define X4 $xr6
#define X5 $xr7
#define X6 $xr8
#define X7 $xr9
#define D0 $xr10
#define D1 $xr11
#define D2 $xr12
#define D3 $xr13
#define D4 $xr14
#define D5 $xr15
#define D6 $xr16
#define D7 $xr17
#define D8 $xr18
#define D9 $xr19
#define D10 $xr20
#define D11 $xr21
#define D12 $xr22
#define D13 $xr23
#define D14 $xr24
#define D15 $xr25
#define VALPHA $xr26

/* Prefetch interval */
#define A_PRE 0x400
#define B_PRE 0x100
// Loops outline:
// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */
// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; otherwise, enter .L_M16. */
// | | .L_M16_TL1 | |
// | | .L_M16_L7 | The entire core loop of the function, KERNEL16x8 |
// | | .L_M16_L71 | |
// | | .L_M16_L0 ---------------- |
// | .L_M8 |
// | | .L_M8_TL1 | |
// | | .L_M8_L7 | KERNEL8x8 |
// | | .L_M8_L71 | |
// | | .L_M8_L0 | |
// | .L_M4 |
// | | .L_M4_TL1 | |
// | | .L_M4_L7 | KERNEL4x8 |
// | | .L_M4_L71 | |
// | | .L_M4_L0 | |
// | .L_M2 |
// | | .L_M2_TL1 | |
// | | .L_M2_L7 | KERNEL2x8 |
// | | .L_M2_L71 | |
// | | .L_M2_L0 | |
// | .L_M1 |
// | | .L_M1_TL1 | |
// | | .L_M1_L7 | KERNEL1x8 |
// | | .L_M1_L71 | |
// | | .L_M1_L0 | |
// | .L_M0------------------------------------------------------------------------------------------
// .L_N7 /* if (N & 4) == 0, goto .L_N3; else, enter .L_N4 */
// .L_N4
// | .L_N4_M16 <---------------------
// | | .L_N4_M16_TL1 |
// | | .L_N4_M16_L7 | KERNEL16x4
// | | .L_N4_M16_L71 |
// | | .L_N4_M16_L0 ----------------
// | .L_N4_M8
// | | .L_N4_M8_TL1 |
// | | .L_N4_M8_L7 | KERNEL8x4
// | | .L_N4_M8_L71 |
// | | .L_N4_M8_L0 |
// | .L_N4_M4
// | | .L_N4_M4_TL1 |
// | | .L_N4_M4_L7 | KERNEL4x4
// | | .L_N4_M4_L71 |
// | | .L_N4_M4_L0 |
// | .L_N4_M2
// | | .L_N4_M2_TL1 |
// | | .L_N4_M2_L7 | KERNEL2x4
// | | .L_N4_M2_L71 |
// | | .L_N4_M2_L0 |
// | .L_N4_M1
// | | .L_N4_M1_TL1 |
// | | .L_N4_M1_L7 | KERNEL1x4
// | | .L_N4_M1_L71 |
// | | .L_N4_M1_L0 |
// | .L_N4_M0
// .L_N3 /* if (N & 2) == 0, goto .L_N1; else, enter .L_N2 */
// .L_N2
// | .L_N2_M16 <---------------------
// | | .L_N2_M16_TL1 |
// | | .L_N2_M16_L7 | KERNEL16x2
// | | .L_N2_M16_L71 |
// | | .L_N2_M16_L0 ----------------
// | .L_N2_M8
// | | .L_N2_M8_TL1 |
// | | .L_N2_M8_L7 | KERNEL8x2
// | | .L_N2_M8_L71 |
// | | .L_N2_M8_L0 |
// | .L_N2_M4
// | | .L_N2_M4_TL1 |
// | | .L_N2_M4_L7 | KERNEL4x2
// | | .L_N2_M4_L71 |
// | | .L_N2_M4_L0 |
// | .L_N2_M2
// | | .L_N2_M2_TL1 |
// | | .L_N2_M2_L7 | KERNEL2x2
// | | .L_N2_M2_L71 |
// | | .L_N2_M2_L0 |
// | .L_N2_M1
// | | .L_N2_M1_TL1 |
// | | .L_N2_M1_L7 | KERNEL1x2
// | | .L_N2_M1_L71 |
// | | .L_N2_M1_L0 |
// | .L_N2_M0
// .L_N1
// | .L_N1_M16 <---------------------
// | | .L_N1_M16_TL1 |
// | | .L_N1_M16_L7 | KERNEL16x1
// | | .L_N1_M16_L71 |
// | | .L_N1_M16_L0 ----------------
// | .L_N1_M8
// | | .L_N1_M8_TL1 |
// | | .L_N1_M8_L7 | KERNEL8x1
// | | .L_N1_M8_L71 |
// | | .L_N1_M8_L0 |
// | .L_N1_M4
// | | .L_N1_M4_TL1 |
// | | .L_N1_M4_L7 | KERNEL4x1
// | | .L_N1_M4_L71 |
// | | .L_N1_M4_L0 |
// | .L_N1_M2
// | | .L_N1_M2_TL1 |
// | | .L_N1_M2_L7 | KERNEL2x1
// | | .L_N1_M2_L71 |
// | | .L_N1_M2_L0 |
// | .L_N1_M1
// | | .L_N1_M1_TL1 |
// | | .L_N1_M1_L7 | KERNEL1x1
// | | .L_N1_M1_L71 |
// | | .L_N1_M1_L0 |
// | .L_N1_M0
// .L_N0
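
/* Control flow of the code below, restated as an assumption-level C sketch
 * (the macro names are the real ones; the loop bounds mirror the outline
 * above):
 *
 *   for (j = N >> 3; j > 0; j--) {                        // .L_N8
 *       for (i = M >> 4; i > 0; i--) {                    // .L_M16
 *           KERNEL1x16x8_START();                         // first k step, GMUL
 *           for (tl = (K - 1) >> 3; tl > 0; tl--)         // .L_M16_TL1
 *               KERNEL8x16x8();
 *           for (tl = (K - 1) & 7; tl > 0; tl--)          // .L_M16_L71
 *               KERNEL1x16x8();
 *           SAVE16x8();                                   // .L_M16_L0
 *       }
 *       // M tails follow the same pattern for 8, 4, 2, 1
 *   }
 *   // N tails: .L_N4 (n = 4), .L_N2 (n = 2), .L_N1 (n = 1)
 */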
/*************** sgemm_kernel_macros ***************/
.macro KERNEL1x16x8_START
GLD xv, , U0, A0, 0x00, U1, A0, 0x20

GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMUL xvf, s, D0, U0, X0, D1, U1, X0
preld 0, C0, 0x00
GMUL xvf, s, D2, U0, X1, D3, U1, X1
preld 0, C1, 0x00
GMUL xvf, s, D4, U0, X2, D5, U1, X2
preld 0, C2, 0x00
GMUL xvf, s, D6, U0, X3, D7, U1, X3
preld 0, C3, 0x00
GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
GMUL xvf, s, D8, U0, X4, D9, U1, X4
preld 0, C4, 0x00
GMUL xvf, s, D10, U0, X5, D11, U1, X5
preld 0, C5, 0x00
GMUL xvf, s, D12, U0, X6, D13, U1, X6
preld 0, C6, 0x00
GMUL xvf, s, D14, U0, X7, D15, U1, X7
preld 0, C7, 0x00
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x20
.endm

.macro KERNEL1x16x8
GLD xv, , U0, A0, 0x00, U1, A0, 0x20

GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
D2, U0, X1, D2, D3, U1, X1, D3
preld 0, A0, A_PRE
GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \
D6, U0, X3, D6, D7, U1, X3, D7
GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \
D10, U0, X5, D10, D11, U1, X5, D11
//preld 0, B0, B_PRE
GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \
D14, U0, X7, D14, D15, U1, X7, D15
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x20
.endm
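
/* One KERNEL1x16x8 step in illustrative C (a rank-1 update of the 16x8
 * accumulator tile; the bracketed names are the registers used above):
 *
 *   // u[0..15]: one K-slice of packed A            [U0, U1]
 *   // d[j][i]:  accumulators                       [D0..D15]
 *   for (int j = 0; j < 8; j++) {
 *       float x = b0[j];                  // GLDREPL broadcast   [X0..X7]
 *       for (int i = 0; i < 16; i++)
 *           d[j][i] += u[i] * x;          // GMADD xvf, s, ...
 *   }
 *   a0 += 16; b0 += 8;                    // PTR_ADDI A0 / B0
 */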
.macro KERNEL8x16x8
.rept 8
KERNEL1x16x8
.endr
.endm

.macro SAVE16x8
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \
D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \
D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA
#else
/* Load C0 */
GLD xv, , X0, C0, 0x00, X1, C0, 0x20
GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
/* Load C1 */
GLD xv, , X2, C1, 0x00, X3, C1, 0x20
GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
/* Load C2 */
GLD xv, , X4, C2, 0x00, X5, C2, 0x20
GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5
/* Load C3 */
GLD xv, , X6, C3, 0x00, X7, C3, 0x20
GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7
/* Load C4 */
GLD xv, , X0, C4, 0x00, X1, C4, 0x20
GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1
/* Load C5 */
GLD xv, , X2, C5, 0x00, X3, C5, 0x20
GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3
/* Load C6 */
GLD xv, , X4, C6, 0x00, X5, C6, 0x20
GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5
/* Load C7 */
GLD xv, , X6, C7, 0x00, X7, C7, 0x20
GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7
#endif // #if defined(TRMMKERNEL)
GST xv, , D0, C0, 0x00, D1, C0, 0x20, \
D2, C1, 0x00, D3, C1, 0x20, \
D4, C2, 0x00, D5, C2, 0x20, \
D6, C3, 0x00, D7, C3, 0x20, \
D8, C4, 0x00, D9, C4, 0x20, \
D10, C5, 0x00, D11, C5, 0x20, \
D12, C6, 0x00, D13, C6, 0x20, \
D14, C7, 0x00, D15, C7, 0x20
#if __loongarch_grlen == 64
GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
#else
GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
#endif
.endm
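
/* SAVE16x8 in C terms (a sketch, not the literal macro): the GEMM path merges
 * the tile back into C as c = alpha * d + c, while the TRMMKERNEL path
 * overwrites it as c = alpha * d without reading C first:
 *
 *   for (int j = 0; j < 8; j++)
 *       for (int i = 0; i < 16; i++)
 *           c[j * ldc + i] = trmm ? alpha * d[j][i]                   // GMUL
 *                                 : alpha * d[j][i] + c[j * ldc + i]; // GMADD
 *   // afterwards C0..C7 each advance by 16 floats (0x40 bytes)
 */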
// m = 8, 4, 2, 1
// stride = 0x20, 0x10, 0x08, 0x04
.macro KERNEL1xMx8_START m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m == 2
GLD f, d, $f0, A0, 0x00
.elseif \m == 1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMUL xvf, s, D0, U0, X0, D2, U0, X1, \
D4, U0, X2, D6, U0, X3
GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
GMUL xvf, s, D8, U0, X4, D10, U0, X5, \
D12, U0, X6, D14, U0, X7
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x20
.endm

.macro KERNEL1xMx8 m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m == 2
GLD f, d, $f0, A0, 0x00
.elseif \m == 1
GLD f, s, $f0, A0, 0x00
.endif

GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \
D4, U0, X2, D4, D6, U0, X3, D6
GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \
D12, U0, X6, D12, D14, U0, X7, D14
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x20
.endm

.macro KERNEL8xMx8 m, stride
.rept 8
KERNEL1xMx8 \m, \stride
.endr
.endm
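
/* The Mx8 tail macros above dispatch on \m at assembly time: .if/.elseif picks
 * a 256-bit (xv), 128-bit (v), 64-bit (f, d) or 32-bit (f, s) load, and since
 * $vr0/$f0 alias the low lanes of $xr0 (U0) the arithmetic lines are shared.
 * Roughly, in illustrative C:
 *
 *   for (int j = 0; j < 8; j++)
 *       for (int i = 0; i < m; i++)    // m = 8, 4, 2 or 1; upper lanes of
 *           d[j][i] += u[i] * b0[j];   // the vector FMA are never stored
 *   a0 += m; b0 += 8;                  // stride = m * sizeof(float)
 */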
.macro SAVEMx8 m, stride
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \
D4, D4, VALPHA, D6, D6, VALPHA, \
D8, D8, VALPHA, D10, D10, VALPHA, \
D12, D12, VALPHA, D14, D14, VALPHA
#else
/* Load C0, C1, C2, C3, C4, C5, C6, C7 */
.if \m == 8
GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00
.elseif \m == 4
GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00
.elseif \m == 2
GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
.elseif \m == 1
GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
.endif
GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \
D4, D4, VALPHA, X4, D6, D6, VALPHA, X6
.if \m == 8
GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00
.elseif \m == 4
GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00
.elseif \m == 2
GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00
.elseif \m == 1
GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00
.endif
GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \
D12, D12, VALPHA, X4, D14, D14, VALPHA, X6
#endif // #if defined(TRMMKERNEL)
.if \m == 8
GST xv, , D0, C0, 0x00, D2, C1, 0x00, \
D4, C2, 0x00, D6, C3, 0x00, \
D8, C4, 0x00, D10, C5, 0x00, \
D12, C6, 0x00, D14, C7, 0x00
.elseif \m == 4
GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \
$vr14, C2, 0x00, $vr16, C3, 0x00, \
$vr18, C4, 0x00, $vr20, C5, 0x00, \
$vr22, C6, 0x00, $vr24, C7, 0x00
.elseif \m == 2
GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \
$f14, C2, 0x00, $f16, C3, 0x00, \
$f18, C4, 0x00, $f20, C5, 0x00, \
$f22, C6, 0x00, $f24, C7, 0x00
.elseif \m == 1
GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \
$f14, C2, 0x00, $f16, C3, 0x00, \
$f18, C4, 0x00, $f20, C5, 0x00, \
$f22, C6, 0x00, $f24, C7, 0x00
.endif
#if __loongarch_grlen == 64
GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
#else
GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
#endif
.endm
.macro KERNEL1x16x4_START
GLD xv, , U0, A0, 0x00, U1, A0, 0x20

GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMUL xvf, s, D0, U0, X0, D1, U1, X0, \
D2, U0, X1, D3, U1, X1, \
D4, U0, X2, D5, U1, X2, \
D6, U0, X3, D7, U1, X3
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x10
.endm

.macro KERNEL1x16x4
GLD xv, , U0, A0, 0x00, U1, A0, 0x20

GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
D2, U0, X1, D2, D3, U1, X1, D3, \
D4, U0, X2, D4, D5, U1, X2, D5, \
D6, U0, X3, D6, D7, U1, X3, D7
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x10
.endm

.macro KERNEL8x16x4
.rept 8
KERNEL1x16x4
.endr
.endm

.macro SAVE16x4
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA
#else
/* Load C0 */
GLD xv, , X0, C0, 0x00, X1, C0, 0x20
GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
/* Load C1 */
GLD xv, , X2, C1, 0x00, X3, C1, 0x20
GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
/* Load C2 */
GLD xv, , X4, C2, 0x00, X5, C2, 0x20
GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5
/* Load C3 */
GLD xv, , X6, C3, 0x00, X7, C3, 0x20
GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7
#endif // #if defined(TRMMKERNEL)
GST xv, , D0, C0, 0x00, D1, C0, 0x20, \
D2, C1, 0x00, D3, C1, 0x20, \
D4, C2, 0x00, D5, C2, 0x20, \
D6, C3, 0x00, D7, C3, 0x20
#if __loongarch_grlen == 64
GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
#else
GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
#endif
.endm

// m = 8, 4, 2, 1
// stride = 0x20, 0x10, 0x08, 0x04
.macro KERNEL1xMx4_START m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m == 2
GLD f, d, $f0, A0, 0x00
.elseif \m == 1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMUL xvf, s, D0, U0, X0, D2, U0, X1, \
D4, U0, X2, D6, U0, X3
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x10
.endm

.macro KERNEL1xMx4 m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m == 2
GLD f, d, $f0, A0, 0x00
.elseif \m == 1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \
D4, U0, X2, D4, D6, U0, X3, D6
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x10
.endm

.macro KERNEL8xMx4 m, stride
.rept 8
KERNEL1xMx4 \m, \stride
.endr
.endm

.macro SAVEMx4 m, stride
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \
D4, D4, VALPHA, D6, D6, VALPHA
#else
/* Load C0, C1, C2, C3 */
.if \m == 8
GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00
.elseif \m == 4
GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00
.elseif \m == 2
GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
.elseif \m == 1
GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
.endif
GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \
D4, D4, VALPHA, X4, D6, D6, VALPHA, X6
#endif // #if defined(TRMMKERNEL)
.if \m == 8
GST xv, , D0, C0, 0x00, D2, C1, 0x00, \
D4, C2, 0x00, D6, C3, 0x00
.elseif \m == 4
GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \
$vr14, C2, 0x00, $vr16, C3, 0x00
.elseif \m == 2
GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \
$f14, C2, 0x00, $f16, C3, 0x00
.elseif \m == 1
GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \
$f14, C2, 0x00, $f16, C3, 0x00
.endif
#if __loongarch_grlen == 64
GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
#else
GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
#endif
.endm

.macro KERNEL1x16x2_START
GLD xv, , U0, A0, 0x00, U1, A0, 0x20

GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
GMUL xvf, s, D0, U0, X0, D1, U1, X0, \
D2, U0, X1, D3, U1, X1
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x08
.endm

.macro KERNEL1x16x2
GLD xv, , U0, A0, 0x00, U1, A0, 0x20

GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
D2, U0, X1, D2, D3, U1, X1, D3
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x08
.endm

.macro KERNEL8x16x2
.rept 8
KERNEL1x16x2
.endr
.endm

.macro SAVE16x2
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
#else
/* Load C0 */
GLD xv, , X0, C0, 0x00, X1, C0, 0x20
GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
/* Load C1 */
GLD xv, , X2, C1, 0x00, X3, C1, 0x20
GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
#endif // #if defined(TRMMKERNEL)
GST xv, , D0, C0, 0x00, D1, C0, 0x20, \
D2, C1, 0x00, D3, C1, 0x20
#if __loongarch_grlen == 64
GADDI , d, C0, C0, 0x40, C1, C1, 0x40
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, 0x40, C1, C1, 0x40
#else
GADDI , d, C0, C0, 0x40, C1, C1, 0x40
#endif
.endm

// m = 8, 4, 2, 1
// stride = 0x20, 0x10, 0x08, 0x04
.macro KERNEL1xMx2_START m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m == 2
GLD f, d, $f0, A0, 0x00
.elseif \m == 1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
GMUL xvf, s, D0, U0, X0, D2, U0, X1
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x08
.endm

.macro KERNEL1xMx2 m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m == 2
GLD f, d, $f0, A0, 0x00
.elseif \m == 1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x08
.endm

.macro KERNEL8xMx2 m, stride
.rept 8
KERNEL1xMx2 \m, \stride
.endr
.endm

.macro SAVEMx2 m, stride
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA
#else
/* Load C0, C1 */
.if \m == 8
GLD xv, , X0, C0, 0x00, X2, C1, 0x00
.elseif \m == 4
GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00
.elseif \m == 2
GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00
.elseif \m == 1
GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00
.endif
GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2
#endif // #if defined(TRMMKERNEL)
.if \m == 8
GST xv, , D0, C0, 0x00, D2, C1, 0x00
.elseif \m == 4
GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00
.elseif \m == 2
GST f, d, $f10, C0, 0x00, $f12, C1, 0x00
.elseif \m == 1
GST f, s, $f10, C0, 0x00, $f12, C1, 0x00
.endif
#if __loongarch_grlen == 64
GADDI , d, C0, C0, \stride, C1, C1, \stride
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, \stride, C1, C1, \stride
#else
GADDI , d, C0, C0, \stride, C1, C1, \stride
#endif
.endm

.macro KERNEL1x16x1_START
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00
GMUL xvf, s, D0, U0, X0, D1, U1, X0
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x04
.endm

.macro KERNEL1x16x1
GLD xv, , U0, A0, 0x00, U1, A0, 0x20
GLDREPL xv, w, X0, B0, 0x00
GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x04
.endm

.macro KERNEL8x16x1
.rept 8
KERNEL1x16x1
.endr
.endm

.macro SAVE16x1
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA
#else
/* Load C0 */
GLD xv, , X0, C0, 0x00, X1, C0, 0x20
GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
#endif // #if defined(TRMMKERNEL)
GST xv, , D0, C0, 0x00, D1, C0, 0x20
#if __loongarch_grlen == 64
GADDI , d, C0, C0, 0x40
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, 0x40
#else
GADDI , d, C0, C0, 0x40
#endif
.endm

// m = 8, 4, 2, 1
// stride = 0x20, 0x10, 0x08, 0x04
.macro KERNEL1xMx1_START m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m == 2
GLD f, d, $f0, A0, 0x00
.elseif \m == 1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00
GMUL xvf, s, D0, U0, X0
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x04
.endm

.macro KERNEL1xMx1 m, stride
.if \m == 8
GLD xv, , U0, A0, 0x00
.elseif \m == 4
GLD v, , $vr0, A0, 0x00
.elseif \m == 2
GLD f, d, $f0, A0, 0x00
.elseif \m == 1
GLD f, s, $f0, A0, 0x00
.endif
GLDREPL xv, w, X0, B0, 0x00
GMADD xvf, s, D0, U0, X0, D0
PTR_ADDI A0, A0, \stride
PTR_ADDI B0, B0, 0x04
.endm

.macro KERNEL8xMx1 m, stride
.rept 8
KERNEL1xMx1 \m, \stride
.endr
.endm

.macro SAVEMx1 m, stride
#if defined(TRMMKERNEL)
GMUL xvf, s, D0, D0, VALPHA
#else
/* Load C0 */
.if \m == 8
GLD xv, , X0, C0, 0x00
.elseif \m == 4
GLD v, , $vr2, C0, 0x00
.elseif \m == 2
GLD f, d, $f2, C0, 0x00
.elseif \m == 1
GLD f, s, $f2, C0, 0x00
.endif
GMADD xvf, s, D0, D0, VALPHA, X0
#endif // #if defined(TRMMKERNEL)
.if \m == 8
GST xv, , D0, C0, 0x00
.elseif \m == 4
GST v, , $vr10, C0, 0x00
.elseif \m == 2
GST f, d, $f10, C0, 0x00
.elseif \m == 1
GST f, s, $f10, C0, 0x00
.endif
#if __loongarch_grlen == 64
GADDI , d, C0, C0, \stride
#elif __loongarch_grlen == 32
GADDI , w, C0, C0, \stride
#else
GADDI , d, C0, C0, \stride
#endif
.endm
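
/* TRMM bookkeeping used throughout the body below, as an assumption-level
 * sketch: OFF counts how many K iterations the current triangular panel
 * skips, so each tile runs a shortened K loop over pre-advanced panels:
 *
 *   off = (trmm && !left) ? -offset : 0;  // PTR_SUB OFF, ZERO, OFFSET / xor
 *   // per (mm x nn) tile:
 *   //   L  = K - off, or off + mm, or off + nn  (LEFT/TRANSA variants)
 *   //   A0 = A + off * mm;  B0 = B + off * nn;  (the PTR_SLLI/PTR_ADD pairs)
 *   //   after the tile, OFF advances by mm (LEFT) or nn (!LEFT)
 */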
PROLOGUE
push_if_used 26, 32
xvreplve0.w VALPHA, $xr0
#if defined (TRMMKERNEL) && !defined(LEFT)
PTR_SUB OFF, ZERO, OFFSET
#else
xor OFF, OFF, OFF
#endif
/* if (!(N >> 3)) goto L_N7 */
PTR_SRAI J, N, 3 /* J = bn >> 3 */
andi N, N, 0x07
beq ZERO, J, .L_N7
.L_N8: /* J-- */
move C0, C
move A0, A
PTR_SLLI T0, LDC, 2
PTR_ADDI J, J, -1 /* J-- */
#if __loongarch_grlen == 64
GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
C6, C5, T0, C7, C6, T0
#elif __loongarch_grlen == 32
GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
C6, C5, T0, C7, C6, T0
#else
GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
C6, C5, T0, C7, C6, T0
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
move OFF, OFFSET
#endif
/* if (!(M >> 4)) goto L_M8 */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_M8
.align 5
.L_M16: /* I-- */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x06
PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 16
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1x16x8_START
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M16_L7 */
beq ZERO, TL, .L_M16_L7
.align 5
.L_M16_TL1:
KERNEL8x16x8
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_M16_TL1
.L_M16_L7:
andi TL, L, 7
beq TL, ZERO, .L_M16_L0
.align 5
.L_M16_L71:
KERNEL1x16x8
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_M16_L71
.L_M16_L0:
SAVE16x8

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -16
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x06
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x10 /* number of values in A */
#endif
#endif // #if defined(TRMMKERNEL)

PTR_ADDI I, I, -1 /* I-- */
blt ZERO, I, .L_M16
.L_M8:
/* Done with M >= 16 blocks; handle the remaining M = 8/4/2/1 */
andi I, M, 15
beq ZERO, I, .L_M0

andi I, M, 8
beq ZERO, I, .L_M4
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x05
PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 8
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif // #if defined(TRMMKERNEL)
KERNEL1xMx8_START 8, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M8_L7 */
beq ZERO, TL, .L_M8_L7
.align 5
.L_M8_TL1:
KERNEL8xMx8 8, 0x20
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_M8_TL1
.L_M8_L7:
/* if (!(L & 7)) goto L_M8_L0 */
andi TL, L, 7
beq TL, ZERO, .L_M8_L0
.align 5
.L_M8_L71:
KERNEL1xMx8 8, 0x20
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_M8_L71
.L_M8_L0:
SAVEMx8 8, 0x20
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -8
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x05
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
/* number of values in A */
PTR_ADDI OFF, OFF, 0x08
#endif
#endif // #if defined(TRMMKERNEL)
.L_M4:
andi I, M, 4
beq ZERO, I, .L_M2
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x04
PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 4
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx8_START 4, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M4_L7 */
beq ZERO, TL, .L_M4_L7
.align 5
.L_M4_TL1:
KERNEL8xMx8 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_M4_TL1
.L_M4_L7:
/* if (!(L & 7)) goto L_M4_L0 */
andi TL, L, 7
beq TL, ZERO, .L_M4_L0
.L_M4_L71:
KERNEL1xMx8 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_M4_L71
.L_M4_L0:
SAVEMx8 4, 0x10
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -4
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x04
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
/* number of values in A */
PTR_ADDI OFF, OFF, 0x04
#endif
#endif // #if defined(TRMMKERNEL)
.L_M2:
andi I, M, 2
beq ZERO, I, .L_M1

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x03
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 2
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx8_START 2, 0x08

/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M2_L7 */
beq ZERO, TL, .L_M2_L7
.align 5
.L_M2_TL1:
KERNEL8xMx8 2, 0x08
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_M2_TL1
.L_M2_L7:
/* if (!(L & 7)) goto L_M2_L0 */
andi TL, L, 7
beq TL, ZERO, .L_M2_L0
.align 5
.L_M2_L71:
KERNEL1xMx8 2, 0x08
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_M2_L71
.L_M2_L0:
SAVEMx8 2, 0x08
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -2
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x03
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
/* number of values in A */
PTR_ADDI OFF, OFF, 0x02
#endif
#endif // #if defined(TRMMKERNEL)
.L_M1:
andi I, M, 1
beq ZERO, I, .L_M0

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x02
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x05
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 1
#else
/* number of values in B */
PTR_ADDI L, OFF, 8
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx8_START 1, 0x04
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M1_L7 */
beq ZERO, TL, .L_M1_L7
.align 5
.L_M1_TL1:
KERNEL8xMx8 1, 0x04
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_M1_TL1
.L_M1_L7:
/* if (!(L & 7)) goto L_M1_L0 */
andi TL, L, 7
beq TL, ZERO, .L_M1_L0
.align 5
.L_M1_L71:
KERNEL1xMx8 1, 0x04
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_M1_L71
.L_M1_L0:
SAVEMx8 1, 0x04
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
/* number of values in A */
PTR_ADDI L, L, -1
#else
/* number of values in B */
PTR_ADDI L, L, -8
#endif
PTR_SLLI T0, L, 0x02
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x05
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
/* number of values in A */
PTR_ADDI OFF, OFF, 0x01
#endif
#endif // #if defined(TRMMKERNEL)

.L_M0:
/* Add stride for B and C
* B += (K * 32)
* C += (LDC * 32)
*/
PTR_SLLI T0, K, 5
PTR_SLLI T1, LDC, 5
PTR_ADD B, B, T0
PTR_ADD C, C, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
PTR_ADDI OFF, OFF, 0x08 /* number of values in B */
#endif
blt ZERO, J, .L_N8
.L_N7:
andi J, N, 4
beq ZERO, J, .L_N3
.L_N4:
move C0, C
move A0, A
PTR_SLLI T0, LDC, 2
#if __loongarch_grlen == 64
GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
#elif __loongarch_grlen == 32
GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0
#else
GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
move OFF, OFFSET
#endif

/* if (!(M >> 4)) goto L_N4_M8 */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_N4_M8
.align 5
.L_N4_M16:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x06
PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */
PTR_SLLI T0, OFF, 0x04
PTR_ADD B0, B, T0 /* B0 += 4 * OFF */
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 16
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1x16x4_START

/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M16_L7 */
beq ZERO, TL, .L_N4_M16_L7
.align 5
.L_N4_M16_TL1: /* TL-- */
KERNEL8x16x4

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N4_M16_TL1
.L_N4_M16_L7:
/* if (!(L & 7)) goto L_N4_M16_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N4_M16_L0
.align 5
.L_N4_M16_L71:
KERNEL1x16x4
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N4_M16_L71
.L_N4_M16_L0:
SAVE16x4
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -16
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x06
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x10
#endif
#endif // #if defined(TRMMKERNEL)

PTR_ADDI I, I, -1 /* I-- */
blt ZERO, I, .L_N4_M16
.L_N4_M8:
/* Done with M >= 16 blocks; handle the remaining M = 8/4/2/1 */
andi I, M, 15
beq ZERO, I, .L_N4_M0

andi I, M, 8
beq ZERO, I, .L_N4_M4

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x05
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x04
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 8
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx4_START 8, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M8_L7 */
beq ZERO, TL, .L_N4_M8_L7
.align 5
.L_N4_M8_TL1: /* TL-- */
KERNEL8xMx4 8, 0x20

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N4_M8_TL1
.L_N4_M8_L7:
/* if (!(L & 7)) goto L_N4_M8_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N4_M8_L0
.align 5
.L_N4_M8_L71:
KERNEL1xMx4 8, 0x20
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N4_M8_L71
.L_N4_M8_L0:
SAVEMx4 8, 0x20
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -8
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x05
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x08
#endif
#endif // #if defined(TRMMKERNEL)
.L_N4_M4:
andi I, M, 4
beq ZERO, I, .L_N4_M2

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x04
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x04
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 4
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx4_START 4, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M4_L7 */
beq ZERO, TL, .L_N4_M4_L7
.align 5
.L_N4_M4_TL1: /* TL-- */
KERNEL8xMx4 4, 0x10

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N4_M4_TL1
.L_N4_M4_L7:
/* if (!(L & 7)) goto L_N4_M4_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N4_M4_L0
.align 5
.L_N4_M4_L71:
KERNEL1xMx4 4, 0x10

PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N4_M4_L71
.L_N4_M4_L0:
SAVEMx4 4, 0x10
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -4
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x04
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x04
#endif
#endif // #if defined(TRMMKERNEL)
.L_N4_M2:
andi I, M, 2
beq ZERO, I, .L_N4_M1

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x03
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x04
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 2
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx4_START 2, 0x08
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M2_L7 */
beq ZERO, TL, .L_N4_M2_L7
.align 5
.L_N4_M2_TL1: /* TL-- */
KERNEL8xMx4 2, 0x08

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N4_M2_TL1
.L_N4_M2_L7:
/* if (!(L & 7)) goto L_N4_M2_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N4_M2_L0
.align 5
.L_N4_M2_L71:
KERNEL1xMx4 2, 0x08
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N4_M2_L71
.L_N4_M2_L0:
SAVEMx4 2, 0x08

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -2
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x03
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x02
#endif
#endif // #if defined(TRMMKERNEL)
.L_N4_M1:
andi I, M, 1
beq ZERO, I, .L_N4_M0

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x02
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x04
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 1
#else
/* number of values in B */
PTR_ADDI L, OFF, 4
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx4_START 1, 0x04
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N4_M1_L7 */
beq ZERO, TL, .L_N4_M1_L7
.align 5
.L_N4_M1_TL1: /* TL-- */
KERNEL8xMx4 1, 0x04

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N4_M1_TL1
.L_N4_M1_L7:
/* if (!(L & 7)) goto L_N4_M1_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N4_M1_L0
.align 5
.L_N4_M1_L71:
KERNEL1xMx4 1, 0x04
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N4_M1_L71
.L_N4_M1_L0:
SAVEMx4 1, 0x04
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -1
#else
PTR_ADDI L, L, -4
#endif
PTR_SLLI T0, L, 0x02
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x04
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x01
#endif
#endif // #if defined(TRMMKERNEL)
.L_N4_M0:
/* Add stride for B and C
* B += 4 * K
* C += 4 * LDC
*/
PTR_SLLI T0, K, 4
PTR_SLLI T1, LDC, 4
PTR_ADD B, B, T0
PTR_ADD C, C, T1

#if defined(TRMMKERNEL) && !defined(LEFT)
PTR_ADDI OFF, OFF, 0x04
#endif
/* We must reinit I */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
.L_N3:
andi J, N, 2
beq ZERO, J, .L_N1

.L_N2:
move C0, C
move A0, A
PTR_SLLI T0, LDC, 2
PTR_ADD C1, C0, T0

#if defined(TRMMKERNEL) && defined(LEFT)
move OFF, OFFSET
#endif

/* if (!(M >> 4)) goto L_N2_M8 */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_N2_M8
.align 5
.L_N2_M16:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x06
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x03
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 16
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1x16x2_START

/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M16_L7 */
beq ZERO, TL, .L_N2_M16_L7
.align 5
.L_N2_M16_TL1: /* TL-- */
KERNEL8x16x2

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N2_M16_TL1
.L_N2_M16_L7:
/* if (!(L & 7)) goto L_N2_M16_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N2_M16_L0
.align 5
.L_N2_M16_L71:
KERNEL1x16x2
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N2_M16_L71
.L_N2_M16_L0:
SAVE16x2
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -16
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x06
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x03
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x10
#endif
#endif // #if defined(TRMMKERNEL)

PTR_ADDI I, I, -1 /* I-- */
blt ZERO, I, .L_N2_M16
.L_N2_M8:
/* Done with M >= 16 blocks; handle the remaining M = 8/4/2/1 */
andi I, M, 15
beq ZERO, I, .L_N2_M0

andi I, M, 8
beq ZERO, I, .L_N2_M4

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x05
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x03
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 8
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx2_START 8, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M8_L7 */
beq ZERO, TL, .L_N2_M8_L7
.align 5
.L_N2_M8_TL1: /* TL-- */
KERNEL8xMx2 8, 0x20
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N2_M8_TL1
.L_N2_M8_L7:
/* if (!(L & 7)) goto L_N2_M8_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N2_M8_L0
.align 5
.L_N2_M8_L71:
KERNEL1xMx2 8, 0x20
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N2_M8_L71
.L_N2_M8_L0:
SAVEMx2 8, 0x20
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -8
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x05
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x03
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x08
#endif
#endif // #if defined(TRMMKERNEL)
.L_N2_M4:
andi I, M, 4
beq ZERO, I, .L_N2_M2

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x04
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x03
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 4
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx2_START 4, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M4_L7 */
beq ZERO, TL, .L_N2_M4_L7
.align 5
.L_N2_M4_TL1: /* TL-- */
KERNEL8xMx2 4, 0x10
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N2_M4_TL1
.L_N2_M4_L7:
/* if (!(L & 7)) goto L_N2_M4_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N2_M4_L0
.align 5
.L_N2_M4_L71:
KERNEL1xMx2 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N2_M4_L71
.L_N2_M4_L0:
SAVEMx2 4, 0x10
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -4
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x04
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x03
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x04
#endif
#endif // #if defined(TRMMKERNEL)
.L_N2_M2:
andi I, M, 2
beq ZERO, I, .L_N2_M1

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x03
PTR_ADD A0, A0, T0
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 2
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx2_START 2, 0x08
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M2_L7 */
beq ZERO, TL, .L_N2_M2_L7
.align 5
.L_N2_M2_TL1: /* TL-- */
KERNEL8xMx2 2, 0x08
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N2_M2_TL1
.L_N2_M2_L7:
/* if (!(L & 7)) goto L_N2_M2_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N2_M2_L0
.align 5
.L_N2_M2_L71:
KERNEL1xMx2 2, 0x08
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N2_M2_L71
.L_N2_M2_L0:
SAVEMx2 2, 0x08
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -2
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x03
PTR_ADD A0, A0, T0
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x02
#endif
#endif // #if defined(TRMMKERNEL)
.L_N2_M1:
andi I, M, 1
beq ZERO, I, .L_N2_M0

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x02
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x03
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 1
#else
/* number of values in B */
PTR_ADDI L, OFF, 2
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx2_START 1, 0x04
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N2_M1_L7 */
beq ZERO, TL, .L_N2_M1_L7
.align 5
.L_N2_M1_TL1: /* TL-- */
KERNEL8xMx2 1, 0x04
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N2_M1_TL1
.L_N2_M1_L7:
/* if (!(L & 7)) goto L_N2_M1_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N2_M1_L0
.align 5
.L_N2_M1_L71:
KERNEL1xMx2 1, 0x04
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N2_M1_L71
.L_N2_M1_L0:
SAVEMx2 1, 0x04
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -1
#else
PTR_ADDI L, L, -2
#endif
PTR_SLLI T0, L, 0x02
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x03
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x01
#endif
#endif // #if defined(TRMMKERNEL)
.L_N2_M0:
/* Add stride for B and C
* B += 2 * K
* C += 2 * LDC
*/
PTR_SLLI T0, K, 3
PTR_SLLI T1, LDC, 3
PTR_ADD B, B, T0
PTR_ADD C, C, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
PTR_ADDI OFF, OFF, 0x02
#endif
/* We must reinit I */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
.L_N1:
andi J, N, 1
beq ZERO, J, .L_N0
move C0, C
move A0, A

#if defined(TRMMKERNEL) && defined(LEFT)
move OFF, OFFSET
#endif
/* if (!(M >> 4)) goto L_N1_M8 */
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_N1_M8
.L_N1_M16:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x06
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x02
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 16
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1x16x1_START
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M16_L7 */
beq ZERO, TL, .L_N1_M16_L7
.align 5
.L_N1_M16_TL1: /* TL-- */
KERNEL8x16x1
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N1_M16_TL1
.L_N1_M16_L7:
/* if (!(L & 7)) goto L_N1_M16_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N1_M16_L0
.align 5
.L_N1_M16_L71:
KERNEL1x16x1
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N1_M16_L71
.L_N1_M16_L0:
SAVE16x1
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -16
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x06
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x02
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x10
#endif
#endif // #if defined(TRMMKERNEL)

PTR_ADDI I, I, -1 /* I-- */
blt ZERO, I, .L_N1_M16
.L_N1_M8:
/* Done with M >= 16 blocks; handle the remaining M = 8/4/2/1 */
andi I, M, 15
beq ZERO, I, .L_N1_M0

andi I, M, 8
beq ZERO, I, .L_N1_M4
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x05
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x02
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 8
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx1_START 8, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M8_L7 */
beq ZERO, TL, .L_N1_M8_L7
.align 5
.L_N1_M8_TL1: /* TL-- */
KERNEL8xMx1 8, 0x20
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N1_M8_TL1
.L_N1_M8_L7:
/* if (!(L & 7)) goto L_N1_M8_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N1_M8_L0
.align 5
.L_N1_M8_L71:
KERNEL1xMx1 8, 0x20
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N1_M8_L71
.L_N1_M8_L0:
SAVEMx1 8, 0x20
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -8
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x05
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x02
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x08
#endif
#endif // #if defined(TRMMKERNEL)
.L_N1_M4:
andi I, M, 4
beq ZERO, I, .L_N1_M2

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x04
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x02
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 4
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx1_START 4, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M4_L7 */
beq ZERO, TL, .L_N1_M4_L7
.align 5
.L_N1_M4_TL1: /* TL-- */
KERNEL8xMx1 4, 0x10

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N1_M4_TL1
.L_N1_M4_L7:
/* if (!(L & 7)) goto L_N1_M4_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N1_M4_L0
.align 5
.L_N1_M4_L71:
KERNEL1xMx1 4, 0x10
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N1_M4_L71
.L_N1_M4_L0:
SAVEMx1 4, 0x10
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -4
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x04
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x02
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x04
#endif
#endif // #if defined(TRMMKERNEL)
.L_N1_M2:
andi I, M, 2
beq ZERO, I, .L_N1_M1
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x03
PTR_ADD A0, A0, T0
PTR_SLLI T0, OFF, 0x02
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 2
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx1_START 2, 0x08
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M2_L7 */
beq ZERO, TL, .L_N1_M2_L7
.align 5
.L_N1_M2_TL1: /* TL-- */
KERNEL8xMx1 2, 0x08

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N1_M2_TL1
.L_N1_M2_L7:
/* if (!(L & 7)) goto L_N1_M2_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N1_M2_L0
.align 5
.L_N1_M2_L71:
KERNEL1xMx1 2, 0x08
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N1_M2_L71
.L_N1_M2_L0:
SAVEMx1 2, 0x08
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -2
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x03
PTR_ADD A0, A0, T0
PTR_SLLI T0, L, 0x02
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x02
#endif
#endif // #if defined(TRMMKERNEL)

.L_N1_M1:
andi I, M, 1
beq ZERO, I, .L_N1_M0

#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B0, B
#else
PTR_SLLI T0, OFF, 0x02
PTR_ADD A0, A0, T0
PTR_ADD B0, B, T0
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PTR_SUB L, K, OFF
#elif defined(LEFT)
/* number of values in A */
PTR_ADDI L, OFF, 1
#else
/* number of values in B */
PTR_ADDI L, OFF, 1
#endif
#else // #if !defined(TRMMKERNEL)
move B0, B
move L, K /* L = bk */
#endif
KERNEL1xMx1_START 1, 0x04
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M1_L7 */
beq ZERO, TL, .L_N1_M1_L7
.align 5
.L_N1_M1_TL1: /* TL-- */
KERNEL8xMx1 1, 0x04

PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO, TL, .L_N1_M1_TL1
.L_N1_M1_L7:
/* if (!(L & 7)) goto L_N1_M1_L0 */
andi TL, L, 7
beq TL, ZERO, .L_N1_M1_L0
.align 5
.L_N1_M1_L71:
KERNEL1xMx1 1, 0x04
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_N1_M1_L71
.L_N1_M1_L0:
SAVEMx1 1, 0x04
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
PTR_SUB L, K, OFF
#ifdef LEFT
PTR_ADDI L, L, -1
#else
PTR_ADDI L, L, -1
#endif
PTR_SLLI T0, L, 0x02
PTR_ADD A0, A0, T0
PTR_ADD B0, B0, T0
#endif

#ifdef LEFT
PTR_ADDI OFF, OFF, 0x01
#endif
#endif // #if defined(TRMMKERNEL)
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
jirl $r0, $r1, 0x0
EPILOGUE