/* OpenBLAS/kernel/loongarch64/zgemm_kernel.S
   (file-listing header: 1048 lines, 24 KiB, ArmAsm) */

/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* Defined ahead of the include; presumably selects the assembler-side
   declarations in common.h (that header is not shown here).  The LD, ST,
   MOV, MTC, ADD, MUL, MADD, NMSUB, SDARG, LDARG, SIZE, ZBASE_SHIFT,
   PROLOGUE and EPILOGUE names used below are not defined in this file and
   are expected to come from it. */
#define ASSEMBLER
#include "common.h"
/* Integer inputs, read below without being initialised in this file:
   loop extents M, N, K, the A, B and C pointers, and the stride LDC. */
#define M $r4
#define N $r5
#define K $r6
#define A $r7
#define B $r8
#define C $r9
#define LDC $r10
/* Working pointers that walk through A and B inside the inner loops. */
#define AO $r12
#define BO $r13
/* Loop counters: I counts down from M, J counts column groups taken from N,
   L counts steps along K. */
#define I $r17
#define J $r18
#define L $r25
/* Pointers to the (up to four) C columns updated by the current pass. */
#define CO1 $r14
#define CO2 $r15
#define CO3 $r23
#define CO4 $r24
/* TRMM-only registers: OFFSET is an input, KK is the running offset derived
   from it, TEMP is scratch for trip counts and pointer adjustments. */
#if defined(TRMMKERNEL)
#define OFFSET $r11
#define KK $r26
#define TEMP $r27
#endif
/* Floating-point values loaded through AO. */
#define a1 $f22
#define a2 $f8
#define a3 $f28
#define a4 $f29
/* Floating-point values loaded through B/BO; the write-back code also reuses
   b1-b8 for the values read from and stored to C. */
#define b1 $f23
#define b2 $f9
#define b3 $f10
#define b4 $f11
#define b5 $f12
#define b6 $f13
#define b7 $f14
#define b8 $f15
/* a5 aliases b8; it is not referenced by the code below. */
#define a5 b8
/* Accumulators.  c11/c12/c21/c22 feed the element stored through CO1,
   c31..c42 the one stored through CO2, c51..c62 CO3 and c71..c82 CO4.
   In the write-back, cN1 += c(N+1)2 and cN2 += c(N+1)1 before scaling. */
#define c11 $f16
#define c12 $f17
#define c21 $f3
#define c22 $f4
#define c31 $f2
#define c32 $f5
#define c41 $f6
#define c42 $f7
#define c51 $f18
#define c52 $f19
#define c61 $f20
#define c62 $f21
#define c71 $f24
#define c72 $f25
#define c81 $f26
#define c82 $f27
/* Scaling factors, read on entry from $f0/$f1 and used only in write-back. */
#define ALPHA_R $f0
#define ALPHA_I $f1
/* MADD1..MADD4 choose MADD or NMSUB for the four partial products that are
   accumulated per C element:
     MADD1: even B value * a1    MADD3: odd B value * a1
     MADD2: even B value * a2    MADD4: odd B value * a2
   The two-letter variant macros (NN ... CC) are not defined in this file;
   presumably the build defines exactly one of them per compiled variant and
   the N/T versus R/C letters distinguish plain from conjugated operands -
   TODO confirm against the build rules. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 MADD
#define MADD4 NMSUB
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 NMSUB
#define MADD4 MADD
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define MADD1 MADD
#define MADD2 NMSUB
#define MADD3 MADD
#define MADD4 MADD
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define MADD1 MADD
#define MADD2 NMSUB
#define MADD3 NMSUB
#define MADD4 NMSUB
#endif
/* Kernel entry.  Saves the registers this routine preserves, scales LDC,
   initialises the TRMM offset where needed and enters the four-column pass
   (or skips it when N / 4 is zero). */
PROLOGUE
/* Reserve a 128-byte frame; every slot used below lies inside it. */
addi.d $sp, $sp, -128
/* Integer registers used as CO3 ($r23), CO4 ($r24) and L ($r25). */
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 64
/* Floating-point registers used as c71, c72, c81, c82, a3 and a4. */
fst.d $f24, $sp, 16
fst.d $f25, $sp, 24
fst.d $f26, $sp, 32
fst.d $f27, $sp, 40
fst.d $f28, $sp, 48
fst.d $f29, $sp, 56
#if defined(TRMMKERNEL)
/* KK ($r26) and TEMP ($r27) exist only in the TRMM build. */
SDARG $r26, $sp, 72
SDARG $r27, $sp, 80
#endif
#ifndef __64BIT__
/* c51, c52, c61, c62 are saved only when __64BIT__ is not defined.
   NOTE(review): __64BIT__ is not defined in this file; confirm whether this
   branch is ever active for this target. */
fst.d $f18, $sp, 88
fst.d $f19, $sp, 96
fst.d $f20, $sp, 104
fst.d $f21, $sp, 112
#endif
/* Scale LDC by ZBASE_SHIFT so it can be added directly to the C pointers
   (presumably converting an element count into a byte stride). */
slli.d LDC, LDC, ZBASE_SHIFT
#if defined(TRMMKERNEL) && !defined(LEFT)
/* Non-LEFT TRMM starts with KK = -OFFSET; KK grows after each column pass. */
sub.d KK, $r0, OFFSET
#endif
/* J = N / 4: number of four-column passes. */
srai.d J, N, 2
/* NOTE(review): this nop has no visible effect; it looks like a leftover
   from a port of code written for a delay-slot architecture - confirm. */
nop
/* No full group of four columns: go straight to the two-column pass. */
bge $r0, J, .L20
/* ---- Four-column pass ------------------------------------------------
   .L10 runs once per group of four C columns (J times); .L11 runs once per
   value of I (M times) inside it.  Each .L11 iteration accumulates sixteen
   partial sums (c11..c82) along K and writes one element pair to each of
   CO1..CO4. */
.L10:
/* CO1..CO4 address four consecutive C columns, LDC apart. */
move CO1, C
/* c11 is cleared from the zero register and then copied into the other
   accumulators. */
MTC c11, $r0
add.d CO2, C, LDC
/* Every column group restarts at the beginning of A. */
move AO, A
add.d CO3, CO2, LDC
addi.d J, J, -1
add.d CO4, CO3, LDC
MOV c21, c11
MOV c31, c11
#if defined(TRMMKERNEL) && defined(LEFT)
/* LEFT TRMM: the running offset restarts from OFFSET for each group. */
move KK, OFFSET
#endif
MOV c41, c11
MOV c51, c11
move I, M
/* C now points at the first column of the next group. */
add.d C, CO4, LDC
MOV c61, c11
/* Nothing to do for this group when M <= 0. */
bge $r0, I, .L19
.L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* These variants start at the beginning of the current B panel. */
move BO, B
#else
/* Other variants skip KK steps first: AO advances by KK << ZBASE_SHIFT and
   BO starts at B + (KK << (2 + ZBASE_SHIFT)), i.e. four times as far. */
slli.d L, KK, ZBASE_SHIFT
slli.d TEMP, KK, 2 + ZBASE_SHIFT
add.d AO, AO, L
add.d BO, B, TEMP
#endif
/* Preload the first A and B values and clear the remaining accumulators.
   b5, b6, b7 and a3 are loaded from offsets 4, 8, 12 (B) and 4 (A), i.e.
   ahead of the values consumed first, as the unrolled loop expects. */
LD a1, AO, 0 * SIZE
MOV c71, c11
LD b1, BO, 0 * SIZE
MOV c81, c11
LD a3, AO, 4 * SIZE
MOV c12, c11
LD b2, BO, 1 * SIZE
MOV c22, c11
MOV c32, c11
LD b3, BO, 2 * SIZE
MOV c42, c11
LD b4, BO, 3 * SIZE
MOV c52, c11
LD b5, BO, 4 * SIZE
MOV c62, c11
LD b6, BO, 8 * SIZE
MOV c72, c11
LD b7, BO, 12 * SIZE
MOV c82, c11
/* TEMP = number of K steps to run for this tile in the TRMM build. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub.d TEMP, K, KK
#elif defined(LEFT)
addi.d TEMP, KK, 1
#else
addi.d TEMP, KK, 4
#endif
/* L = TEMP / 4 unrolled iterations; none means only the remainder loop. */
srai.d L, TEMP, 2
bge $r0, L, .L15
#else
/* Plain GEMM: same preload as the TRMM path, but reading through B directly
   and running the full K range (L = K / 4).  BO is set to B at the end. */
LD a1, AO, 0 * SIZE
MOV c71, c11
LD b1, B, 0 * SIZE
MOV c81, c11
LD a3, AO, 4 * SIZE
MOV c12, c11
LD b2, B, 1 * SIZE
MOV c22, c11
srai.d L, K, 2
MOV c32, c11
LD b3, B, 2 * SIZE
MOV c42, c11
LD b4, B, 3 * SIZE
MOV c52, c11
LD b5, B, 4 * SIZE
MOV c62, c11
LD b6, B, 8 * SIZE
MOV c72, c11
LD b7, B, 12 * SIZE
MOV c82, c11
move BO, B
bge $r0, L, .L15
#endif
/* Peeled start of the pipeline: the first four products of step 0 are
   issued here; .L12 and .L13 both begin with the rest of step 0. */
MADD1 c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD3 c21, b2, a1, c21
addi.d L, L, -1
MADD1 c31, b3, a1, c31
MADD3 c41, b4, a1, c41
/* Only one unrolled iteration: run the non-looping copy at .L13. */
bge $r0, L, .L13
.align 3
/* Main unrolled loop: four K steps per iteration.  AO advances by
   8 * SIZE and BO by 32 * SIZE per iteration; loads are interleaved with
   the multiply-adds and run ahead of the values being consumed, so the
   statement order must not be changed. */
.L12:
/* Step 0 (continued): a1/a2 = AO[0], AO[1]; B offsets 0..7. */
MADD2 c12, b1, a2, c12
LD b1, BO, 16 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 7 * SIZE
MADD1 c51, b5, a1, c51
MADD3 c61, b2, a1, c61
LD a4, AO, 2 * SIZE
MADD1 c71, b3, a1, c71
MADD3 c81, b4, a1, c81
LD a1, AO, 8 * SIZE
MADD2 c52, b5, a2, c52
LD b5, BO, 20 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 9 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 10 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 11 * SIZE
/* Step 1: a4/a2 = AO[2], AO[3]; B offsets 8..15. */
MADD1 c11, b6, a4, c11
LD a2, AO, 3 * SIZE
MADD3 c21, b2, a4, c21
MADD1 c31, b3, a4, c31
MADD3 c41, b4, a4, c41
MADD2 c12, b6, a2, c12
LD b6, BO, 24 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 13 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 14 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 15 * SIZE
MADD1 c51, b7, a4, c51
MADD3 c61, b2, a4, c61
MADD1 c71, b3, a4, c71
MADD3 c81, b4, a4, c81
MADD2 c52, b7, a2, c52
LD b7, BO, 28 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 17 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 18 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 19 * SIZE
/* Step 2: a3/a2 = AO[4], AO[5]; B offsets 16..23. */
MADD1 c11, b1, a3, c11
LD a2, AO, 5 * SIZE
MADD3 c21, b2, a3, c21
MADD1 c31, b3, a3, c31
MADD3 c41, b4, a3, c41
MADD2 c12, b1, a2, c12
LD b1, BO, 32 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 21 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 22 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 23 * SIZE
MADD1 c51, b5, a3, c51
MADD3 c61, b2, a3, c61
LD a4, AO, 6 * SIZE
MADD1 c71, b3, a3, c71
MADD3 c81, b4, a3, c81
LD a3, AO, 12 * SIZE
MADD2 c52, b5, a2, c52
LD b5, BO, 36 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 25 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 26 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 27 * SIZE
/* Step 3: a4/a2 = AO[6], AO[7]; B offsets 24..31. */
MADD1 c11, b6, a4, c11
LD a2, AO, 7 * SIZE
MADD3 c21, b2, a4, c21
MADD1 c31, b3, a4, c31
MADD3 c41, b4, a4, c41
addi.d L, L, -1
MADD2 c12, b6, a2, c12
LD b6, BO, 40 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 29 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 30 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 31 * SIZE
MADD1 c51, b7, a4, c51
/* Pointers move on here; the loads after this point use offsets relative
   to the new AO/BO and belong to the next group of four steps. */
addi.d BO, BO, 32 * SIZE
MADD3 c61, b2, a4, c61
addi.d AO, AO, 8 * SIZE
MADD1 c71, b3, a4, c71
MADD3 c81, b4, a4, c81
MADD2 c52, b7, a2, c52
LD b7, BO, 12 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 1 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 2 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 3 * SIZE
/* First four products of the next step 0, mirroring the peeled start. */
MADD1 c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD3 c21, b2, a1, c21
MADD1 c31, b3, a1, c31
MADD3 c41, b4, a1, c41
blt $r0, L, .L12
.align 3
/* Final unrolled iteration: the same instruction sequence as .L12 without
   the L decrement and without starting a following step 0.  On exit a1 and
   b1..b4 hold the values at the new AO/BO, ready for .L16. */
.L13:
MADD2 c12, b1, a2, c12
LD b1, BO, 16 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 7 * SIZE
MADD1 c51, b5, a1, c51
MADD3 c61, b2, a1, c61
LD a4, AO, 2 * SIZE
MADD1 c71, b3, a1, c71
MADD3 c81, b4, a1, c81
LD a1, AO, 8 * SIZE
MADD2 c52, b5, a2, c52
LD b5, BO, 20 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 9 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 10 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 11 * SIZE
MADD1 c11, b6, a4, c11
LD a2, AO, 3 * SIZE
MADD3 c21, b2, a4, c21
MADD1 c31, b3, a4, c31
MADD3 c41, b4, a4, c41
MADD2 c12, b6, a2, c12
LD b6, BO, 24 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 13 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 14 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 15 * SIZE
MADD1 c51, b7, a4, c51
MADD3 c61, b2, a4, c61
MADD1 c71, b3, a4, c71
MADD3 c81, b4, a4, c81
MADD2 c52, b7, a2, c52
LD b7, BO, 28 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 17 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 18 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 19 * SIZE
MADD1 c11, b1, a3, c11
LD a2, AO, 5 * SIZE
MADD3 c21, b2, a3, c21
MADD1 c31, b3, a3, c31
MADD3 c41, b4, a3, c41
MADD2 c12, b1, a2, c12
LD b1, BO, 32 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 21 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 22 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 23 * SIZE
MADD1 c51, b5, a3, c51
MADD3 c61, b2, a3, c61
LD a4, AO, 6 * SIZE
MADD1 c71, b3, a3, c71
MADD3 c81, b4, a3, c81
LD a3, AO, 12 * SIZE
MADD2 c52, b5, a2, c52
LD b5, BO, 36 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 25 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 26 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 27 * SIZE
MADD1 c11, b6, a4, c11
LD a2, AO, 7 * SIZE
MADD3 c21, b2, a4, c21
MADD1 c31, b3, a4, c31
MADD3 c41, b4, a4, c41
MADD2 c12, b6, a2, c12
LD b6, BO, 40 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 29 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 30 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 31 * SIZE
MADD1 c51, b7, a4, c51
addi.d BO, BO, 32 * SIZE
MADD3 c61, b2, a4, c61
addi.d AO, AO, 8 * SIZE
MADD1 c71, b3, a4, c71
MADD3 c81, b4, a4, c81
MADD2 c52, b7, a2, c52
LD b7, BO, 12 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 1 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 2 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 3 * SIZE
.align 3
/* Remainder: the low two bits of the step count (0..3 single steps). */
.L15:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
bge $r0, L, .L18
.align 3
/* One K step per iteration: AO advances by 2 * SIZE, BO by 8 * SIZE, and
   a1, b1..b5 are reloaded for the following step. */
.L16:
MADD1 c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD3 c21, b2, a1, c21
MADD1 c31, b3, a1, c31
MADD3 c41, b4, a1, c41
MADD2 c12, b1, a2, c12
LD b1, BO, 8 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 7 * SIZE
MADD1 c51, b5, a1, c51
addi.d L, L, -1
MADD3 c61, b2, a1, c61
addi.d AO, AO, 2 * SIZE
MADD1 c71, b3, a1, c71
addi.d BO, BO, 8 * SIZE
MADD3 c81, b4, a1, c81
LD a1, AO, 0 * SIZE
MADD2 c52, b5, a2, c52
LD b5, BO, 4 * SIZE
MADD4 c62, b2, a2, c62
LD b2, BO, 1 * SIZE
MADD2 c72, b3, a2, c72
LD b3, BO, 2 * SIZE
MADD4 c82, b4, a2, c82
LD b4, BO, 3 * SIZE
blt $r0, L, .L16
/* Write-back for the four columns.  The partial sums are first combined
   (c11 += c22, c12 += c21, and likewise for the other columns), then scaled
   by ALPHA_R/ALPHA_I.  Assuming MADD d,x,y,z yields x*y+z and NMSUB yields
   z-x*y (macros from common.h, not shown - TODO confirm), each stored pair
   is  first = c?1*ALPHA_R - c?2*ALPHA_I,  second = c?2*ALPHA_R + c?1*ALPHA_I,
   added to the existing C values in the GEMM build.  The accumulators
   c11..c61 are cleared again on the way for the next .L11 iteration. */
.L18:
#ifndef TRMMKERNEL
LD b1, CO1, 0 * SIZE
ADD c11, c11, c22
LD b2, CO1, 1 * SIZE
ADD c12, c12, c21
LD b3, CO2, 0 * SIZE
ADD c31, c31, c42
LD b4, CO2, 1 * SIZE
ADD c32, c32, c41
LD b5, CO3, 0 * SIZE
ADD c51, c51, c62
LD b6, CO3, 1 * SIZE
ADD c52, c52, c61
LD b7, CO4, 0 * SIZE
ADD c71, c71, c82
LD b8, CO4, 1 * SIZE
ADD c72, c72, c81
MADD b1, c11, ALPHA_R, b1
addi.d CO1,CO1, 2 * SIZE
MADD b2, c12, ALPHA_R, b2
addi.d CO2,CO2, 2 * SIZE
MADD b3, c31, ALPHA_R, b3
addi.d CO3,CO3, 2 * SIZE
MADD b4, c32, ALPHA_R, b4
addi.d CO4,CO4, 2 * SIZE
MADD b5, c51, ALPHA_R, b5
addi.d I, I, -1
MADD b6, c52, ALPHA_R, b6
MADD b7, c71, ALPHA_R, b7
MADD b8, c72, ALPHA_R, b8
NMSUB b1, c12, ALPHA_I, b1
MADD b2, c11, ALPHA_I, b2
MTC c11, $r0
NMSUB b3, c32, ALPHA_I, b3
MADD b4, c31, ALPHA_I, b4
/* The CO pointers were already advanced, hence the negative offsets. */
ST b1, CO1, -2 * SIZE
NMSUB b5, c52, ALPHA_I, b5
ST b2, CO1, -1 * SIZE
MADD b6, c51, ALPHA_I, b6
ST b3, CO2, -2 * SIZE
NMSUB b7, c72, ALPHA_I, b7
ST b4, CO2, -1 * SIZE
MADD b8, c71, ALPHA_I, b8
ST b5, CO3, -2 * SIZE
MOV c21, c11
ST b6, CO3, -1 * SIZE
MOV c31, c11
ST b7, CO4, -2 * SIZE
MOV c41, c11
ST b8, CO4, -1 * SIZE
MOV c51, c11
#else
/* TRMM build: C is not read; the scaled result overwrites it (MUL instead
   of the first MADD). */
ADD c11, c11, c22
addi.d CO1,CO1, 2 * SIZE
ADD c12, c12, c21
addi.d CO2,CO2, 2 * SIZE
ADD c31, c31, c42
addi.d CO3,CO3, 2 * SIZE
ADD c32, c32, c41
addi.d CO4,CO4, 2 * SIZE
ADD c51, c51, c62
addi.d I, I, -1
ADD c52, c52, c61
ADD c71, c71, c82
ADD c72, c72, c81
MUL b1, ALPHA_R, c11
MUL b2, ALPHA_R, c12
MUL b3, ALPHA_R, c31
MUL b4, ALPHA_R, c32
MUL b5, ALPHA_R, c51
MUL b6, ALPHA_R, c52
MUL b7, ALPHA_R, c71
MUL b8, ALPHA_R, c72
NMSUB b1, c12, ALPHA_I, b1
MADD b2, c11, ALPHA_I, b2
MTC c11, $r0
NMSUB b3, c32, ALPHA_I, b3
MADD b4, c31, ALPHA_I, b4
ST b1, CO1, -2 * SIZE
NMSUB b5, c52, ALPHA_I, b5
ST b2, CO1, -1 * SIZE
MADD b6, c51, ALPHA_I, b6
ST b3, CO2, -2 * SIZE
NMSUB b7, c72, ALPHA_I, b7
ST b4, CO2, -1 * SIZE
MADD b8, c71, ALPHA_I, b8
ST b5, CO3, -2 * SIZE
MOV c21, c11
ST b6, CO3, -1 * SIZE
MOV c31, c11
ST b7, CO4, -2 * SIZE
MOV c41, c11
ST b8, CO4, -1 * SIZE
MOV c51, c11
/* For these variants AO and BO are moved further by (K - KK - 1) steps
   (LEFT) or (K - KK - 4) steps (otherwise); presumably this skips the part
   of the packed data that this tile did not consume - TODO confirm. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub.d TEMP, K, KK
#ifdef LEFT
addi.d TEMP, TEMP, -1
#else
addi.d TEMP, TEMP, -4
#endif
slli.d L, TEMP, ZBASE_SHIFT
slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
add.d AO, AO, L
add.d BO, BO, TEMP
#endif
#ifdef LEFT
/* LEFT TRMM: the offset grows by one per .L11 iteration. */
addi.d KK, KK, 1
#endif
#endif
MOV c61, c11
blt $r0, I, .L11
.align 3
/* End of one four-column group. */
.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
/* Non-LEFT TRMM: the offset grows by the four columns just processed. */
addi.d KK, KK, 4
#endif
/* B continues from where BO stopped, i.e. at the next group's data. */
move B, BO
blt $r0, J, .L10
.align 3
/* ---- Two-column pass ---------------------------------------------------
   Runs once when bit 1 of N is set.  Same structure as the four-column
   pass with eight accumulators (c11..c42) and two C columns (CO1, CO2). */
.L20:
andi J, N, 2
MTC c11, $r0
move CO1, C
/* Bit 1 of N clear: skip to the single-column pass. */
bge $r0, J, .L30
add.d CO2, C, LDC
/* C now points past the two columns handled here. */
add.d C, CO2, LDC
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
move I, M
move AO, A
bge $r0, I, .L29
.align 3
/* One iteration per value of I. */
.L21:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
/* Skip KK steps: AO by KK << ZBASE_SHIFT, BO by twice that from B. */
slli.d L, KK, ZBASE_SHIFT
slli.d TEMP, KK, 1 + ZBASE_SHIFT
add.d AO, AO, L
add.d BO, B, TEMP
#endif
/* Preload the first values and clear the accumulators other than c11
   (c11 is already zero here). */
LD a1, AO, 0 * SIZE
MOV c21, c11
LD b1, BO, 0 * SIZE
MOV c31, c11
LD a3, AO, 4 * SIZE
MOV c41, c11
LD b2, BO, 1 * SIZE
LD b3, BO, 2 * SIZE
MOV c12, c11
LD b4, BO, 3 * SIZE
MOV c22, c11
LD b5, BO, 4 * SIZE
MOV c32, c11
/* TEMP = number of K steps to run for this tile in the TRMM build. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub.d TEMP, K, KK
#elif defined(LEFT)
addi.d TEMP, KK, 1
#else
addi.d TEMP, KK, 2
#endif
srai.d L, TEMP, 2
MOV c42, c11
bge $r0, L, .L25
#else
/* Plain GEMM: preload through B directly, L = K / 4, then BO = B. */
LD a1, AO, 0 * SIZE
MOV c21, c11
LD b1, B, 0 * SIZE
MOV c31, c11
LD a3, AO, 4 * SIZE
MOV c41, c11
LD b2, B, 1 * SIZE
srai.d L, K, 2
LD b3, B, 2 * SIZE
MOV c12, c11
LD b4, B, 3 * SIZE
MOV c22, c11
LD b5, B, 4 * SIZE
MOV c32, c11
MOV c42, c11
move BO, B
bge $r0, L, .L25
#endif
.align 3
/* Unrolled loop: four K steps per iteration (AO advances by 8 * SIZE, BO by
   16 * SIZE).  Loads are interleaved ahead of use; after AO is advanced the
   a3 load uses an offset relative to the new AO, while the remaining B loads
   still use offsets relative to the old BO until BO is advanced at the end. */
.L22:
/* Step 0: a1/a2 = AO[0], AO[1]; B offsets 0..3. */
MADD1 c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD3 c21, b2, a1, c21
addi.d L, L, -1
MADD1 c31, b3, a1, c31
MADD3 c41, b4, a1, c41
LD a1, AO, 2 * SIZE
MADD2 c12, b1, a2, c12
LD b1, BO, 8 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 7 * SIZE
/* Step 1: a1/a2 = AO[2], AO[3]; B offsets 4..7. */
MADD1 c11, b5, a1, c11
LD a2, AO, 3 * SIZE
MADD3 c21, b2, a1, c21
MADD1 c31, b3, a1, c31
MADD3 c41, b4, a1, c41
LD a1, AO, 8 * SIZE
MADD2 c12, b5, a2, c12
LD b5, BO, 12 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 9 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 10 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 11 * SIZE
/* Step 2: a3/a2 = AO[4], AO[5]; B offsets 8..11. */
MADD1 c11, b1, a3, c11
LD a2, AO, 5 * SIZE
MADD3 c21, b2, a3, c21
MADD1 c31, b3, a3, c31
MADD3 c41, b4, a3, c41
LD a3, AO, 6 * SIZE
MADD2 c12, b1, a2, c12
LD b1, BO, 16 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 13 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 14 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 15 * SIZE
/* Step 3: a3/a2 = AO[6], AO[7]; B offsets 12..15. */
MADD1 c11, b5, a3, c11
LD a2, AO, 7 * SIZE
MADD3 c21, b2, a3, c21
addi.d AO, AO, 8 * SIZE
MADD1 c31, b3, a3, c31
MADD3 c41, b4, a3, c41
LD a3, AO, 4 * SIZE
MADD2 c12, b5, a2, c12
LD b5, BO, 20 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 17 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 18 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 19 * SIZE
addi.d BO, BO, 16 * SIZE
blt $r0, L, .L22
.align 3
/* Remainder: the low two bits of the step count. */
.L25:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
bge $r0, L, .L28
.align 3
/* One K step per iteration: AO advances by 2 * SIZE, BO by 4 * SIZE.  BO is
   advanced early, so the B reloads use offsets 0..3 from the new BO; AO is
   advanced last, so the a1 reload uses offset 2 from the old AO. */
.L26:
MADD1 c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD3 c21, b2, a1, c21
addi.d L, L, -1
MADD1 c31, b3, a1, c31
addi.d BO, BO, 4 * SIZE
MADD3 c41, b4, a1, c41
LD a1, AO, 2 * SIZE
MADD2 c12, b1, a2, c12
LD b1, BO, 0 * SIZE
MADD4 c22, b2, a2, c22
LD b2, BO, 1 * SIZE
MADD2 c32, b3, a2, c32
LD b3, BO, 2 * SIZE
MADD4 c42, b4, a2, c42
LD b4, BO, 3 * SIZE
addi.d AO, AO, 2 * SIZE
blt $r0, L, .L26
/* Write-back for two columns: combine the partial sums, scale by
   ALPHA_R/ALPHA_I and store one value pair through CO1 and CO2.  The GEMM
   build adds to the existing C values, the TRMM build overwrites them.  The
   last store (b4) is shared by both builds and sits after the #endif. */
.L28:
#ifndef TRMMKERNEL
LD b1, CO1, 0 * SIZE
ADD c11, c11, c22
LD b2, CO1, 1 * SIZE
ADD c12, c12, c21
LD b3, CO2, 0 * SIZE
ADD c31, c31, c42
LD b4, CO2, 1 * SIZE
ADD c32, c32, c41
MADD b1, c11, ALPHA_R, b1
addi.d CO1,CO1, 2 * SIZE
MADD b2, c12, ALPHA_R, b2
addi.d CO2,CO2, 2 * SIZE
MADD b3, c31, ALPHA_R, b3
addi.d I, I, -1
MADD b4, c32, ALPHA_R, b4
NMSUB b1, c12, ALPHA_I, b1
MADD b2, c11, ALPHA_I, b2
/* c11 is cleared for the next .L21 iteration once its value has been used. */
MTC c11, $r0
NMSUB b3, c32, ALPHA_I, b3
MADD b4, c31, ALPHA_I, b4
ST b1, CO1, -2 * SIZE
ST b2, CO1, -1 * SIZE
ST b3, CO2, -2 * SIZE
#else
ADD c11, c11, c22
ADD c12, c12, c21
ADD c31, c31, c42
ADD c32, c32, c41
MUL b1, ALPHA_R, c11
addi.d CO1,CO1, 2 * SIZE
MUL b2, ALPHA_R, c12
addi.d CO2,CO2, 2 * SIZE
MUL b3, ALPHA_R, c31
addi.d I, I, -1
MUL b4, ALPHA_R, c32
NMSUB b1, c12, ALPHA_I, b1
MADD b2, c11, ALPHA_I, b2
MTC c11, $r0
NMSUB b3, c32, ALPHA_I, b3
MADD b4, c31, ALPHA_I, b4
ST b1, CO1, -2 * SIZE
ST b2, CO1, -1 * SIZE
ST b3, CO2, -2 * SIZE
/* For these variants AO and BO are moved further by (K - KK - 1) steps
   (LEFT) or (K - KK - 2) steps (otherwise); presumably this skips the part
   of the packed data that this tile did not consume - TODO confirm. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub.d TEMP, K, KK
#ifdef LEFT
addi.d TEMP, TEMP, -1
#else
addi.d TEMP, TEMP, -2
#endif
slli.d L, TEMP, ZBASE_SHIFT
slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
add.d AO, AO, L
add.d BO, BO, TEMP
#endif
#ifdef LEFT
addi.d KK, KK, 1
#endif
#endif
ST b4, CO2, -1 * SIZE
blt $r0, I, .L21
.align 3
/* End of the two-column pass. */
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
/* Non-LEFT TRMM: the offset grows by the two columns just processed. */
addi.d KK, KK, 2
#endif
/* B continues from where BO stopped. */
move B, BO
.align 3
/* ---- Single-column pass ------------------------------------------------
   Runs once when bit 0 of N is set.  Uses four accumulators (c11, c12, c21,
   c22) and one C column (CO1). */
.L30:
andi J, N, 1
MTC c11, $r0
move CO1, C
/* Bit 0 of N clear: nothing left to do. */
bge $r0, J, .L999
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
move I, M
add.d C, CO1, LDC
move AO, A
bge $r0, I, .L39
.align 3
/* One iteration per value of I. */
.L31:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
/* Skip KK steps; A and B use the same step size in this pass. */
slli.d TEMP, KK, ZBASE_SHIFT
add.d AO, AO, TEMP
add.d BO, B, TEMP
#endif
/* Preload a1, a2, b1, b2 for the first step and a3, b3 from offset 4 for
   the third step.  c21..c42 are cleared although only c21, c12 and c22 are
   accumulated into in this pass. */
LD a1, AO, 0 * SIZE
MOV c21, c11
LD b1, BO, 0 * SIZE
MOV c31, c11
LD a2, AO, 1 * SIZE
MOV c41, c11
LD b2, BO, 1 * SIZE
MOV c12, c11
MOV c22, c11
LD a3, AO, 4 * SIZE
MOV c32, c11
LD b3, BO, 4 * SIZE
/* TEMP = number of K steps to run; the LEFT and non-LEFT fallbacks are
   identical here because the tile is one element in either direction. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub.d TEMP, K, KK
#elif defined(LEFT)
addi.d TEMP, KK, 1
#else
addi.d TEMP, KK, 1
#endif
srai.d L, TEMP, 2
MOV c42, c11
bge $r0, L, .L35
#else
/* Plain GEMM: preload through B directly, L = K / 4, then BO = B. */
LD a1, AO, 0 * SIZE
MOV c21, c11
LD b1, B, 0 * SIZE
MOV c31, c11
LD a2, AO, 1 * SIZE
MOV c41, c11
LD b2, B, 1 * SIZE
MOV c12, c11
srai.d L, K, 2
MOV c22, c11
LD a3, AO, 4 * SIZE
MOV c32, c11
LD b3, B, 4 * SIZE
MOV c42, c11
move BO, B
bge $r0, L, .L35
#endif
.align 3
/* Unrolled loop: four K steps per iteration (AO and BO both advance by
   8 * SIZE).  b4 serves as the second B value in steps 1 and 3, and the
   loads run ahead of the values being consumed. */
.L32:
/* Step 0: a1/a2 = AO[0], AO[1]; b1/b2 = BO[0], BO[1]. */
MADD1 c11, b1, a1, c11
LD b4, BO, 3 * SIZE
MADD3 c21, b2, a1, c21
LD a1, AO, 2 * SIZE
MADD2 c12, b1, a2, c12
LD b1, BO, 2 * SIZE
MADD4 c22, b2, a2, c22
LD a2, AO, 3 * SIZE
/* Step 1: a1/a2 = AO[2], AO[3]; b1/b4 = BO[2], BO[3]. */
MADD1 c11, b1, a1, c11
LD b2, BO, 5 * SIZE
MADD3 c21, b4, a1, c21
LD a1, AO, 8 * SIZE
MADD2 c12, b1, a2, c12
LD b1, BO, 8 * SIZE
MADD4 c22, b4, a2, c22
LD a2, AO, 5 * SIZE
/* Step 2: a3/a2 = AO[4], AO[5]; b3/b2 = BO[4], BO[5]. */
MADD1 c11, b3, a3, c11
LD b4, BO, 7 * SIZE
MADD3 c21, b2, a3, c21
LD a3, AO, 6 * SIZE
MADD2 c12, b3, a2, c12
LD b3, BO, 6 * SIZE
MADD4 c22, b2, a2, c22
LD a2, AO, 7 * SIZE
/* Step 3: a3/a2 = AO[6], AO[7]; b3/b4 = BO[6], BO[7]. */
MADD1 c11, b3, a3, c11
LD b2, BO, 9 * SIZE
MADD3 c21, b4, a3, c21
LD a3, AO, 12 * SIZE
MADD2 c12, b3, a2, c12
LD b3, BO, 12 * SIZE
MADD4 c22, b4, a2, c22
LD a2, AO, 9 * SIZE
addi.d AO, AO, 8 * SIZE
addi.d L, L, -1
addi.d BO, BO, 8 * SIZE
blt $r0, L, .L32
.align 3
/* Remainder: the low two bits of the step count. */
.L35:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
bge $r0, L, .L38
.align 3
/* One K step per iteration: AO and BO both advance by 2 * SIZE after the
   values for the following step have been loaded from offsets 2 and 3. */
.L36:
MADD1 c11, b1, a1, c11
addi.d L, L, -1
MADD3 c21, b2, a1, c21
LD a1, AO, 2 * SIZE
MADD2 c12, b1, a2, c12
LD b1, BO, 2 * SIZE
MADD4 c22, b2, a2, c22
LD a2, AO, 3 * SIZE
LD b2, BO, 3 * SIZE
addi.d BO, BO, 2 * SIZE
addi.d AO, AO, 2 * SIZE
blt $r0, L, .L36
/* Write-back for one column: combine the partial sums, scale by
   ALPHA_R/ALPHA_I and store one value pair through CO1.  The GEMM build
   adds to the existing C values, the TRMM build overwrites them.  Each
   build has its own copy of the loop-back branch to .L31. */
.L38:
#ifndef TRMMKERNEL
LD b1, CO1, 0 * SIZE
ADD c11, c11, c22
LD b2, CO1, 1 * SIZE
ADD c12, c12, c21
MADD b1, c11, ALPHA_R, b1
addi.d CO1,CO1, 2 * SIZE
MADD b2, c12, ALPHA_R, b2
addi.d I, I, -1
NMSUB b1, c12, ALPHA_I, b1
MADD b2, c11, ALPHA_I, b2
/* c11 is cleared for the next .L31 iteration once its value has been used. */
MTC c11, $r0
ST b1, CO1, -2 * SIZE
ST b2, CO1, -1 * SIZE
blt $r0, I, .L31
#else
ADD c11, c11, c22
ADD c12, c12, c21
MUL b1, ALPHA_R, c11
addi.d CO1,CO1, 2 * SIZE
MUL b2, ALPHA_R, c12
addi.d I, I, -1
NMSUB b1, c12, ALPHA_I, b1
MADD b2, c11, ALPHA_I, b2
MTC c11, $r0
/* For these variants AO and BO are both moved further by (K - KK - 1)
   steps; presumably this skips the part of the packed data that this tile
   did not consume - TODO confirm. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub.d TEMP, K, KK
#ifdef LEFT
addi.d TEMP, TEMP, -1
#else
addi.d TEMP, TEMP, -1
#endif
slli.d TEMP, TEMP, ZBASE_SHIFT
add.d AO, AO, TEMP
add.d BO, BO, TEMP
#endif
#ifdef LEFT
addi.d KK, KK, 1
#endif
ST b1, CO1, -2 * SIZE
ST b2, CO1, -1 * SIZE
blt $r0, I, .L31
#endif
.align 3
/* End of the single-column pass. */
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addi.d KK, KK, 1
#endif
move B, BO
.align 3
/* Common exit: restore every register saved in the prologue from the same
   stack slots, release the 128-byte frame and return. */
.L999:
/* Integer registers used as CO3, CO4 and L. */
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
LDARG $r25, $sp, 64
/* Floating-point registers used as c71, c72, c81, c82, a3 and a4. */
fld.d $f24, $sp, 16
fld.d $f25, $sp, 24
fld.d $f26, $sp, 32
fld.d $f27, $sp, 40
fld.d $f28, $sp, 48
fld.d $f29, $sp, 56
#if defined(TRMMKERNEL)
/* KK and TEMP, saved only in the TRMM build. */
LDARG $r26, $sp, 72
LDARG $r27, $sp, 80
#endif
#ifndef __64BIT__
/* Mirrors the conditional save of $f18-$f21 in the prologue. */
fld.d $f18, $sp, 88
fld.d $f19, $sp, 96
fld.d $f20, $sp, 104
fld.d $f21, $sp, 112
#endif
addi.d $sp, $sp, 128
/* Copies I ($r17) into $r4 and a1/b1 ($f22/$f23) into $f0/$f1 just before
   returning.  NOTE(review): the purpose of these copies is not evident from
   this file; confirm whether any caller relies on the values. */
move $r4, $r17
fmov.d $f0, $f22
fmov.d $f1, $f23
/* Return: jump to the address held in $r1, discarding the link value. */
jirl $r0, $r1, 0x0
EPILOGUE