/*
OpenBLAS/kernel/loongarch64/zgemm3m_kernel.S

1360 lines
33 KiB
ArmAsm
*/

/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* ---- Inputs: read on entry, never initialised in this file -------------
   (presumably the integer/FP argument registers of the LoongArch calling
   convention -- confirm against the C prototype of the kernel). */
/* M: walked two at a time by the I loops (M >> 1), plus an odd remainder (M & 1). */
#define M $r4
/* N: split into groups of eight (N >> 3), then four, two and one columns. */
#define N $r5
/* K: trip count of the multiply-accumulate loops (K >> 2 unrolled, K & 3 remainder). */
#define K $r6
/* A: base of the stream read through AO; AO restarts from A for each column group. */
#define A $r7
/* B: base of the stream read through BO; set to BO at the end of each column group. */
#define B $r8
/* C: output base; advanced by LDC per column handled. */
#define C $r9
/* LDC: column stride of C; shifted left by ZBASE_SHIFT in the prologue. */
#define LDC $r10
/* ---- Running pointers and loop counters -------------------------------- */
#define AO $r12
#define BO $r13
/* I counts row tiles, J counts column groups, L counts K iterations. */
#define I $r17
#define J $r18
#define L $r11
/* ---- Per-column output pointers: COn = C + (n-1)*LDC -------------------
   CO3..CO8 live in $r23..$r28, which the prologue saves on the stack. */
#define CO1 $r14
#define CO2 $r15
#define CO3 $r23
#define CO4 $r24
#define CO5 $r25
#define CO6 $r26
#define CO7 $r27
#define CO8 $r28
/* ---- Operand registers --------------------------------------------------
   a1..a4 hold values loaded through AO; a3/a4 ($f28/$f29) are saved by the
   prologue. */
#define a1 $f22
#define a2 $f8
#define a3 $f28
#define a4 $f29
/* b1..b8 hold values loaded through BO. */
#define b1 $f23
#define b2 $f9
#define b3 $f10
#define b4 $f11
#define b5 $f12
#define b6 $f13
#define b7 $f14
#define b8 $f15
/* a5 shares the b8 register ($f15). */
#define a5 b8
/* ---- Accumulators --------------------------------------------------------
   cXY is written back through CO<X>: Y = 1 updates offsets 0/1, Y = 2
   updates offsets 2/3 (see the write-back code at .L18).
   c71..c82 ($f24..$f27) are saved by the prologue. */
#define c11 $f16
#define c12 $f17
#define c21 $f3
#define c22 $f4
#define c31 $f2
#define c32 $f5
#define c41 $f6
#define c42 $f7
#define c51 $f18
#define c52 $f19
#define c61 $f20
#define c62 $f21
#define c71 $f24
#define c72 $f25
#define c81 $f26
#define c82 $f27
/* ---- Scale factors: ALPHA_R is applied to the value stored at even
   offsets of C, ALPHA_I to the value stored at odd offsets. Both are read
   on entry and never written. */
#define ALPHA_R $f0
#define ALPHA_I $f1
PROLOGUE
/* Reserve a 128-byte frame; offsets 0..95 are used below. */
addi.d $sp, $sp, -128
/* Spill the floating-point registers that back c71..c82 ($f24..$f27)
   and a3/a4 ($f28/$f29). */
fst.d $f29, $sp, 88
fst.d $f28, $sp, 80
fst.d $f27, $sp, 72
fst.d $f26, $sp, 64
fst.d $f25, $sp, 56
fst.d $f24, $sp, 48
/* Spill the integer registers that back CO3..CO8 ($r23..$r28). */
SDARG $r28, $sp, 40
SDARG $r27, $sp, 32
SDARG $r26, $sp, 24
SDARG $r25, $sp, 16
SDARG $r24, $sp, 8
SDARG $r23, $sp, 0
/* J = N >> 3: number of complete eight-column groups. */
srai.d J, N, 3
/* Scale the column stride by ZBASE_SHIFT (from common.h) so it can be
   added directly to the CO pointers. */
slli.d LDC, LDC, ZBASE_SHIFT
/* No complete eight-column group: continue with the N & 4 section. */
bge $r0, J, .L30
/* .L10: set-up for one group of eight output columns; re-entered from
   .L29 while J > 0.  Pointer arithmetic and accumulator clearing are
   interleaved. */
.L10:
/* CO1..CO8 = C + 0..7 * LDC. */
move CO1, C
/* c11 is set from the zero register; the MOV lines below copy it into
   c21..c61 (MTC/MOV are common.h macros -- presumably GPR-to-FPR move and
   FPR copy). */
MTC c11, $r0
add.d CO2, C, LDC
/* AO restarts from A for every column group. */
move AO, A
add.d CO3, CO2, LDC
/* One column group consumed. */
addi.d J, J, -1
add.d CO4, CO3, LDC
MOV c21, c11
add.d CO5, CO4, LDC
MOV c31, c11
add.d CO6, CO5, LDC
MOV c41, c11
add.d CO7, CO6, LDC
MOV c51, c11
add.d CO8, CO7, LDC
/* I = M >> 1: number of two-row tiles in this column group. */
srai.d I, M, 1
/* C now points just past the eight columns of this group. */
add.d C, CO8, LDC
MOV c61, c11
/* No complete row pair: go to the single-row handling at .L20. */
bge $r0, I, .L20
/* .L11: one tile of two rows by eight columns.
   Per K step the code consumes 2 values through AO and 8 values through BO
   and updates 16 accumulators: cX1 += b * (first AO value),
   cX2 += b * (second AO value).  MADD is a common.h macro -- presumably
   dst = src1 * src2 + src3 (confirm).
   c11..c61 are already clear on entry (cleared at .L10 or at the end of
   the write-back below); the remaining accumulators are cleared here while
   the first operands are loaded. */
.L11:
LD a1, AO, 0 * SIZE
MOV c71, c11
LD b1, B, 0 * SIZE
MOV c81, c11
LD a3, AO, 4 * SIZE
MOV c12, c11
LD b2, B, 1 * SIZE
MOV c22, c11
/* L = K >> 2: iterations of the four-way unrolled loop. */
srai.d L, K, 2
MOV c32, c11
LD b3, B, 2 * SIZE
MOV c42, c11
LD b4, B, 3 * SIZE
MOV c52, c11
LD b5, B, 4 * SIZE
MOV c62, c11
LD b6, B, 8 * SIZE
MOV c72, c11
LD b7, B, 12 * SIZE
MOV c82, c11
move BO, B
/* Fewer than four K steps: only the remainder loop runs. */
bge $r0, L, .L15
/* Pipeline priming: first-row products of K step 0 for columns 1..4. */
MADD c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD c21, b2, a1, c21
addi.d L, L, -1
MADD c31, b3, a1, c31
MADD c41, b4, a1, c41
/* Exactly one unrolled iteration: skip the steady-state loop and drain. */
bge $r0, L, .L13
.align 3
/* .L12: steady-state loop, four K steps per pass.  On entry the first-row
   products of step 0 for columns 1..4 are already accumulated. */
.L12:
/* K step 0: second row, columns 1..4. */
MADD c12, b1, a2, c12
LD b1, BO, 16 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 7 * SIZE
/* K step 0: first row, columns 5..8. */
MADD c51, b5, a1, c51
LD a4, AO, 2 * SIZE
MADD c61, b2, a1, c61
MADD c71, b3, a1, c71
MADD c81, b4, a1, c81
/* a1 is reloaded for the next pass (offset 8 = offset 0 after AO += 8). */
LD a1, AO, 8 * SIZE
/* K step 0: second row, columns 5..8. */
MADD c52, b5, a2, c52
LD b5, BO, 20 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 9 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 10 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 11 * SIZE
/* K step 1: first row (a4 = AO[2]), columns 1..4 (b6 = BO[8], BO[9..11]). */
MADD c11, b6, a4, c11
LD a2, AO, 3 * SIZE
MADD c21, b2, a4, c21
MADD c31, b3, a4, c31
MADD c41, b4, a4, c41
/* K step 1: second row (a2 = AO[3]), columns 1..4. */
MADD c12, b6, a2, c12
LD b6, BO, 24 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 13 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 14 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 15 * SIZE
/* K step 1: first row, columns 5..8 (b7 = BO[12], BO[13..15]). */
MADD c51, b7, a4, c51
MADD c61, b2, a4, c61
MADD c71, b3, a4, c71
MADD c81, b4, a4, c81
/* K step 1: second row, columns 5..8. */
MADD c52, b7, a2, c52
LD b7, BO, 28 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 17 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 18 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 19 * SIZE
/* K step 2: first row (a3 = AO[4]), columns 1..4 (b1 = BO[16], BO[17..19]). */
MADD c11, b1, a3, c11
LD a2, AO, 5 * SIZE
MADD c21, b2, a3, c21
MADD c31, b3, a3, c31
MADD c41, b4, a3, c41
/* K step 2: second row (a2 = AO[5]), columns 1..4. */
MADD c12, b1, a2, c12
LD b1, BO, 32 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 21 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 22 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 23 * SIZE
/* K step 2: first row, columns 5..8 (b5 = BO[20], BO[21..23]). */
MADD c51, b5, a3, c51
LD a4, AO, 6 * SIZE
MADD c61, b2, a3, c61
MADD c71, b3, a3, c71
MADD c81, b4, a3, c81
/* a3 is reloaded for the next pass (offset 12 = offset 4 after AO += 8). */
LD a3, AO, 12 * SIZE
/* K step 2: second row, columns 5..8. */
MADD c52, b5, a2, c52
LD b5, BO, 36 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 25 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 26 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 27 * SIZE
/* K step 3: first row (a4 = AO[6]), columns 1..4 (b6 = BO[24], BO[25..27]). */
MADD c11, b6, a4, c11
LD a2, AO, 7 * SIZE
MADD c21, b2, a4, c21
MADD c31, b3, a4, c31
MADD c41, b4, a4, c41
addi.d L, L, -1
/* K step 3: second row (a2 = AO[7]), columns 1..4. */
MADD c12, b6, a2, c12
LD b6, BO, 40 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 29 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 30 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 31 * SIZE
/* K step 3: first row, columns 5..8 (b7 = BO[28], BO[29..31]).
   BO and AO are advanced here; later offsets are relative to the new
   pointers. */
MADD c51, b7, a4, c51
addi.d BO, BO, 32 * SIZE
MADD c61, b2, a4, c61
addi.d AO, AO, 8 * SIZE
MADD c71, b3, a4, c71
MADD c81, b4, a4, c81
/* K step 3: second row, columns 5..8. */
MADD c52, b7, a2, c52
LD b7, BO, 12 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 1 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 2 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 3 * SIZE
/* Next pass, K step 0: first row, columns 1..4 (same work as the priming
   code before .L12). */
MADD c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD c21, b2, a1, c21
MADD c31, b3, a1, c31
MADD c41, b4, a1, c41
blt $r0, L, .L12
.align 3
/* .L13: final unrolled pass.  Same sequence as .L12 without the L
   decrement and without the trailing first-row products of a following
   pass; it leaves a1, a3, b1..b7 loaded for the remainder loop. */
.L13:
/* K step 0: second row, columns 1..4. */
MADD c12, b1, a2, c12
LD b1, BO, 16 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 7 * SIZE
/* K step 0: first row, columns 5..8. */
MADD c51, b5, a1, c51
MADD c61, b2, a1, c61
LD a4, AO, 2 * SIZE
MADD c71, b3, a1, c71
MADD c81, b4, a1, c81
LD a1, AO, 8 * SIZE
/* K step 0: second row, columns 5..8. */
MADD c52, b5, a2, c52
LD b5, BO, 20 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 9 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 10 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 11 * SIZE
/* K step 1: first row, columns 1..4. */
MADD c11, b6, a4, c11
LD a2, AO, 3 * SIZE
MADD c21, b2, a4, c21
MADD c31, b3, a4, c31
MADD c41, b4, a4, c41
/* K step 1: second row, columns 1..4. */
MADD c12, b6, a2, c12
LD b6, BO, 24 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 13 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 14 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 15 * SIZE
/* K step 1: first row, columns 5..8. */
MADD c51, b7, a4, c51
MADD c61, b2, a4, c61
MADD c71, b3, a4, c71
MADD c81, b4, a4, c81
/* K step 1: second row, columns 5..8. */
MADD c52, b7, a2, c52
LD b7, BO, 28 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 17 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 18 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 19 * SIZE
/* K step 2: first row, columns 1..4. */
MADD c11, b1, a3, c11
LD a2, AO, 5 * SIZE
MADD c21, b2, a3, c21
MADD c31, b3, a3, c31
MADD c41, b4, a3, c41
/* K step 2: second row, columns 1..4. */
MADD c12, b1, a2, c12
LD b1, BO, 32 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 21 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 22 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 23 * SIZE
/* K step 2: first row, columns 5..8. */
MADD c51, b5, a3, c51
MADD c61, b2, a3, c61
LD a4, AO, 6 * SIZE
MADD c71, b3, a3, c71
MADD c81, b4, a3, c81
LD a3, AO, 12 * SIZE
/* K step 2: second row, columns 5..8. */
MADD c52, b5, a2, c52
LD b5, BO, 36 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 25 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 26 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 27 * SIZE
/* K step 3: first row, columns 1..4. */
MADD c11, b6, a4, c11
LD a2, AO, 7 * SIZE
MADD c21, b2, a4, c21
MADD c31, b3, a4, c31
MADD c41, b4, a4, c41
/* K step 3: second row, columns 1..4. */
MADD c12, b6, a2, c12
LD b6, BO, 40 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 29 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 30 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 31 * SIZE
/* K step 3: first row, columns 5..8; BO and AO advance past the four steps. */
MADD c51, b7, a4, c51
addi.d BO, BO, 32 * SIZE
MADD c61, b2, a4, c61
addi.d AO, AO, 8 * SIZE
MADD c71, b3, a4, c71
MADD c81, b4, a4, c81
/* K step 3: second row, columns 5..8. */
MADD c52, b7, a2, c52
LD b7, BO, 12 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 1 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 2 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 3 * SIZE
.align 3
/* .L15: remainder of K (K & 3), one K step per pass of .L16. */
.L15:
andi L, K, 3
bge $r0, L, .L18
.align 3
/* .L16: expects a1 = AO[0], b1..b4 = BO[0..3], b5 = BO[4] on entry and
   re-establishes that state for the next pass. */
.L16:
MADD c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD c21, b2, a1, c21
MADD c31, b3, a1, c31
MADD c41, b4, a1, c41
MADD c12, b1, a2, c12
LD b1, BO, 8 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 7 * SIZE
MADD c51, b5, a1, c51
addi.d L, L, -1
MADD c61, b2, a1, c61
/* Advance by one K step: 2 values through AO, 8 through BO. */
addi.d AO, AO, 2 * SIZE
MADD c71, b3, a1, c71
addi.d BO, BO, 8 * SIZE
MADD c81, b4, a1, c81
LD a1, AO, 0 * SIZE
MADD c52, b5, a2, c52
LD b5, BO, 4 * SIZE
MADD c62, b2, a2, c62
LD b2, BO, 1 * SIZE
MADD c72, b3, a2, c72
LD b3, BO, 2 * SIZE
MADD c82, b4, a2, c82
LD b4, BO, 3 * SIZE
blt $r0, L, .L16
/* .L18: write-back.  For each column X: CO<X>[0] += cX1 * ALPHA_R,
   CO<X>[1] += cX1 * ALPHA_I, CO<X>[2] += cX2 * ALPHA_R,
   CO<X>[3] += cX2 * ALPHA_I.  $f22/$f8/$f23/$f9 and $f10..$f13 (the
   a1/a2/b1..b6 registers) serve as temporaries; loads for the next column
   pair are interleaved with stores of the previous one. */
.L18:
/* Columns 1 and 2. */
LD $f22, CO1, 0 * SIZE
LD $f8, CO1, 1 * SIZE
LD $f23, CO1, 2 * SIZE
LD $f9, CO1, 3 * SIZE
LD $f10, CO2, 0 * SIZE
MADD $f22, c11, ALPHA_R, $f22
LD $f11, CO2, 1 * SIZE
MADD $f8, c11, ALPHA_I, $f8
LD $f12, CO2, 2 * SIZE
MADD $f23, c12, ALPHA_R, $f23
LD $f13, CO2, 3 * SIZE
MADD $f9, c12, ALPHA_I, $f9
MADD $f10, c21, ALPHA_R, $f10
ST $f22, CO1, 0 * SIZE
MADD $f11, c21, ALPHA_I, $f11
ST $f8, CO1, 1 * SIZE
MADD $f12, c22, ALPHA_R, $f12
ST $f23, CO1, 2 * SIZE
MADD $f13, c22, ALPHA_I, $f13
ST $f9, CO1, 3 * SIZE
/* Columns 3 and 4. */
LD $f22, CO3, 0 * SIZE
LD $f8, CO3, 1 * SIZE
LD $f23, CO3, 2 * SIZE
LD $f9, CO3, 3 * SIZE
ST $f10, CO2, 0 * SIZE
ST $f11, CO2, 1 * SIZE
ST $f12, CO2, 2 * SIZE
ST $f13, CO2, 3 * SIZE
LD $f10, CO4, 0 * SIZE
LD $f11, CO4, 1 * SIZE
LD $f12, CO4, 2 * SIZE
LD $f13, CO4, 3 * SIZE
MADD $f22, c31, ALPHA_R, $f22
MADD $f8, c31, ALPHA_I, $f8
MADD $f23, c32, ALPHA_R, $f23
MADD $f9, c32, ALPHA_I, $f9
MADD $f10, c41, ALPHA_R, $f10
ST $f22, CO3, 0 * SIZE
MADD $f11, c41, ALPHA_I, $f11
ST $f8, CO3, 1 * SIZE
MADD $f12, c42, ALPHA_R, $f12
ST $f23, CO3, 2 * SIZE
MADD $f13, c42, ALPHA_I, $f13
ST $f9, CO3, 3 * SIZE
/* Columns 5 and 6. */
LD $f22, CO5, 0 * SIZE
LD $f8, CO5, 1 * SIZE
LD $f23, CO5, 2 * SIZE
LD $f9, CO5, 3 * SIZE
ST $f10, CO4, 0 * SIZE
ST $f11, CO4, 1 * SIZE
ST $f12, CO4, 2 * SIZE
ST $f13, CO4, 3 * SIZE
LD $f10, CO6, 0 * SIZE
LD $f11, CO6, 1 * SIZE
LD $f12, CO6, 2 * SIZE
LD $f13, CO6, 3 * SIZE
/* CO1..CO4 are finished for this tile: step them to the next row pair. */
MADD $f22, c51, ALPHA_R, $f22
addi.d CO1,CO1, 4 * SIZE
MADD $f8, c51, ALPHA_I, $f8
addi.d CO2,CO2, 4 * SIZE
MADD $f23, c52, ALPHA_R, $f23
addi.d CO3,CO3, 4 * SIZE
MADD $f9, c52, ALPHA_I, $f9
addi.d CO4,CO4, 4 * SIZE
MADD $f10, c61, ALPHA_R, $f10
ST $f22, CO5, 0 * SIZE
MADD $f11, c61, ALPHA_I, $f11
ST $f8, CO5, 1 * SIZE
MADD $f12, c62, ALPHA_R, $f12
ST $f23, CO5, 2 * SIZE
MADD $f13, c62, ALPHA_I, $f13
ST $f9, CO5, 3 * SIZE
/* Columns 7 and 8. */
LD $f22, CO7, 0 * SIZE
LD $f8, CO7, 1 * SIZE
LD $f23, CO7, 2 * SIZE
LD $f9, CO7, 3 * SIZE
ST $f10, CO6, 0 * SIZE
ST $f11, CO6, 1 * SIZE
ST $f12, CO6, 2 * SIZE
ST $f13, CO6, 3 * SIZE
LD $f10, CO8, 0 * SIZE
/* One two-row tile done. */
addi.d I, I, -1
LD $f11, CO8, 1 * SIZE
/* c11 is cleared for the next tile once its last use is behind us. */
MTC c11, $r0
LD $f12, CO8, 2 * SIZE
LD $f13, CO8, 3 * SIZE
MADD $f22, c71, ALPHA_R, $f22
addi.d CO5,CO5, 4 * SIZE
MADD $f8, c71, ALPHA_I, $f8
addi.d CO6,CO6, 4 * SIZE
MADD $f23, c72, ALPHA_R, $f23
addi.d CO7,CO7, 4 * SIZE
MADD $f9, c72, ALPHA_I, $f9
addi.d CO8,CO8, 4 * SIZE
/* CO7/CO8 were already advanced, hence the negative store offsets. */
MADD $f10, c81, ALPHA_R, $f10
ST $f22, CO7, -4 * SIZE
MADD $f11, c81, ALPHA_I, $f11
ST $f8, CO7, -3 * SIZE
MADD $f12, c82, ALPHA_R, $f12
ST $f23, CO7, -2 * SIZE
MADD $f13, c82, ALPHA_I, $f13
ST $f9, CO7, -1 * SIZE
ST $f10, CO8, -4 * SIZE
/* Clear c21..c61 for the next tile (or for the single-row tail). */
MOV c21, c11
ST $f11, CO8, -3 * SIZE
MOV c31, c11
ST $f12, CO8, -2 * SIZE
MOV c41, c11
ST $f13, CO8, -1 * SIZE
MOV c51, c11
MOV c61, c11
blt $r0, I, .L11
.align 3
/* .L20: single remaining row (M & 1) of the eight-column group.
   c11..c51 are expected to be clear already (cleared at .L10 or at the end
   of the .L18 write-back); c61, c71 and c81 are cleared here.
   Per K step: 1 value through AO, 8 values through BO. */
.L20:
andi I, M, 1
MOV c61, c11
MOV c71, c11
/* M is even: nothing left for this column group. */
bge $r0, I, .L29
/* Preload four AO values (one per unrolled K step) and the leading BO
   values. */
LD a1, AO, 0 * SIZE
LD a2, AO, 1 * SIZE
LD a3, AO, 2 * SIZE
LD a4, AO, 3 * SIZE
LD b1, B, 0 * SIZE
LD b2, B, 1 * SIZE
LD b3, B, 2 * SIZE
LD b4, B, 3 * SIZE
LD b5, B, 4 * SIZE
LD b6, B, 8 * SIZE
LD b7, B, 12 * SIZE
srai.d L, K, 2
MOV c81, c11
move BO, B
bge $r0, L, .L25
.align 3
/* .L22: four K steps per pass; a1..a4 carry the AO value of steps 0..3. */
.L22:
/* K step 0 (a1): columns 1..8. */
MADD c11, b1, a1, c11
LD b1, BO, 16 * SIZE
MADD c21, b2, a1, c21
LD b2, BO, 5 * SIZE
MADD c31, b3, a1, c31
LD b3, BO, 6 * SIZE
MADD c41, b4, a1, c41
LD b4, BO, 7 * SIZE
MADD c51, b5, a1, c51
LD b5, BO, 20 * SIZE
MADD c61, b2, a1, c61
LD b2, BO, 9 * SIZE
MADD c71, b3, a1, c71
LD b3, BO, 10 * SIZE
MADD c81, b4, a1, c81
LD b4, BO, 11 * SIZE
LD a1, AO, 4 * SIZE
addi.d L, L, -1
/* K step 1 (a2): columns 1..8. */
MADD c11, b6, a2, c11
LD b6, BO, 24 * SIZE
MADD c21, b2, a2, c21
LD b2, BO, 13 * SIZE
MADD c31, b3, a2, c31
LD b3, BO, 14 * SIZE
MADD c41, b4, a2, c41
LD b4, BO, 15 * SIZE
MADD c51, b7, a2, c51
LD b7, BO, 28 * SIZE
MADD c61, b2, a2, c61
LD b2, BO, 17 * SIZE
MADD c71, b3, a2, c71
LD b3, BO, 18 * SIZE
MADD c81, b4, a2, c81
LD b4, BO, 19 * SIZE
LD a2, AO, 5 * SIZE
/* AO advances here; later AO offsets are relative to the new pointer. */
addi.d AO, AO, 4 * SIZE
/* K step 2 (a3): columns 1..8. */
MADD c11, b1, a3, c11
LD b1, BO, 32 * SIZE
MADD c21, b2, a3, c21
LD b2, BO, 21 * SIZE
MADD c31, b3, a3, c31
LD b3, BO, 22 * SIZE
MADD c41, b4, a3, c41
LD b4, BO, 23 * SIZE
MADD c51, b5, a3, c51
LD b5, BO, 36 * SIZE
MADD c61, b2, a3, c61
LD b2, BO, 25 * SIZE
MADD c71, b3, a3, c71
LD b3, BO, 26 * SIZE
MADD c81, b4, a3, c81
LD b4, BO, 27 * SIZE
LD a3, AO, 2 * SIZE
/* BO advances here; the negative offsets below still address the values
   of K step 3 of this pass. */
addi.d BO, BO, 32 * SIZE
/* K step 3 (a4): columns 1..8. */
MADD c11, b6, a4, c11
LD b6, BO, 8 * SIZE
MADD c21, b2, a4, c21
LD b2, BO, -3 * SIZE
MADD c31, b3, a4, c31
LD b3, BO, -2 * SIZE
MADD c41, b4, a4, c41
LD b4, BO, -1 * SIZE
MADD c51, b7, a4, c51
LD b7, BO, 12 * SIZE
MADD c61, b2, a4, c61
LD b2, BO, 1 * SIZE
MADD c71, b3, a4, c71
LD b3, BO, 2 * SIZE
MADD c81, b4, a4, c81
LD b4, BO, 3 * SIZE
LD a4, AO, 3 * SIZE
blt $r0, L, .L22
.align 3
/* .L25: remainder of K (K & 3), one K step per pass of .L26. */
.L25:
andi L, K, 3
bge $r0, L, .L28
.align 3
.L26:
MADD c11, b1, a1, c11
LD b1, BO, 8 * SIZE
MADD c21, b2, a1, c21
LD b2, BO, 5 * SIZE
MADD c31, b3, a1, c31
LD b3, BO, 6 * SIZE
MADD c41, b4, a1, c41
LD b4, BO, 7 * SIZE
addi.d L, L, -1
/* NOTE(review): source and destination are the same register, so this
   MOV appears to have no effect. */
MOV a2, a2
addi.d AO, AO, 1 * SIZE
addi.d BO, BO, 8 * SIZE
MADD c51, b5, a1, c51
LD b5, BO, 4 * SIZE
MADD c61, b2, a1, c61
LD b2, BO, 1 * SIZE
MADD c71, b3, a1, c71
LD b3, BO, 2 * SIZE
MADD c81, b4, a1, c81
LD a1, AO, 0 * SIZE
LD b4, BO, 3 * SIZE
blt $r0, L, .L26
/* .L28: write-back of the single row: CO<X>[0] += cX1 * ALPHA_R and
   CO<X>[1] += cX1 * ALPHA_I for X = 1..8.  The CO pointers are not
   advanced afterwards. */
.L28:
LD $f22, CO1, 0 * SIZE
LD $f8, CO1, 1 * SIZE
LD $f23, CO2, 0 * SIZE
LD $f9, CO2, 1 * SIZE
LD $f10, CO3, 0 * SIZE
MADD $f22, c11, ALPHA_R, $f22
LD $f11, CO3, 1 * SIZE
MADD $f8, c11, ALPHA_I, $f8
LD $f12, CO4, 0 * SIZE
MADD $f23, c21, ALPHA_R, $f23
LD $f13, CO4, 1 * SIZE
MADD $f9, c21, ALPHA_I, $f9
MADD $f10, c31, ALPHA_R, $f10
ST $f22, CO1, 0 * SIZE
MADD $f11, c31, ALPHA_I, $f11
ST $f8, CO1, 1 * SIZE
MADD $f12, c41, ALPHA_R, $f12
ST $f23, CO2, 0 * SIZE
MADD $f13, c41, ALPHA_I, $f13
ST $f9, CO2, 1 * SIZE
LD $f22, CO5, 0 * SIZE
LD $f8, CO5, 1 * SIZE
LD $f23, CO6, 0 * SIZE
LD $f9, CO6, 1 * SIZE
ST $f10, CO3, 0 * SIZE
ST $f11, CO3, 1 * SIZE
ST $f12, CO4, 0 * SIZE
ST $f13, CO4, 1 * SIZE
LD $f10, CO7, 0 * SIZE
MADD $f22, c51, ALPHA_R, $f22
LD $f11, CO7, 1 * SIZE
MADD $f8, c51, ALPHA_I, $f8
LD $f12, CO8, 0 * SIZE
MADD $f23, c61, ALPHA_R, $f23
LD $f13, CO8, 1 * SIZE
MADD $f9, c61, ALPHA_I, $f9
MADD $f10, c71, ALPHA_R, $f10
ST $f22, CO5, 0 * SIZE
MADD $f11, c71, ALPHA_I, $f11
ST $f8, CO5, 1 * SIZE
MADD $f12, c81, ALPHA_R, $f12
ST $f23, CO6, 0 * SIZE
MADD $f13, c81, ALPHA_I, $f13
ST $f9, CO6, 1 * SIZE
ST $f10, CO7, 0 * SIZE
ST $f11, CO7, 1 * SIZE
ST $f12, CO8, 0 * SIZE
ST $f13, CO8, 1 * SIZE
.align 3
/* .L29: end of the eight-column group.  B continues from where BO
   stopped; loop while column groups remain (J was decremented at .L10). */
.L29:
move B, BO
blt $r0, J, .L10
.align 3
/* .L30: group of four columns, taken when bit 2 of N is set.
   Per K step: 2 values through AO, 4 values through BO; accumulators
   c11..c42 (column X, row Y = cXY). */
.L30:
andi J, N, 4
move AO, A
bge $r0, J, .L50
/* CO1..CO4 = C + 0..3 * LDC; C is advanced past the four columns. */
move CO1, C
MTC c11, $r0
add.d CO2, C, LDC
add.d CO3, CO2, LDC
add.d CO4, CO3, LDC
MOV c21, c11
add.d C, CO4, LDC
MOV c31, c11
/* I = M >> 1: number of two-row tiles. */
srai.d I, M, 1
MOV c41, c11
bge $r0, I, .L40
/* .L31: one tile of two rows by four columns.  c11..c41 are clear on
   entry; the second-row accumulators are cleared during the preloads. */
.L31:
LD a1, AO, 0 * SIZE
LD a3, AO, 4 * SIZE
LD b1, B, 0 * SIZE
MOV c12, c11
LD b2, B, 1 * SIZE
MOV c22, c11
LD b3, B, 2 * SIZE
MOV c32, c11
LD b4, B, 3 * SIZE
MOV c42, c11
LD b5, B, 4 * SIZE
srai.d L, K, 2
LD b6, B, 8 * SIZE
LD b7, B, 12 * SIZE
move BO, B
bge $r0, L, .L35
.align 3
/* .L32: four K steps per pass. */
.L32:
/* K step 0: first row (a1 = AO[0]) then second row (a2 = AO[1]). */
MADD c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD c21, b2, a1, c21
addi.d L, L, -1
MADD c31, b3, a1, c31
MADD c41, b4, a1, c41
LD a1, AO, 2 * SIZE
MADD c12, b1, a2, c12
LD b1, BO, 16 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 7 * SIZE
/* K step 1: a1 = AO[2], a2 = AO[3]; b5 = BO[4], BO[5..7]. */
MADD c11, b5, a1, c11
LD a2, AO, 3 * SIZE
MADD c21, b2, a1, c21
MADD c31, b3, a1, c31
MADD c41, b4, a1, c41
LD a1, AO, 8 * SIZE
MADD c12, b5, a2, c12
LD b5, BO, 20 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 9 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 10 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 11 * SIZE
/* K step 2: a3 = AO[4], a2 = AO[5]; b6 = BO[8], BO[9..11]. */
MADD c11, b6, a3, c11
LD a2, AO, 5 * SIZE
MADD c21, b2, a3, c21
MADD c31, b3, a3, c31
MADD c41, b4, a3, c41
LD a3, AO, 6 * SIZE
MADD c12, b6, a2, c12
LD b6, BO, 24 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 13 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 14 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 15 * SIZE
/* K step 3: a3 = AO[6], a2 = AO[7]; b7 = BO[12], BO[13..15].
   AO and BO advance in the middle; later offsets use the new pointers. */
MADD c11, b7, a3, c11
LD a2, AO, 7 * SIZE
MADD c21, b2, a3, c21
addi.d AO, AO, 8 * SIZE
MADD c31, b3, a3, c31
addi.d BO, BO, 16 * SIZE
MADD c41, b4, a3, c41
LD a3, AO, 4 * SIZE
MADD c12, b7, a2, c12
LD b7, BO, 12 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 1 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 2 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 3 * SIZE
blt $r0, L, .L32
.align 3
/* .L35: remainder of K (K & 3), one K step per pass of .L36. */
.L35:
andi L, K, 3
bge $r0, L, .L38
.align 3
.L36:
MADD c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD c21, b2, a1, c21
addi.d L, L, -1
MADD c31, b3, a1, c31
addi.d AO, AO, 2 * SIZE
MADD c41, b4, a1, c41
LD a1, AO, 0 * SIZE
MADD c12, b1, a2, c12
LD b1, BO, 4 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 5 * SIZE
MADD c32, b3, a2, c32
LD b3, BO, 6 * SIZE
MADD c42, b4, a2, c42
LD b4, BO, 7 * SIZE
addi.d BO, BO, 4 * SIZE
blt $r0, L, .L36
/* .L38: write-back.  For X = 1..4: CO<X>[0] += cX1 * ALPHA_R,
   CO<X>[1] += cX1 * ALPHA_I, CO<X>[2] += cX2 * ALPHA_R,
   CO<X>[3] += cX2 * ALPHA_I. */
.L38:
LD $f22, CO1, 0 * SIZE
LD $f8, CO1, 1 * SIZE
LD $f23, CO1, 2 * SIZE
LD $f9, CO1, 3 * SIZE
LD $f10, CO2, 0 * SIZE
LD $f11, CO2, 1 * SIZE
LD $f12, CO2, 2 * SIZE
LD $f13, CO2, 3 * SIZE
MADD $f22, c11, ALPHA_R, $f22
MADD $f8, c11, ALPHA_I, $f8
MADD $f23, c12, ALPHA_R, $f23
MADD $f9, c12, ALPHA_I, $f9
MADD $f10, c21, ALPHA_R, $f10
ST $f22, CO1, 0 * SIZE
MADD $f11, c21, ALPHA_I, $f11
ST $f8, CO1, 1 * SIZE
MADD $f12, c22, ALPHA_R, $f12
ST $f23, CO1, 2 * SIZE
MADD $f13, c22, ALPHA_I, $f13
ST $f9, CO1, 3 * SIZE
LD $f22, CO3, 0 * SIZE
LD $f8, CO3, 1 * SIZE
LD $f23, CO3, 2 * SIZE
LD $f9, CO3, 3 * SIZE
ST $f10, CO2, 0 * SIZE
MADD $f22, c31, ALPHA_R, $f22
ST $f11, CO2, 1 * SIZE
MADD $f8, c31, ALPHA_I, $f8
ST $f12, CO2, 2 * SIZE
MADD $f23, c32, ALPHA_R, $f23
ST $f13, CO2, 3 * SIZE
MADD $f9, c32, ALPHA_I, $f9
LD $f10, CO4, 0 * SIZE
LD $f11, CO4, 1 * SIZE
LD $f12, CO4, 2 * SIZE
LD $f13, CO4, 3 * SIZE
/* All four CO pointers step to the next row pair; the CO3/CO4 stores
   below therefore use negative offsets. */
MADD $f10, c41, ALPHA_R, $f10
addi.d CO1,CO1, 4 * SIZE
MADD $f11, c41, ALPHA_I, $f11
addi.d CO2,CO2, 4 * SIZE
MADD $f12, c42, ALPHA_R, $f12
addi.d CO3,CO3, 4 * SIZE
MADD $f13, c42, ALPHA_I, $f13
addi.d CO4,CO4, 4 * SIZE
ST $f22, CO3, -4 * SIZE
addi.d I, I, -1
ST $f8, CO3, -3 * SIZE
ST $f23, CO3, -2 * SIZE
ST $f9, CO3, -1 * SIZE
ST $f10, CO4, -4 * SIZE
/* Clear c11..c41 for the next tile (or for the single-row tail). */
MTC c11, $r0
ST $f11, CO4, -3 * SIZE
MOV c21, c11
ST $f12, CO4, -2 * SIZE
MOV c31, c11
ST $f13, CO4, -1 * SIZE
MOV c41, c11
blt $r0, I, .L31
.align 3
/* .L40: single remaining row (M & 1) of the four-column group.
   c11..c41 are expected to be clear on entry (cleared at .L30 or at the
   end of the .L38 write-back).  Per K step: 1 value through AO, 4 values
   through BO. */
.L40:
andi I, M, 1
/* NOTE(review): c61 (and c71/c81 below) are cleared here but are not read
   again in this section or in any later section of this file. */
MOV c61, c11
bge $r0, I, .L49
LD a1, AO, 0 * SIZE
MOV c71, c11
LD a2, AO, 1 * SIZE
MOV c81, c11
LD b1, B, 0 * SIZE
LD b2, B, 1 * SIZE
LD b3, B, 2 * SIZE
LD b4, B, 3 * SIZE
LD b5, B, 4 * SIZE
LD b6, B, 8 * SIZE
LD b7, B, 12 * SIZE
srai.d L, K, 2
move BO, B
bge $r0, L, .L45
.align 3
/* .L42: four K steps per pass; a1 carries step 0, a2 is reloaded for
   steps 1..3. */
.L42:
/* K step 0: a1 = AO[0], b1..b4 = BO[0..3]. */
MADD c11, b1, a1, c11
LD b1, BO, 16 * SIZE
MADD c21, b2, a1, c21
LD b2, BO, 5 * SIZE
MADD c31, b3, a1, c31
LD b3, BO, 6 * SIZE
MADD c41, b4, a1, c41
LD b4, BO, 7 * SIZE
LD a1, AO, 4 * SIZE
addi.d L, L, -1
/* K step 1: a2 = AO[1], b5 = BO[4], BO[5..7]. */
MADD c11, b5, a2, c11
LD b5, BO, 20 * SIZE
MADD c21, b2, a2, c21
LD b2, BO, 9 * SIZE
MADD c31, b3, a2, c31
LD b3, BO, 10 * SIZE
MADD c41, b4, a2, c41
LD b4, BO, 11 * SIZE
LD a2, AO, 2 * SIZE
/* AO advances here; the AO[-1] load below is the value of K step 3. */
addi.d AO, AO, 4 * SIZE
/* K step 2: a2 = old AO[2], b6 = BO[8], BO[9..11]. */
MADD c11, b6, a2, c11
LD b6, BO, 24 * SIZE
MADD c21, b2, a2, c21
LD b2, BO, 13 * SIZE
MADD c31, b3, a2, c31
LD b3, BO, 14 * SIZE
MADD c41, b4, a2, c41
LD b4, BO, 15 * SIZE
LD a2, AO, -1 * SIZE
addi.d BO, BO, 16 * SIZE
/* K step 3: a2 = old AO[3], b7 = old BO[12], old BO[13..15]. */
MADD c11, b7, a2, c11
LD b7, BO, 12 * SIZE
MADD c21, b2, a2, c21
LD b2, BO, 1 * SIZE
MADD c31, b3, a2, c31
LD b3, BO, 2 * SIZE
MADD c41, b4, a2, c41
LD b4, BO, 3 * SIZE
LD a2, AO, 1 * SIZE
blt $r0, L, .L42
.align 3
/* .L45: remainder of K (K & 3), one K step per pass of .L46. */
.L45:
andi L, K, 3
bge $r0, L, .L48
.align 3
.L46:
MADD c11, b1, a1, c11
LD b1, BO, 4 * SIZE
MADD c21, b2, a1, c21
LD b2, BO, 5 * SIZE
MADD c31, b3, a1, c31
LD b3, BO, 6 * SIZE
MADD c41, b4, a1, c41
LD a1, AO, 1 * SIZE
LD b4, BO, 7 * SIZE
addi.d L, L, -1
addi.d AO, AO, 1 * SIZE
/* NOTE(review): source and destination are the same register, so this
   MOV appears to have no effect. */
MOV a2, a2
addi.d BO, BO, 4 * SIZE
blt $r0, L, .L46
/* .L48: write-back of the single row: CO<X>[0] += cX1 * ALPHA_R and
   CO<X>[1] += cX1 * ALPHA_I for X = 1..4. */
.L48:
LD $f22, CO1, 0 * SIZE
LD $f8, CO1, 1 * SIZE
LD $f23, CO2, 0 * SIZE
LD $f9, CO2, 1 * SIZE
LD $f10, CO3, 0 * SIZE
MADD $f22, c11, ALPHA_R, $f22
LD $f11, CO3, 1 * SIZE
MADD $f8, c11, ALPHA_I, $f8
LD $f12, CO4, 0 * SIZE
MADD $f23, c21, ALPHA_R, $f23
LD $f13, CO4, 1 * SIZE
MADD $f9, c21, ALPHA_I, $f9
MADD $f10, c31, ALPHA_R, $f10
ST $f22, CO1, 0 * SIZE
MADD $f11, c31, ALPHA_I, $f11
ST $f8, CO1, 1 * SIZE
MADD $f12, c41, ALPHA_R, $f12
ST $f23, CO2, 0 * SIZE
MADD $f13, c41, ALPHA_I, $f13
ST $f9, CO2, 1 * SIZE
ST $f10, CO3, 0 * SIZE
ST $f11, CO3, 1 * SIZE
ST $f12, CO4, 0 * SIZE
ST $f13, CO4, 1 * SIZE
.align 3
/* .L49: end of the four-column group; B continues from where BO stopped. */
.L49:
move B, BO
.align 3
/* .L50: group of two columns, taken when bit 1 of N is set.
   Per K step: 2 values through AO, 2 values through BO; accumulators
   c11, c12 (column 1) and c21, c22 (column 2). */
.L50:
andi J, N, 2
move AO, A
bge $r0, J, .L70
/* CO1 = C, CO2 = C + LDC; C is advanced past the two columns. */
move CO1, C
add.d CO2, C, LDC
/* I = M >> 1: number of two-row tiles. */
srai.d I, M, 1
add.d C, CO2, LDC
bge $r0, I, .L60
/* .L51: one tile of two rows by two columns; all four accumulators are
   cleared here. */
.L51:
LD a1, AO, 0 * SIZE
MTC c11, $r0
LD a2, AO, 1 * SIZE
MOV c21, c11
/* a5 shares its register with b8. */
LD a5, AO, 4 * SIZE
LD b1, B, 0 * SIZE
MOV c12, c11
LD b2, B, 1 * SIZE
MOV c22, c11
LD b3, B, 2 * SIZE
LD b5, B, 4 * SIZE
srai.d L, K, 2
/* NOTE(review): b6 and b7 are loaded here but are not read anywhere in
   this two-column section. */
LD b6, B, 8 * SIZE
LD b7, B, 12 * SIZE
move BO, B
bge $r0, L, .L55
.align 3
/* .L52: four K steps per pass. */
.L52:
/* K step 0: a1/a2 = AO[0..1], b1/b2 = BO[0..1]. */
MADD c11, b1, a1, c11
LD a3, AO, 2 * SIZE
MADD c21, b2, a1, c21
LD b4, BO, 3 * SIZE
MADD c12, b1, a2, c12
LD a4, AO, 3 * SIZE
MADD c22, b2, a2, c22
LD b1, BO, 8 * SIZE
/* K step 1: a3/a4 = AO[2..3], b3/b4 = BO[2..3]. */
MADD c11, b3, a3, c11
LD a1, AO, 8 * SIZE
MADD c21, b4, a3, c21
LD b2, BO, 5 * SIZE
MADD c12, b3, a4, c12
LD a2, AO, 5 * SIZE
MADD c22, b4, a4, c22
LD b3, BO, 6 * SIZE
/* K step 2: a5/a2 = AO[4..5], b5/b2 = BO[4..5]. */
MADD c11, b5, a5, c11
LD a3, AO, 6 * SIZE
MADD c21, b2, a5, c21
LD b4, BO, 7 * SIZE
MADD c12, b5, a2, c12
LD a4, AO, 7 * SIZE
MADD c22, b2, a2, c22
LD b5, BO, 12 * SIZE
/* K step 3: a3/a4 = AO[6..7], b3/b4 = BO[6..7]. */
MADD c11, b3, a3, c11
LD a5, AO, 12 * SIZE
MADD c21, b4, a3, c21
LD b2, BO, 9 * SIZE
MADD c12, b3, a4, c12
LD a2, AO, 9 * SIZE
MADD c22, b4, a4, c22
LD b3, BO, 10 * SIZE
addi.d AO, AO, 8 * SIZE
addi.d L, L, -1
addi.d BO, BO, 8 * SIZE
blt $r0, L, .L52
.align 3
/* .L55: remainder of K (K & 3), one K step per pass of .L56. */
.L55:
andi L, K, 3
bge $r0, L, .L58
.align 3
.L56:
MADD c11, b1, a1, c11
LD a2, AO, 1 * SIZE
MADD c21, b2, a1, c21
LD a1, AO, 2 * SIZE
MADD c12, b1, a2, c12
LD b1, BO, 2 * SIZE
MADD c22, b2, a2, c22
LD b2, BO, 3 * SIZE
addi.d L, L, -1
addi.d AO, AO, 2 * SIZE
addi.d BO, BO, 2 * SIZE
blt $r0, L, .L56
/* .L58: write-back.  For X = 1..2: CO<X>[0] += cX1 * ALPHA_R,
   CO<X>[1] += cX1 * ALPHA_I, CO<X>[2] += cX2 * ALPHA_R,
   CO<X>[3] += cX2 * ALPHA_I.  CO1/CO2 are advanced before the stores,
   hence the negative store offsets. */
.L58:
LD $f22, CO1, 0 * SIZE
LD $f8, CO1, 1 * SIZE
LD $f23, CO1, 2 * SIZE
LD $f9, CO1, 3 * SIZE
LD $f10, CO2, 0 * SIZE
LD $f11, CO2, 1 * SIZE
LD $f12, CO2, 2 * SIZE
LD $f13, CO2, 3 * SIZE
MADD $f22, c11, ALPHA_R, $f22
addi.d I, I, -1
MADD $f8, c11, ALPHA_I, $f8
addi.d CO1,CO1, 4 * SIZE
MADD $f23, c12, ALPHA_R, $f23
addi.d CO2,CO2, 4 * SIZE
MADD $f9, c12, ALPHA_I, $f9
MADD $f10, c21, ALPHA_R, $f10
MADD $f11, c21, ALPHA_I, $f11
MADD $f12, c22, ALPHA_R, $f12
MADD $f13, c22, ALPHA_I, $f13
ST $f22, CO1, -4 * SIZE
ST $f8, CO1, -3 * SIZE
ST $f23, CO1, -2 * SIZE
ST $f9, CO1, -1 * SIZE
ST $f10, CO2, -4 * SIZE
ST $f11, CO2, -3 * SIZE
ST $f12, CO2, -2 * SIZE
ST $f13, CO2, -1 * SIZE
blt $r0, I, .L51
.align 3
/* .L60: single remaining row (M & 1) of the two-column group.
   Per K step: 1 value through AO, 2 values through BO.
   Four accumulators are used so that consecutive K steps do not depend on
   each other: even steps feed c11/c21, odd steps feed c31/c41; the pairs
   are summed at .L68.
   Only the operands the loops actually consume are preloaded (a1..a4 and
   b1..b4); nothing beyond B[3] is touched before the loops run. */
.L60:
andi I, M, 1
/* M is even: nothing left for this column group. */
bge $r0, I, .L69
/* L = K >> 2: iterations of the four-way unrolled loop. */
srai.d L, K, 2
LD a1, AO, 0 * SIZE
MTC c11, $r0
LD a2, AO, 1 * SIZE
MOV c21, c11
LD a3, AO, 2 * SIZE
MOV c31, c11
LD a4, AO, 3 * SIZE
MOV c41, c11
LD b1, B, 0 * SIZE
LD b2, B, 1 * SIZE
LD b3, B, 2 * SIZE
LD b4, B, 3 * SIZE
move BO, B
bge $r0, L, .L65
.align 3
/* .L62: four K steps per pass. */
.L62:
/* K step 0 (a1) into c11/c21, K step 1 (a2) into c31/c41. */
MADD c11, b1, a1, c11
LD b1, BO, 4 * SIZE
MADD c21, b2, a1, c21
LD b2, BO, 5 * SIZE
MADD c31, b3, a2, c31
LD b3, BO, 6 * SIZE
MADD c41, b4, a2, c41
LD b4, BO, 7 * SIZE
LD a1, AO, 4 * SIZE
LD a2, AO, 5 * SIZE
/* K step 2 (a3) into c11/c21, K step 3 (a4) into c31/c41. */
MADD c11, b1, a3, c11
LD b1, BO, 8 * SIZE
MADD c21, b2, a3, c21
LD b2, BO, 9 * SIZE
MADD c31, b3, a4, c31
LD b3, BO, 10 * SIZE
MADD c41, b4, a4, c41
LD b4, BO, 11 * SIZE
LD a3, AO, 6 * SIZE
LD a4, AO, 7 * SIZE
addi.d L, L, -1
addi.d AO, AO, 4 * SIZE
addi.d BO, BO, 8 * SIZE
blt $r0, L, .L62
.align 3
/* .L65: remainder of K (K & 3), one K step per pass of .L66. */
.L65:
andi L, K, 3
bge $r0, L, .L68
.align 3
.L66:
MADD c11, b1, a1, c11
LD b1, BO, 2 * SIZE
MADD c21, b2, a1, c21
LD b2, BO, 3 * SIZE
LD a1, AO, 1 * SIZE
addi.d L, L, -1
addi.d AO, AO, 1 * SIZE
addi.d BO, BO, 2 * SIZE
blt $r0, L, .L66
/* .L68: fold the odd-step accumulators into the even-step ones, then
   CO<X>[0] += cX1 * ALPHA_R and CO<X>[1] += cX1 * ALPHA_I for X = 1..2. */
.L68:
LD $f22, CO1, 0 * SIZE
LD $f8, CO1, 1 * SIZE
LD $f23, CO2, 0 * SIZE
LD $f9, CO2, 1 * SIZE
ADD c11, c11, c31
ADD c21, c21, c41
MADD $f22, c11, ALPHA_R, $f22
MADD $f8, c11, ALPHA_I, $f8
MADD $f23, c21, ALPHA_R, $f23
MADD $f9, c21, ALPHA_I, $f9
ST $f22, CO1, 0 * SIZE
ST $f8, CO1, 1 * SIZE
ST $f23, CO2, 0 * SIZE
ST $f9, CO2, 1 * SIZE
.align 3
/* .L69: end of the two-column group; B continues from where BO stopped. */
.L69:
move B, BO
.align 3
/* .L70: final single column, taken when bit 0 of N is set.
   Per K step: 2 values through AO, 1 value through BO; c11 accumulates
   the first row, c12 the second. */
.L70:
andi J, N, 1
move AO, A
bge $r0, J, .L999
move CO1, C
/* I = M >> 1: number of two-row tiles. */
srai.d I, M, 1
add.d C, CO1, LDC
bge $r0, I, .L80
/* .L71: one tile of two rows by one column.
   No operands are preloaded: .L72 and .L76 load a1, a2 and b1 themselves
   immediately before every use, so this set-up touches neither A nor B.
   c21/c22 are cleared because .L78 adds them to c11/c12. */
.L71:
MTC c11, $r0
MOV c21, c11
MOV c12, c11
MOV c22, c11
/* L = K >> 2: iterations of the four-way unrolled loop. */
srai.d L, K, 2
move BO, B
bge $r0, L, .L75
.align 3
/* .L72: four K steps per pass, each step loading its own operands. */
.L72:
LD a1, AO, 0 * SIZE
LD a2, AO, 1 * SIZE
LD b1, BO, 0 * SIZE
MADD c11, b1, a1, c11
MADD c12, b1, a2, c12
LD a1, AO, 2 * SIZE
LD a2, AO, 3 * SIZE
LD b1, BO, 1 * SIZE
MADD c11, b1, a1, c11
MADD c12, b1, a2, c12
LD a1, AO, 4 * SIZE
LD a2, AO, 5 * SIZE
LD b1, BO, 2 * SIZE
MADD c11, b1, a1, c11
MADD c12, b1, a2, c12
LD a1, AO, 6 * SIZE
LD a2, AO, 7 * SIZE
LD b1, BO, 3 * SIZE
MADD c11, b1, a1, c11
MADD c12, b1, a2, c12
addi.d L, L, -1
addi.d AO, AO, 8 * SIZE
addi.d BO, BO, 4 * SIZE
blt $r0, L, .L72
.align 3
/* .L75: remainder of K (K & 3), one K step per pass of .L76. */
.L75:
andi L, K, 3
bge $r0, L, .L78
.align 3
.L76:
LD a1, AO, 0 * SIZE
LD a2, AO, 1 * SIZE
LD b1, BO, 0 * SIZE
MADD c11, b1, a1, c11
MADD c12, b1, a2, c12
addi.d L, L, -1
addi.d AO, AO, 2 * SIZE
addi.d BO, BO, 1 * SIZE
blt $r0, L, .L76
/* .L78: write-back.  c21/c22 (still zero) are added to c11/c12, then
   CO1[0] += c11 * ALPHA_R, CO1[1] += c11 * ALPHA_I,
   CO1[2] += c12 * ALPHA_R, CO1[3] += c12 * ALPHA_I.
   CO1 is advanced before the stores, hence the negative store offsets. */
.L78:
LD $f22, CO1, 0 * SIZE
LD $f8, CO1, 1 * SIZE
LD $f23, CO1, 2 * SIZE
LD $f9, CO1, 3 * SIZE
ADD c11, c11, c21
addi.d I, I, -1
ADD c12, c12, c22
addi.d CO1,CO1, 4 * SIZE
MADD $f22, c11, ALPHA_R, $f22
MADD $f8, c11, ALPHA_I, $f8
MADD $f23, c12, ALPHA_R, $f23
MADD $f9, c12, ALPHA_I, $f9
ST $f22, CO1, -4 * SIZE
ST $f8, CO1, -3 * SIZE
ST $f23, CO1, -2 * SIZE
ST $f9, CO1, -1 * SIZE
blt $r0, I, .L71
.align 3
/* .L80: single remaining row (M & 1) of the final single column.
   Per K step: 1 value through AO, 1 value through BO.  Even K steps of the
   unrolled loop feed c11, odd steps feed c21; they are summed at .L88.
   No operands are preloaded: .L82 and .L86 load a1 and b1 themselves
   immediately before every use, so this set-up touches neither A nor B. */
.L80:
andi I, M, 1
/* M is even: nothing left for this column. */
bge $r0, I, .L89
MTC c11, $r0
MOV c21, c11
/* L = K >> 2: iterations of the four-way unrolled loop. */
srai.d L, K, 2
move BO, B
bge $r0, L, .L85
.align 3
/* .L82: four K steps per pass, each step loading its own operands. */
.L82:
LD a1, AO, 0 * SIZE
LD b1, BO, 0 * SIZE
MADD c11, b1, a1, c11
LD a1, AO, 1 * SIZE
LD b1, BO, 1 * SIZE
MADD c21, b1, a1, c21
LD a1, AO, 2 * SIZE
LD b1, BO, 2 * SIZE
MADD c11, b1, a1, c11
LD a1, AO, 3 * SIZE
LD b1, BO, 3 * SIZE
MADD c21, b1, a1, c21
addi.d L, L, -1
addi.d AO, AO, 4 * SIZE
addi.d BO, BO, 4 * SIZE
blt $r0, L, .L82
.align 3
/* .L85: remainder of K (K & 3), one K step per pass of .L86. */
.L85:
andi L, K, 3
bge $r0, L, .L88
.align 3
.L86:
LD a1, AO, 0 * SIZE
LD b1, BO, 0 * SIZE
MADD c11, b1, a1, c11
addi.d L, L, -1
addi.d AO, AO, 1 * SIZE
addi.d BO, BO, 1 * SIZE
blt $r0, L, .L86
/* .L88: fold c21 into c11, then CO1[0] += c11 * ALPHA_R and
   CO1[1] += c11 * ALPHA_I. */
.L88:
LD $f22, CO1, 0 * SIZE
LD $f8, CO1, 1 * SIZE
ADD c11, c11, c21
MADD $f22, c11, ALPHA_R, $f22
MADD $f8, c11, ALPHA_I, $f8
ST $f22, CO1, 0 * SIZE
ST $f8, CO1, 1 * SIZE
.align 3
/* .L89: end of the final column; B continues from where BO stopped. */
.L89:
move B, BO
.align 3
/* .L999: common exit.  Restores every register spilled by the prologue,
   releases the 128-byte frame and returns through $r1. */
.L999:
/* NOTE(review): the integer and floating-point result registers are
   loaded from I ($r17) and $f22, which hold leftover loop state at this
   point; callers presumably ignore the result -- confirm. */
move $r4, $r17
fmov.d $f0, $f22
/* Floating-point registers backing a3/a4 and c71..c82. */
fld.d $f29, $sp, 88
fld.d $f28, $sp, 80
fld.d $f27, $sp, 72
fld.d $f26, $sp, 64
fld.d $f25, $sp, 56
fld.d $f24, $sp, 48
/* Integer registers backing CO3..CO8. */
LDARG $r28, $sp, 40
LDARG $r27, $sp, 32
LDARG $r26, $sp, 24
LDARG $r25, $sp, 16
LDARG $r24, $sp, 8
LDARG $r23, $sp, 0
addi.d $sp, $sp, 128
jirl $r0, $r1, 0x0
EPILOGUE