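/*
 * OpenBLAS/kernel/mips64/gemm_kernel_loongson3a.S
 *
 * Listing metadata from the code viewer: 2411 lines, 39 KiB, highlighted as
 * "ArmAsm" (the content below is MIPS64 / Loongson-3A assembly, not ARM).
 */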

/*
 * gsLQC1 / gsSQC1 emit one raw 32-bit instruction word with .word instead of
 * using a mnemonic.  The two encodings differ only in the major opcode
 * (0x32 versus 0x3A).
 *   base   : number of the integer register holding the address (R8, R9, ...)
 *   fq, ft : numbers of the two FP registers that form the pair (F0 ... F31)
 *   offset : immediate placed at bit 6; the kernel loops step it 0 to 7 and
 *            then bump the pointer by 16*SIZE, so one unit spans two elements
 * Call sites annotate gsLQC1(R8,F1,F0,0) with "a0,a1", so ft appears to
 * receive the first element and fq the second.
 * NOTE(review): presumably these are the Loongson paired (128-bit) FP load
 * and store - confirm the encoding against the Loongson-3A manual.
 */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
/* FETCH is always used with $0 as the destination, i.e. as a load whose value is discarded (software prefetch). */
#define FETCH ld
/* REALNAME and ASSEMBLER are set before common.h is included; presumably common.h consumes them - TODO confirm. */
#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"
/* Dimension registers.  They are copied to MCO/NCO/KCO at entry and then reused as loop counters. */
#define M $4
#define N $5
#define K $6
/* Matrix registers.  A ($8) and B ($9) are also the running pointers passed to gsLQC1 as R8/R9. */
#define A $8
#define B $9
#define C $10
#define LDC $11
/* AO keeps the entry value of A; BO is the start of the B panel currently being processed. */
#define AO $12
#define BO $13
/* Pointers to up to four consecutive C columns (each LDC bytes apart) of the tile being written. */
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17
/* Entry values of K, M and N. */
#define KCO $18
#define MCO $19
#define NCO $20
/* Prefetch state: PREA/PREB are the addresses touched by FETCH; SPANA/SPANB are the byte distances added to AO/BO to seed them. */
#define SPANB $21
#define PREB $23
#define PREA $24
#define SPANA $25
/* ALPHA shares $f15 with b7, so it is stored at 152($sp) on entry and reloaded before every write-back. */
#define ALPHA $f15
#if defined(TRMMKERNEL)
/* TRMM only: OFFSET is loaded from 160($sp), KK is the running offset derived from it, TEMP is scratch. */
#define OFFSET $2
#define KK $3
#define TEMP $7
#endif
/*
 * Bare integer register numbers for the base field of gsLQC1/gsSQC1.
 * R8 and R9 match A ($8) and B ($9); R14 to R17 match CO1 to CO4 ($14 to $17).
 */
#define R8 8
#define R9 9
#define R14 14
#define R15 15
#define R16 16
#define R17 17
/*
 * Accumulators of the 4x4 result tile, $f16 to $f31.  In tRC, R is the index
 * of the A element (a0..a3 -> 1..4, consecutive addresses within one CO
 * column) and C is the index of the B element (b0..b3 -> 1..4, i.e. CO1..CO4).
 */
#define t11 $f30
#define t21 $f31
#define t31 $f28
#define t41 $f29
#define t12 $f26
#define t22 $f27
#define t32 $f24
#define t42 $f25
#define t13 $f22
#define t23 $f23
#define t33 $f20
#define t43 $f21
#define t14 $f18
#define t24 $f19
#define t34 $f16
#define t44 $f17
/*
 * Staging registers for the existing C values during write-back.  They reuse
 * the a and b operand registers ($f0 to $f14), which are no longer needed once
 * the K loop has finished.  $f15 is ALPHA, so c44 has no register of its own
 * and shares $f0 with c11; the write-back loads c44 only after c11 has been
 * consumed.
 */
#define c11 $f0
#define c21 $f1
#define c31 $f2
#define c41 $f3
#define c12 $f4
#define c22 $f5
#define c32 $f6
#define c42 $f7
#define c13 $f8
#define c23 $f9
#define c33 $f10
#define c43 $f11
#define c14 $f12
#define c24 $f13
#define c34 $f14
#define c44 $f0
/* Operands loaded from A: a0..a3 and a4..a7 are the two alternating register sets of the unrolled loops. */
#define a0 $f0
#define a1 $f1
#define a2 $f2
#define a3 $f3
#define a4 $f4
#define a5 $f5
#define a6 $f6
#define a7 $f7
/* Operands loaded from B, same two-set scheme.  b7 is $f15, the ALPHA register, so loading b7 destroys ALPHA. */
#define b0 $f8
#define b1 $f9
#define b2 $f10
#define b3 $f11
#define b4 $f12
#define b5 $f13
#define b6 $f14
#define b7 $f15
/* Bare FP register numbers (Fn = n) for the fq/ft fields of gsLQC1/gsSQC1, which need integers rather than $fN names. */
#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
#define F18 18
#define F17 17
#define F16 16
#define F15 15
#define F14 14
#define F13 13
#define F12 12
#define F11 11
#define F10 10
#define F9 9
#define F8 8
#define F7 7
#define F6 6
#define F5 5
#define F4 4
#define F3 3
#define F2 2
#define F1 1
#define F0 0
/*
 * Kernel entry: reserve a 160-byte frame and save registers.
 * Frame layout (byte offsets from $sp):
 *     0.. 48  $16 to $22
 *    56.. 88  $f24 to $f28
 *    96..112  $23 to $25
 *   120..144  $f20 to $f23
 *   152       ALPHA (spilled because b7 reuses its register)
 * NOTE(review): t11, t21 and t41 live in $f30, $f31 and $f29, which are
 * written by the kernel but have no save slot here - confirm this is
 * acceptable under the ABI in use.
 */
PROLOGUE
daddiu $sp, $sp, -160
sd $16, 0($sp)
sd $17, 8($sp)
sd $18, 16($sp)
sd $19, 24($sp)
sd $20, 32($sp)
sd $21, 40($sp)
sd $22, 48($sp)
ST $f24, 56($sp)
ST $f25, 64($sp)
ST $f26, 72($sp)
ST $f27, 80($sp)
ST $f28, 88($sp)
sd $23, 96($sp)
sd $24, 104($sp)
sd $25, 112($sp)
ST $f20,120($sp)
ST $f21,128($sp)
ST $f22,136($sp)
ST $f23,144($sp)
.align 5 # BACKUP
/* One-time setup: keep copies of the arguments, then derive panel counts and byte strides. */
.L0_N4: # Loop N
ST ALPHA,152($sp) # Backup ALPHA
move MCO,M # Backup M
move NCO,N # Backup N
move KCO,K # Backup K
move AO,A # Backup A_addr
dsra N,NCO,2 # N = NCO>>2, number of 4-column panels
dsll LDC,LDC,BASE_SHIFT # LDC*8Byte: scale LDC by the element size
dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*NR(4)*8Byte=KC*2^5
move BO,B # Backup B_addr
#if defined(TRMMKERNEL)
LDARG OFFSET,160($sp) # first slot above this 160-byte frame
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK,OFFSET # right
#endif
/*
 * The instruction after the branch sits in its delay slot: SPANA is needed on
 * the taken path as well (the N=2 code adds it to AO to form PREA).
 * The shift is 1+BASE_SHIFT, i.e. SPANA = KCO*2 elements in bytes.
 * NOTE(review): the original comment claimed KCO*4(mr); SPANA only feeds the
 * prefetch address, so results do not depend on it.
 */
beq N,$0,.L0_N2 # N=0,NCO<4
dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO<<(1+BASE_SHIFT)
/*
 * Head of the loop over 4-column panels of B.  Sets the four C column
 * pointers, rewinds A to the start of the packed A block and seeds the
 * prefetch pointers.
 */
.L0_N4_Lb:
move CO1,C # Set C
dsra M,MCO,2 # M = MCO>>2, number of 4-row tiles
move A,AO # Reset A
daddu CO2,CO1,LDC
daddu CO3,CO2,LDC
daddu PREB,BO,SPANB # PreB point next panelB
daddu CO4,CO3,LDC
daddu PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
move KK,OFFSET # left
#endif
/* Delay slot: C is advanced past the four columns of this panel whether or not the branch is taken. */
beqz M,.L14_M2
daddu C,CO4,LDC
/*
 * Start of one 4x4 tile: position A and B, clear the 16 accumulators, preload
 * the first operands (a0..a3, b0..b3) and compute the trip count of the
 * 4-times unrolled K loop.  The TRMM and GEMM variants differ only in how the
 * pointers and the effective K are derived.
 */
.L10:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO
#else
/* Skip the first KK steps of both operands: KK*4 elements in A (mr=4) and in B (nr=4). */
dsll K,KK,2 + BASE_SHIFT # KK no data part
dsll TEMP,KK,2 + BASE_SHIFT
daddu A,A,K # move A B to data part
daddu B,BO,TEMP
#endif
/* t11 is zeroed from $0 and copied into the other 15 accumulators, interleaved with the operand preloads. */
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t31,t11
MOV t41,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t12,t11
MOV t22,t11
gsLQC1(R8,F3,F2,1) #a2,a3
MOV t32,t11
MOV t42,t11
gsLQC1(R9,F11,F10,1) #b2,b3
MOV t13,t11
MOV t23,t11
MOV t33,t11
MOV t43,t11
MOV t14,t11
MOV t24,t11
MOV t34,t11
MOV t44,t11
/* TEMP = number of K steps this tile performs; both non-subtracting cases add 4 because mr = nr = 4 here. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK # temp = kco - kk
#elif defined(LEFT)
daddiu TEMP, KK, 4
#else
daddiu TEMP, KK, 4
#endif
dsra K,TEMP,2 # K = TEMP>>2, trips of the 4x unrolled loop
beqz K,.L15
nop
#else
MTC $0,t11 # gemm part
move B,BO
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t31,t11
MOV t41,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t12,t11
MOV t22,t11
gsLQC1(R8,F3,F2,1) #a2,a3
MOV t32,t11
MOV t42,t11
gsLQC1(R9,F11,F10,1) #b2,b3
dsra K,KCO,2 # K = KCO>>2, trips of the 4x unrolled loop
MOV t13,t11
MOV t23,t11
MOV t33,t11
MOV t43,t11
MOV t14,t11
MOV t24,t11
MOV t34,t11
MOV t44,t11
beqz K,.L15
nop
#endif
.align 5
/*
 * Main K loop of the 4x4 tile, unrolled four times (.L11 to .L14).  Each
 * stage issues 16 MADDs on one operand set (a0..a3,b0..b3 or a4..a7,b4..b7)
 * while gsLQC1 fills the other set for the next stage, and touches PREA/PREB
 * with FETCH.  MADD d,c,x,y is a macro from common.h; presumably d = c + x*y.
 * Loading b7 (F15) overwrites ALPHA, which is why ALPHA is reloaded from the
 * stack before write-back.
 */
.L11: # N=M=K=4
gsLQC1(R8,F5,F4,2) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,2) # R9=B
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
gsLQC1(R9,F15,F14,3)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
FETCH $0,(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
FETCH $0,(PREA)
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
#load2 comp1
/* Stage 2: consume a4..a7,b4..b7; reload a0..a3,b0..b3 from offsets 4 and 5. */
.L12:
gsLQC1(R8,F1,F0,4)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,4)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,5)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
gsLQC1(R9,F11,F10,5)
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
FETCH $0,4*SIZE(PREB)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
FETCH $0,4*SIZE(PREA)
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
/* Stage 3: last loads relative to the old A and B (offsets 6 and 7); both pointers are then bumped by 16*SIZE. */
.L13:
gsLQC1(R8,F5,F4,6)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,6)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,7)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
gsLQC1(R9,F15,F14,7)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
FETCH $0,8*SIZE(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
daddu B,B,16*SIZE
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
FETCH $0,8*SIZE(PREA)
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
/* Stage 4: offsets 0 and 1 now address the next group of four K steps, preloading a0..a3,b0..b3 for the next trip. */
.L14:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
daddiu K,K,-1
gsLQC1(R9,F11,F10,1)
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
FETCH $0,12*SIZE(PREB)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
FETCH $0,12*SIZE(PREA)
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
daddu PREB,PREB,16*SIZE
MADD t34,t34,a6,b7
daddu PREA,PREA,16*SIZE
/* The 16th MADD of this stage sits in the branch delay slot. */
bnez K,.L11
MADD t44,t44,a7,b7
/*
 * K remainder of the 4x4 tile.  Bit 1 of the K count (KCO for GEMM, TEMP for
 * TRMM) selects two extra steps (.L16 and .L17); bit 0 selects one final step
 * (after .L18).  On entry a0..a3 and b0..b3 already hold the next operands.
 */
.L15: # N=4 M=4 K=2
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP, 2
#endif
nop
beqz K,.L18
nop
/* First of the two steps: same pattern as .L11, but the pointers advance by 8*SIZE only. */
.L16:
gsLQC1(R8,F5,F4,2) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,2) # R9=B
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
gsLQC1(R9,F15,F14,3)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
FETCH $0,0(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
daddu B,B,8*SIZE
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
FETCH $0,0(PREA)
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
/* Second step: consume a4..a7,b4..b7 and preload a0..a3,b0..b3 for a possible last single step. */
.L17:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
gsLQC1(R9,F11,F10,1)
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
FETCH $0,4*SIZE(PREB)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
FETCH $0,4*SIZE(PREA)
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
daddu PREB,PREB,8*SIZE
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
daddu PREA,PREA,8*SIZE
.L18: # N=4, M=4, K=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP, 1
#endif
NOP
/*
 * The ALPHA reload is in the branch delay slot, so it runs on both paths.
 * It must come after the last load of b7, which shares the register.
 * The single step below only reads a0..a3 and b0..b3, so ALPHA stays intact.
 */
beqz K,.L19 #
LD ALPHA,152($sp) # Get ALPHA
FETCH $0,0(PREB)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
daddu B,B,4*SIZE
FETCH $0,0(PREA)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
daddu PREB,PREB,4*SIZE
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
daddu PREA,PREA,4*SIZE
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
/*
 * Write-back of the 4x4 tile.
 *   GEMM: each accumulator becomes c + t*ALPHA (MADD t,c,t,ALPHA) using the
 *         value loaded from C, and is stored back to the same location.
 *   TRMM: each accumulator is multiplied by ALPHA and stored; C is not read.
 * Afterwards the column pointers move on by 4 elements and the M loop repeats.
 */
.L19: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # gemm write part Fetch 16 C
LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
LD c12,0(CO2)
MADD t11,c11,t11,ALPHA
LD c22,1*SIZE(CO2)
MADD t21,c21,t21,ALPHA
LD c32,2*SIZE(CO2)
MADD t31,c31,t31,ALPHA
LD c42,3*SIZE(CO2)
MADD t41,c41,t41,ALPHA
LD c13,0(CO3)
MADD t12,c12,t12,ALPHA
LD c23,1*SIZE(CO3)
MADD t22,c22,t22,ALPHA
LD c33,2*SIZE(CO3)
MADD t32,c32,t32,ALPHA
LD c43,3*SIZE(CO3)
MADD t42,c42,t42,ALPHA
LD c14,0(CO4)
MADD t13,c13,t13,ALPHA
LD c24,1*SIZE(CO4)
MADD t23,c23,t23,ALPHA
LD c34,2*SIZE(CO4)
MADD t33,c33,t33,ALPHA
/* c44 shares its register with c11; c11 was already consumed by the first MADD above, so reusing it here is safe. */
LD c44,3*SIZE(CO4)
MADD t43,c43,t43,ALPHA
ST t11,0(CO1)
MADD t14,c14,t14,ALPHA
ST t21,1*SIZE(CO1)
MADD t24,c24,t24,ALPHA
ST t31,2*SIZE(CO1)
MADD t34,c34,t34,ALPHA
ST t41,3*SIZE(CO1)
MADD t44,c44,t44,ALPHA
daddiu M,M,-1 # M--
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
ST t32,2*SIZE(CO2)
ST t42,3*SIZE(CO2)
ST t13,0(CO3)
ST t23,1*SIZE(CO3)
ST t33,2*SIZE(CO3)
ST t43,3*SIZE(CO3)
/* Touch the C locations of the following tiles ahead of time. */
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
FETCH $0,4*SIZE(CO3)
FETCH $0,4*SIZE(CO4)
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
FETCH $0,8*SIZE(CO3)
FETCH $0,8*SIZE(CO4)
ST t14,0(CO4)
daddu CO1,CO1,4*SIZE # COx += 4*8Byte
ST t24,1*SIZE(CO4)
daddu CO2,CO2,4*SIZE
ST t34,2*SIZE(CO4)
daddu CO3,CO3,4*SIZE
ST t44,3*SIZE(CO4)
/* PREB is rewound to its starting value for the next tile; the CO4 bump sits in the branch delay slot. */
daddu PREB,BO,SPANB
bnez M,.L10 # M!=0
daddu CO4,CO4,4*SIZE
#else
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
ST t11, 0 * SIZE(CO1)
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
MUL t12, ALPHA, t12
MUL t22, ALPHA, t22
MUL t32, ALPHA, t32
MUL t42, ALPHA, t42
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
MUL t13, ALPHA, t13
MUL t23, ALPHA, t23
MUL t33, ALPHA, t33
MUL t43, ALPHA, t43
ST t13, 0 * SIZE(CO3)
ST t23, 1 * SIZE(CO3)
ST t33, 2 * SIZE(CO3)
ST t43, 3 * SIZE(CO3)
MUL t14, ALPHA, t14
MUL t24, ALPHA, t24
MUL t34, ALPHA, t34
MUL t44, ALPHA, t44
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
ST t34, 2 * SIZE(CO4)
ST t44, 3 * SIZE(CO4)
daddiu M,M,-1 # M--
daddiu CO4,CO4, 4 * SIZE # trmm part write back
daddiu CO3,CO3, 4 * SIZE
daddiu CO2,CO2, 4 * SIZE
daddiu CO1,CO1, 4 * SIZE
/* In these variants only part of K was consumed: skip the remaining KCO-KK-4 steps (times 4 elements) in A and B. */
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP, -4
#else
daddiu TEMP,TEMP, -4
#endif
dsll K,TEMP,2 + BASE_SHIFT
dsll TEMP,TEMP,2 + BASE_SHIFT
daddu A,A,K # mov A to the end of panel Ai
daddu B,B,TEMP # mov B to the end of panel Bj
#endif
/* For LEFT, KK grows by the tile height here; for the right side it is advanced once per panel in the N loop. */
#ifdef LEFT # right control by N loop
daddiu KK, KK,4
#endif
bnez M,.L10 # M!=0
nop
#endif
/*
 * Row remainder of the 4-column panel: if bit 1 of MCO is set, compute one
 * 2x4 tile.  Setup mirrors .L10 with eight accumulators (t11..t24) and a
 * single preloaded pair from A (a0,a1).
 */
.L14_M2:
andi M,MCO,2 # Remainder M = 2
beqz M,.L14_M1
nop
.L20:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO
#else
dsll K,KK,1 + BASE_SHIFT #mr=2 so KK*2
dsll TEMP,KK,2 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t12,t11
MOV t22,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t13,t11
gsLQC1(R9,F11,F10,1) #b2,b3
MOV t23,t11
MOV t14,t11
MOV t24,t11
/* TEMP = K steps for this tile: KCO-KK, or KK plus the tile extent on the active side (2 rows for LEFT, 4 columns otherwise). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
daddiu TEMP,KK,2
#else
daddiu TEMP,KK,4 # nr=4; matches the -4 applied after the 2x4 write-back
#endif
dsra K,TEMP,2
beqz K,.L25
nop
#else
move B,BO # gemm part
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t12,t11
MOV t22,t11
gsLQC1(R9,F9,F8,0) #b0,b1
dsra K,KCO,2 # K = KCO>>2, trips of the 4x unrolled loop
MOV t13,t11
gsLQC1(R9,F11,F10,1) #b2,b3
MOV t23,t11
MOV t14,t11
MOV t24,t11
beqz K,.L25
nop
#endif
.L21: # N=4 m=2,=K=4
gsLQC1(R8,F5,F4,1) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,2) # R9=B
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
gsLQC1(R8,F3,F2,2)
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
gsLQC1(R9,F9,F8,4)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F11,F10,5)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F7,F6,3)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
gsLQC1(R9,F13,F12,6)
MADD t11,t11,a2,b0
MADD t21,t21,a3,b0
daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
gsLQC1(R9,F15,F14,7)
MADD t12,t12,a2,b1
MADD t22,t22,a3,b1
daddiu K,K,-1
gsLQC1(R8,F1,F0,0)
MADD t13,t13,a2,b2
MADD t23,t23,a3,b2
daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE
MADD t14,t14,a2,b3
MADD t24,t24,a3,b3
gsLQC1(R9,F9,F8,0)
MADD t11,t11,a6,b4
MADD t21,t21,a7,b4
gsLQC1(R9,F11,F10,1)
MADD t12,t12,a6,b5
MADD t22,t22,a7,b5
MADD t13,t13,a6,b6
MADD t23,t23,a7,b6
MADD t14,t14,a6,b7
bnez K,.L21
MADD t24,t24,a7,b7
.L25: # N=4 M=2 K=2
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
beqz K,.L28
nop
.L26:
gsLQC1(R8,F5,F4,1) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,2) # R9=B
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
daddu B,B,8*SIZE
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
.L27:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
.L28: # N=4, M=2, K=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L29 #
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
daddu B,B,4*SIZE
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
.L29: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # gemm write back part Fetch 16 C
LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
LD c13,0(CO3)
MADD t11,c11,t11,ALPHA
LD c23,1*SIZE(CO3)
MADD t21,c21,t21,ALPHA
LD c14,0(CO4)
MADD t12,c12,t12,ALPHA
LD c24,1*SIZE(CO4)
MADD t22,c22,t22,ALPHA
ST t11,0(CO1)
MADD t13,c13,t13,ALPHA
ST t21,1*SIZE(CO1)
MADD t23,c23,t23,ALPHA
ST t12,0(CO2)
MADD t14,c14,t14,ALPHA
ST t22,1*SIZE(CO2)
MADD t24,c24,t24,ALPHA
ST t13,0(CO3)
ST t23,1*SIZE(CO3)
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
FETCH $0,0(CO1)
FETCH $0,2*SIZE(CO2)
FETCH $0,2*SIZE(CO3)
FETCH $0,2*SIZE(CO4)
ST t14,0(CO4)
daddu CO2,CO2,2*SIZE
ST t24,1*SIZE(CO4)
daddu CO3,CO3,2*SIZE
daddu CO4,CO4,2*SIZE
#else
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
ST t11, 0 * SIZE(CO1)
ST t21, 1 * SIZE(CO1)
MUL t12, ALPHA, t12
MUL t22, ALPHA, t22
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
MUL t13, ALPHA, t13
MUL t23, ALPHA, t23
ST t13, 0 * SIZE(CO3)
ST t23, 1 * SIZE(CO3)
MUL t14, ALPHA, t14
MUL t24, ALPHA, t24
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
daddiu CO3,CO3, 2 * SIZE
daddiu CO4,CO4, 2 * SIZE
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP,-2
#else
daddiu TEMP,TEMP,-4
#endif
dsll K,TEMP,1 + BASE_SHIFT
dsll TEMP,TEMP,2 + BASE_SHIFT
daddu A,A,K
daddu B,B,TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
/*
 * Last row remainder of the 4-column panel: if bit 0 of MCO is set, compute a
 * 1x4 tile.  A supplies one element per K step, so it is read with scalar LD
 * instead of the paired gsLQC1; four accumulators t11..t14 are used.
 */
.L14_M1:
andi M,MCO,1 # Remainder M = 1
beqz M,.L0_N4_Loop # M = 0, finishing one panel B
nop
.L30:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO
#else
dsll K,KK, 0 + BASE_SHIFT
dsll TEMP,KK,2 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
#endif
LD a0, 0 * SIZE(A)
# gsLQC1(R8,F1,F0,0)
gsLQC1(R9,F9,F8,0) #b0,b1
MTC $0,t11
gsLQC1(R9,F11,F10,1) #b2,b3
MOV t12,t11
MOV t13,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 4
#endif
dsra K,TEMP, 2
/* The clearing of t14 is placed in the branch delay slot. */
beqz K,.L35
MOV t14,t11
#else
# gemm
move B,BO
LD a0, 0 * SIZE(A)
# gsLQC1(R8,F1,F0,0)
dsra K,KCO,2 # K = KCO>>2, trips of the 4x unrolled loop
gsLQC1(R9,F9,F8,0) #b0,b1
MTC $0,t11
gsLQC1(R9,F11,F10,1) #b2,b3
MOV t12,t11
MOV t13,t11
/* NOTE(review): this repeats the dsra issued a few lines above; it is redundant but harmless. */
dsra K,KCO,2
beqz K,.L35
MOV t14,t11
#endif
.L31: # N=4 m=1,=K=4
# gsLQC1(R8,F3,F2,1)
LD a1, 1*SIZE(A)
gsLQC1(R9,F13,F12,2) # R9=B
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
LD a2, 2*SIZE(A)
gsLQC1(R9,F9,F8,4)
MADD t11,t11,a1,b4
MADD t12,t12,a1,b5
gsLQC1(R9,F11,F10,5)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
daddiu K,K,-1
LD a3, 3*SIZE(A)
gsLQC1(R9,F13,F12,6)
MADD t11,t11,a2,b0
MADD t12,t12,a2,b1
gsLQC1(R9,F15,F14,7)
MADD t13,t13,a2,b2
MADD t14,t14,a2,b3
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=8*SIZE
daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE
# gsLQC1(R8,F1,F0,0)
LD a0, 0*SIZE(A)
gsLQC1(R9,F9,F8,0)
MADD t11,t11,a3,b4
MADD t12,t12,a3,b5
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a3,b6
bnez K,.L31
MADD t14,t14,a3,b7
.L35: # N=4 M=1 K=2
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
beqz K,.L38
nop
.L36:
LD a1,1*SIZE(A)
gsLQC1(R9,F13,F12,2) # R9=B
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
daddu B,B,8*SIZE
.L37:
LD a0,0(A)
gsLQC1(R9,F9,F8,0)
MADD t11,t11,a1,b4
MADD t12,t12,a1,b5
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
.L38: # N=4, M=1, K=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L39 #
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
daddu B,B,4*SIZE
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
.L39: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
LD c12,0(CO2)
LD c13,0(CO3)
LD c14,0(CO4)
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
MADD t13,c13,t13,ALPHA
MADD t14,c14,t14,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
ST t13,0(CO3)
ST t14,0(CO4)
#else
MUL t11, ALPHA, t11
MUL t12, ALPHA, t12
MUL t13, ALPHA, t13
MUL t14, ALPHA, t14
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
ST t13, 0 * SIZE(CO3)
ST t14, 0 * SIZE(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -4
#endif
dsll K,TEMP, 0 + BASE_SHIFT
dsll TEMP,TEMP, 2 + BASE_SHIFT
daddu A,A,K
daddu B,B,TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
/*
 * End of one 4-column panel.  For the right-side TRMM variants KK advances by
 * the panel width.  The delay-slot move makes BO point at the next panel,
 * using the position B has reached after the last tile.
 */
.L0_N4_Loop:
daddiu N,N,-1 # N--
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK,4
#endif
bnez N,.L0_N4_Lb # N!=0
move BO,B # Set B
.align 5
/*
 * Column remainder: if bit 1 of NCO is set, process one 2-column panel.
 * Only CO1 and CO2 are needed, and SPANB is recomputed for nr=2.
 */
.L0_N2:
andi N,NCO,2 # Remainder N = 2
beqz N,.L0_N1 # N=0,NCO<2
nop
.L0_N2_Lb:
move CO1,C # Set C
dsra M,MCO,2 # M = MCO>>2, number of 4-row tiles
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
dsll SPANB,KCO,1+BASE_SHIFT # SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4
move A,AO # Reset A
daddu CO2,CO1,LDC
daddu PREA,AO,SPANA
/* Delay slot: C is advanced past the two columns of this panel on both paths. */
beqz M,.L12_M2
daddu C,CO2,LDC
.L40:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO
#else
dsll K,KK, 2 + BASE_SHIFT # mr=4
dsll TEMP, KK,1 + BASE_SHIFT # nr=2
daddu A,A,K
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t31,t11
MOV t41,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t12,t11
gsLQC1(R8,F3,F2,1) #a2,a3
MOV t22,t11
MOV t32,t11
MOV t42,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
daddiu TEMP, KK, 4
#else
daddiu TEMP, KK, 2
#endif
dsra K,TEMP,2 # K=KCO/2
beqz K,.L45
nop
#else
move B,BO
MTC $0,t11 # gemm part
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t31,t11
MOV t41,t11
gsLQC1(R9,F9,F8,0) #b0,b1
dsra K,KCO,2 # K=KCO/2
MOV t12,t11
gsLQC1(R8,F3,F2,1) #a2,a3
MOV t22,t11
MOV t32,t11
MOV t42,t11
beqz K,.L45
nop
#endif
.L41: # N=2,M=K=4
gsLQC1(R8,F5,F4,2) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,1) # R9=B
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
FETCH $0,(PREA)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
.L42:
gsLQC1(R8,F1,F0,4)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F11,F10,2)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,5)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
FETCH $0,4*SIZE(PREA)
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
.L43:
gsLQC1(R8,F5,F4,6)
MADD t11,t11,a0,b2
MADD t21,t21,a1,b2
gsLQC1(R9,F15,F14,3)
MADD t12,t12,a0,b3
MADD t22,t22,a1,b3
gsLQC1(R8,F7,F6,7)
MADD t31,t31,a2,b2
MADD t41,t41,a3,b2
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
FETCH $0,8*SIZE(PREA)
MADD t32,t32,a2,b3
MADD t42,t42,a3,b3
daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
.L44:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b6
MADD t21,t21,a5,b6
daddiu K,K,-1
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b7
MADD t22,t22,a5,b7
daddu PREA,PREA,16*SIZE
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b6
MADD t41,t41,a7,b6
FETCH $0,-4*SIZE(PREA)
MADD t32,t32,a6,b7
bnez K,.L41
MADD t42,t42,a7,b7
.L45: # N=2 M=4 K=2
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
beqz K,.L48
nop
.L46:
gsLQC1(R8,F5,F4,2) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,1) # R9=B
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
FETCH $0,0(PREA)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
.L47:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
FETCH $0,4*SIZE(PREA)
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
daddu PREA,PREA,8*SIZE
.L48: # N=2, M=4, K=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L49 #
LD ALPHA,152($sp) # Get ALPHA
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
daddu B,B,2*SIZE
daddu PREA,PREA,4*SIZE
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
.L49: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # gemm write back part Fetch 16 C
LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
LD c12,0(CO2)
MADD t11,c11,t11,ALPHA
LD c22,1*SIZE(CO2)
MADD t21,c21,t21,ALPHA
LD c32,2*SIZE(CO2)
MADD t31,c31,t31,ALPHA
LD c42,3*SIZE(CO2)
MADD t41,c41,t41,ALPHA
ST t11,0(CO1)
MADD t12,c12,t12,ALPHA
ST t21,1*SIZE(CO1)
MADD t22,c22,t22,ALPHA
ST t31,2*SIZE(CO1)
MADD t32,c32,t32,ALPHA
ST t41,3*SIZE(CO1)
MADD t42,c42,t42,ALPHA
daddiu M,M,-1 # M--
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
ST t32,2*SIZE(CO2)
ST t42,3*SIZE(CO2)
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
daddu CO1,CO1,4*SIZE # COx += 4*8Byte
bnez M,.L40 # M!=0
daddu CO2,CO2,4*SIZE
#else
daddiu M,M,-1
daddiu CO1,CO1, 4*SIZE
daddiu CO2,CO2, 4*SIZE
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
MUL t12, ALPHA, t12
MUL t22, ALPHA, t22
MUL t32, ALPHA, t32
MUL t42, ALPHA, t42
ST t11, -4 * SIZE(CO1)
ST t21, -3 * SIZE(CO1)
ST t31, -2 * SIZE(CO1)
ST t41, -1 * SIZE(CO1)
ST t12, -4 * SIZE(CO2)
ST t22, -3 * SIZE(CO2)
ST t32, -2 * SIZE(CO2)
ST t42, -1 * SIZE(CO2)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -4
#else
daddiu TEMP, TEMP, -2
#endif
dsll K,TEMP, 2 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu A,A,K
daddu B,B,TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 4
#endif
bnez M,.L40
nop
#endif
.L12_M2:
andi M,MCO,2 # Remainder M = 2
beqz M,.L12_M1
nop
.L50:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO
#else
dsll K, KK, 1 + BASE_SHIFT #mr=2
dsll TEMP, KK, 1 + BASE_SHIFT #nr=2
daddu A, A, K
daddu B, BO, TEMP
#endif
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
MOV t12,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t22,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 2
#endif
dsra K,TEMP,2 # K=KCO/2
beqz K,.L55
nop
#else
move B,BO
dsra K,KCO,2 # K=KCO/2
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
MOV t12,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t22,t11
beqz K,.L55
nop
#endif
.L51: # N=2 m=2,=K=4
gsLQC1(R8,F5,F4,1) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,1) # R9=B
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F3,F2,2)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F11,F10,2)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
daddiu K,K,-1
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b2
MADD t21,t21,a3,b2
daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
gsLQC1(R9,F15,F14,3)
MADD t12,t12,a2,b3
MADD t22,t22,a3,b3
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a6,b6
MADD t21,t21,a7,b6
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a6,b7
bnez K,.L51
MADD t22,t22,a7,b7
.L55: # N=2 M=2 K=2
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
NOP
beqz K,.L58
nop
.L56:
gsLQC1(R8,F5,F4,1) # R8=A
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
gsLQC1(R9,F13,F12,1) # R9=B
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
daddu B,B,4*SIZE
.L57:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
.L58: # N=2, M=2, K=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K, TEMP, 1
#endif
beqz K,.L59 #
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
daddu B,B,2*SIZE
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
.L59: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # write gemm part back Fetch 16 C
LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
MADD t12,c12,t12,ALPHA
MADD t22,c22,t22,ALPHA
ST t11,0(CO1)
ST t21,1*SIZE(CO1)
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
daddu CO2,CO2,2*SIZE
FETCH $0,0(CO1)
FETCH $0,0(CO2)
#else
daddiu M, M, -1
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t12, ALPHA, t12
MUL t22, ALPHA, t22
ST t11, -2 * SIZE(CO1)
ST t21, -1 * SIZE(CO1)
ST t12, -2 * SIZE(CO2)
ST t22, -1 * SIZE(CO2)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -2
#endif
dsll K, TEMP, 1 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu A, A, K
daddu B, B, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
FETCH $0,0(CO1)
FETCH $0,0(CO2)
#endif
.L12_M1:
andi M,MCO,1 # Remainder M = 1
beqz M,.L0_N2_Loop # M = 0, finishing one panel B
nop
.L60:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO
#else
dsll K, KK, 0 + BASE_SHIFT
dsll TEMP, KK, 1 + BASE_SHIFT
daddu A, A, K
daddu B, BO, TEMP
#endif
MTC $0,t11
#gsLQC1(R8,F4,F0,0)
LD a0, 0*SIZE(A)
MOV t21,t11
MOV t12,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t22,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 2
#endif
dsra K,TEMP,2 # K=KCO/2
beqz K,.L65
nop
#else
dsra K,KCO,2 # K=KCO/2
MTC $0,t11
move B,BO # Reset B
# gsLQC1(R8,F4,F0,0)
LD a0,0*SIZE(A)
MOV t21,t11
MOV t12,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t22,t11
beqz K,.L65
nop
#endif
.L61: # N=2 m=1,=K=4
LD a4, 1*SIZE(A)
gsLQC1(R9,F13,F12,1) # R9=B
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
LD a2, 2*SIZE(A)
gsLQC1(R9,F11,F10,2)
MADD t11,t11,a4,b4
MADD t12,t12,a4,b5
# gsLQC1(R8,F6,F2,1)
LD a6, 3*SIZE(A)
MADD t11,t11,a2,b2
MADD t12,t12,a2,b3
daddiu K,K,-1
gsLQC1(R9,F15,F14,3)
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
# gsLQC1(R8,F4,F0,0)
LD a0, 0*SIZE(A)
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
gsLQC1(R9,F9,F8,0)
MADD t11,t11,a6,b6
bnez K,.L61
MADD t12,t12,a6,b7
.L65: # N=2 M=1 K=2
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
beqz K,.L68
nop
.L66:
LD a4, 1*SIZE(A)
MADD t11,t11,a0,b0
gsLQC1(R9,F13,F12,1) # R9=B
MADD t12,t12,a0,b1
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
daddu B,B,4*SIZE
.L67:
LD a0,0(A)
gsLQC1(R9,F9,F8,0)
MADD t11,t11,a4,b4
MADD t12,t12,a4,b5
.L68: # N=2, M=1, K=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L69 #
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
daddu B,B,2*SIZE
.L69: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
LD c12,0(CO2)
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
daddu CO1,CO1,1*SIZE # COx += 2*8Byte
daddu CO2,CO2,1*SIZE
FETCH $0,0(CO1)
FETCH $0,0(CO2)
#else
MUL t11, ALPHA, t11
MUL t12, ALPHA, t12
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
daddu CO1,CO1,1*SIZE # COx += 2*8Byte
daddu CO2,CO2,1*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -2
#endif
dsll K, TEMP, 0 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu A, A, K
daddu B, B, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
.L0_N2_Loop:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 2
#endif
move BO, B
.align 5
.L0_N1:
andi N,NCO,1 # Remainder N = 1
beqz N,.L999 # N=0,NCO<1
nop
move CO1,C # Set C
dsra M,MCO,2 # M=MCO/2
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
move A,AO # Reset A
beqz M,.L11_M2
daddu PREA,AO,SPANA
.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
#else
dsll K, KK, 2 + BASE_SHIFT
dsll TEMP, KK, 0 + BASE_SHIFT
daddu A, A, K
daddu B, BO, TEMP
#endif
# gsLQC1(R9,F12,F8,0)
LD b0, 0*SIZE(B)
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
gsLQC1(R8,F3,F2,1) #a2,a3
MOV t31,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 4
#else
daddiu TEMP, KK, 1
#endif
dsra K,TEMP,2 # K=KCO/2
beqz K,.L75
MOV t41,t11
#else
move B, BO
dsra K,KCO,2 # K=KCO/2
# gsLQC1(R9,F12,F8,0)
LD b0, 0*SIZE(B)
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
gsLQC1(R8,F3,F2,1) #a2,a3
MOV t31,t11
beqz K,.L75
MOV t41,t11
#endif
.L71: # N=1,M=K=4
gsLQC1(R8,F5,F4,2) # R8=A
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
LD b4, 1*SIZE(B)
FETCH $0,(PREA)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
.L72:
# gsLQC1(R9,F14,F10,1)
gsLQC1(R8,F1,F0,4)
gsLQC1(R8,F3,F2,5)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
LD b2, 2*SIZE(B)
FETCH $0,4*SIZE(PREA)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
.L73:
gsLQC1(R8,F5,F4,6)
gsLQC1(R8,F7,F6,7)
MADD t11,t11,a0,b2
LD b6, 3*SIZE(B)
MADD t21,t21,a1,b2
daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
FETCH $0,8*SIZE(PREA)
MADD t31,t31,a2,b2
MADD t41,t41,a3,b2
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
.L74:
# gsLQC1(R9,F12,F8,0)
gsLQC1(R8,F1,F0,0)
daddu PREA,PREA,16*SIZE
gsLQC1(R8,F3,F2,1)
MADD t11,t11,a4,b6
MADD t21,t21,a5,b6
LD b0, 0*SIZE(B)
daddiu K,K,-1
FETCH $0,-32(PREA)
MADD t31,t31,a6,b6
bnez K,.L71
MADD t41,t41,a7,b6
.L75: # N=2 M=4 K=2
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2
#endif
beqz K,.L78
nop
.L76:
gsLQC1(R8,F5,F4,2) # R8=A
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
LD b4, 1*SIZE(B)
FETCH $0,0(PREA)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
.L77:
gsLQC1(R8,F1,F0,0)
gsLQC1(R8,F3,F2,1)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
LD b0,0(B)
FETCH $0,4*SIZE(PREA)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
daddu PREA,PREA,8*SIZE
.L78: # N=2, M=4, K=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L79 #
LD ALPHA,152($sp) # Get ALPHA
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
daddu B,B,1*SIZE
daddu PREA,PREA,4*SIZE
.L79: # Write Back of the M=4, N=1 tile (t11,t21,t31,t41) through CO1
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 4 C values; c11..c41 reuse $f0..$f3 (a0..a3), which are no longer needed
LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
MADD t11,c11,t11,ALPHA # presumably t = c + t*ALPHA (MADD is defined outside this chunk)
MADD t21,c21,t21,ALPHA
MADD t31,c31,t31,ALPHA
MADD t41,c41,t41,ALPHA
ST t11,0(CO1)
ST t21,1*SIZE(CO1)
ST t31,2*SIZE(CO1)
ST t41,3*SIZE(CO1)
daddiu M,M,-1 # M--
FETCH $0,4*SIZE(CO1) # touch the next 4 C values
daddu CO1,CO1,4*SIZE # COx += 4*8Byte
bnez M,.L70 # M!=0
nop # delay slot
#else
daddiu M,M,-1 # M--
MUL t11, ALPHA, t11 # TRMM variant: C is not read, the tile is only scaled by ALPHA
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
ST t11,0(CO1)
ST t21,1*SIZE(CO1)
ST t31,2*SIZE(CO1)
ST t41,3*SIZE(CO1)
daddu CO1,CO1,4*SIZE # COx += 4*8Byte
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK # TEMP = KCO - KK
#ifdef LEFT
daddiu TEMP, TEMP, -4 # minus the 4 rows of this tile
#else
daddiu TEMP, TEMP, -1 # minus the 1 column of this tile
#endif
dsll K, TEMP, 2 + BASE_SHIFT # K = TEMP*4 scaled by BASE_SHIFT (presumably log2 of the element size)
dsll TEMP, TEMP, 0 + BASE_SHIFT # TEMP = TEMP*1 scaled the same way
daddu A, A,K # step A over the k-steps this tile did not consume
daddu B, B, TEMP # same for B
#endif
#ifdef LEFT
daddiu KK, KK, 4 # KK advances by the tile height
#endif
bnez M,.L70 # M!=0
nop # delay slot
#endif
.L11_M2: # M=2, N=1 tile: taken once when bit 1 of MCO is set
andi M,MCO,2 # Remainder M = 2
beqz M,.L11_M1
nop # delay slot
.L80: # set up B, the accumulators t11,t21 and the trip count K
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO # restart B at BO
#else
dsll K, KK, 1 + BASE_SHIFT # K = KK*2 scaled by BASE_SHIFT (presumably log2 of the element size)
dsll TEMP, KK, 0 + BASE_SHIFT # TEMP = KK*1 scaled the same way
daddu A, A, K # skip KK k-steps of the 2-row A panel
daddu B, BO, TEMP # B = BO advanced by KK k-steps
#endif
# gsLQC1(R9,F12,F8,0)
LD b0, 0*SIZE(B) # preload b0
MTC $0,t11 # clear accumulator t11 from integer register $0
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11 # t21 = t11, i.e. also cleared
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK # trip count = KCO - KK
#elif defined(LEFT)
daddiu TEMP, KK, 2 # trip count = KK + 2 (tile height)
#else
daddiu TEMP, KK, 1 # trip count = KK + 1 (tile width)
#endif
dsra K,TEMP,2 # K=TEMP/4: number of 4-way unrolled passes
beqz K,.L85
nop # delay slot
#else
move B, BO # restart B at BO
dsra K,KCO,2 # K=KCO/4: number of 4-way unrolled passes
# gsLQC1(R9,F12,F8,0)
LD b0, 0*SIZE(B) # preload b0
MTC $0,t11 # clear accumulator t11 from integer register $0
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11 # t21 = t11, i.e. also cleared
beqz K,.L85
nop # delay slot
#endif
.L81: # N=1,M=2,K=4: four k-steps per pass, a0,a1 and b0 arrive preloaded
LD b4, 1*SIZE(B)
gsLQC1(R8,F5,F4,1) # R8=A
MADD t11,t11,a0,b0 # k-step 0
MADD t21,t21,a1,b0
LD b2, 2*SIZE(B)
gsLQC1(R8,F3,F2,2) # a2,a3 for k-step 2
MADD t11,t11,a4,b4 # k-step 1
MADD t21,t21,a5,b4
# gsLQC1(R9,F14,F10,1)
LD b6, 3*SIZE(B)
gsLQC1(R8,F7,F6,3) # a6,a7 for k-step 3
MADD t11,t11,a2,b2 # k-step 2
MADD t21,t21,a3,b2
daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
# gsLQC1(R9,F12,F8,0)
gsLQC1(R8,F1,F0,0) # preload a0,a1 for the next pass or the remainder code
daddiu K,K,-1
MADD t11,t11,a6,b6 # k-step 3
LD b0, 0*SIZE(B) # preload b0 for the next pass or the remainder code
bnez K,.L81
MADD t21,t21,a7,b6 # delay slot: second half of k-step 3, executed on both paths
.L85: # N=1, M=2, K=2 remainder
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2 # TRMM: TEMP holds the trip count computed in the tile setup
#endif
beqz K,.L88
nop # delay slot
.L86: # two k-steps: a0,a1 times b0, then a4,a5 times b4
gsLQC1(R8,F5,F4,1) # R8=A
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
gsLQC1(R8,F1,F0,0) # preload a0,a1 for the K=1 tail
LD b0,0(B) # preload b0 for the K=1 tail
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
.L88: # N=1, M=2, K=1 remainder
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L89 # no odd k-step: go straight to the write back
LD ALPHA,152($sp) # Get ALPHA (delay slot, so it is loaded on both paths)
MADD t11,t11,a0,b0 # a0,a1 and b0 were preloaded by the preceding code
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
daddu B,B,1*SIZE # B+=1(nr)*1(kr) element
.L89: # Write Back of the M=2, N=1 tile (t11,t21) through CO1
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 2 C values; c11,c21 reuse $f0,$f1 (a0,a1), which are no longer needed
LD c21,1*SIZE(CO1)
MADD t11,c11,t11,ALPHA # presumably t = c + t*ALPHA (MADD is defined outside this chunk)
MADD t21,c21,t21,ALPHA
ST t11,0(CO1)
ST t21,1*SIZE(CO1)
FETCH $0,2*SIZE(CO1) # touch the C element that follows
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
#else
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
MUL t11, ALPHA, t11 # TRMM variant: C is not read, the tile is only scaled by ALPHA
MUL t21, ALPHA, t21
ST t11, -2 * SIZE(CO1) # negative offsets because CO1 was already advanced above
ST t21, -1 * SIZE(CO1)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK # TEMP = KCO - KK
#ifdef LEFT
daddiu TEMP, TEMP, -2 # minus the 2 rows of this tile
#else
daddiu TEMP, TEMP, -1 # minus the 1 column of this tile
#endif
dsll K, TEMP, 1 + BASE_SHIFT # K = TEMP*2 scaled by BASE_SHIFT (presumably log2 of the element size)
dsll TEMP, TEMP, 0 + BASE_SHIFT # TEMP = TEMP*1 scaled the same way
daddu A, A, K # step A over the k-steps this tile did not consume
daddu B, B, TEMP # same for B
#endif
#ifdef LEFT
daddiu KK, KK, 2 # KK advances by the tile height
#endif
#endif
.L11_M1: # M=1, N=1 tile: taken once when bit 0 of MCO is set
andi M,MCO,1 # Remainder M = 1
beqz M,.L999 # M = 0, End
nop # delay slot
.L90: # set up B, the accumulator t11 and the trip count K
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO # restart B at BO
#else
dsll K, KK, 0 + BASE_SHIFT # K = KK*1 scaled by BASE_SHIFT (presumably log2 of the element size)
dsll TEMP, KK, 0 + BASE_SHIFT # TEMP = the same offset, used for B
daddu A, A, K # skip KK k-steps of the 1-row A panel
daddu B, BO, TEMP # B = BO advanced by KK k-steps
#endif
# gsLQC1(R8,F4,F0,0)
MTC $0,t11 # clear accumulator t11 from integer register $0
# gsLQC1(R9,F12,F8,0)
LD a0, 0*SIZE(A) # preload a0
LD b0, 0*SIZE(B) # preload b0
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK # trip count = KCO - KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # trip count = KK + 1 (tile height)
#else
daddiu TEMP, KK, 1 # trip count = KK + 1 (tile width); same value as the branch above
#endif
dsra K, TEMP, 2 # K=TEMP/4: number of 4-way unrolled passes
beqz K,.L95
nop # delay slot
#else
move B, BO # restart B at BO
dsra K,KCO,2 # K=KCO/4: number of 4-way unrolled passes
# gsLQC1(R8,F4,F0,0)
# gsLQC1(R9,F12,F8,0)
LD a0, 0*SIZE(A) # preload a0
LD b0, 0*SIZE(B) # preload b0
beqz K,.L95
MTC $0,t11 # delay slot: accumulator t11 is cleared on both paths
#endif
.L91: # N=1,M=1,K=4: four k-steps per pass, a0 and b0 arrive preloaded
# gsLQC1(R8,F6,F2,1)
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0 # k-step 0
# gsLQC1(R9,F14,F10,1)
LD a2, 2*SIZE(A)
LD b2, 2*SIZE(B)
MADD t11,t11,a4,b4 # k-step 1
# gsLQC1(R8,F4,F0,0)
LD a6, 3*SIZE(A)
LD b6, 3*SIZE(B)
MADD t11,t11,a2,b2 # k-step 2
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A) # preload a0 for the next pass or the remainder code
LD b0, 0*SIZE(B) # preload b0 likewise
# gsLQC1(R9,F12,F8,0)
MADD t11,t11,a6,b6 # k-step 3
daddiu K,K,-1
bnez K,.L91
nop # delay slot
.L95: # N=1, M=1, K=2 remainder
#ifndef TRMMKERNEL
andi K,KCO,2 # k = KCO&2
#else
andi K,TEMP,2 # TRMM: TEMP holds the trip count computed in the tile setup
#endif
beqz K,.L98
nop # delay slot
.L96: # two k-steps: a0 times b0, then a4 times b4
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
LD b0,0(B) # preload b0 for the K=1 tail
LD a0,0(A) # preload a0 for the K=1 tail
MADD t11,t11,a4,b4
.L98: # N=1, M=1, K=1 remainder
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L99 # no odd k-step: go straight to the write back
LD ALPHA,152($sp) # Get ALPHA (delay slot, so it is loaded on both paths)
MADD t11,t11,a0,b0 # a0 and b0 were preloaded; A and B are not advanced here
.L99: # Write Back of the M=1, N=1 result (t11) through CO1
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 1 C value; c11 reuses $f0 (a0), which is no longer needed
MADD t11,c11,t11,ALPHA # presumably t11 = c11 + t11*ALPHA (MADD is defined outside this chunk)
ST t11,0(CO1)
#else
MUL t11, ALPHA, t11 # TRMM variant: C is not read, the result is only scaled by ALPHA
ST t11, 0 * SIZE(CO1)
#endif
.L999: # End: reload the registers kept in the 160-byte stack frame and return
ld $16, 0($sp) # integer registers $16..$22
ld $17, 8($sp)
ld $18, 16($sp)
ld $19, 24($sp)
ld $20, 32($sp)
ld $21, 40($sp)
ld $22, 48($sp)
LD $f24, 56($sp) # floating point registers $f24..$f28
LD $f25, 64($sp)
LD $f26, 72($sp)
LD $f27, 80($sp)
LD $f28, 88($sp)
ld $23, 96($sp) # integer registers $23..$25 (PREB, PREA, SPANA per the defines)
ld $24, 104($sp)
ld $25, 112($sp)
LD $f20,120($sp) # floating point registers $f20..$f23
LD $f21,128($sp)
LD $f22,136($sp)
LD $f23,144($sp)
# NOTE(review): $f29..$f31 (t41, t11, t21) are written by the kernel but not reloaded here; confirm against the prologue, which is outside this chunk
j $31 # return to the caller
daddiu $sp, $sp, 160 # delay slot: release the stack frame
EPILOGUE