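/*
 * OpenBLAS/kernel/mips64/gemm_kernel_loongson3a.S
 *
 * Listing metadata carried over from the source viewer: 1607 lines, 26 KiB,
 * labelled "ArmAsm" (the code below is MIPS64 / Loongson-3A assembly that is
 * run through the C preprocessor, not ARM assembly).
 */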

/*
 * Hand-encoded instruction words. Each macro emits one raw 32-bit word with
 * .word, OR-ing a base GPR number, two FPR numbers and a small offset into
 * fixed bit positions. The arguments must therefore be plain integers (the
 * R8/R9 and F0..F31 constants below), not $-prefixed register names.
 * In the kernel body gsLQC1(base,fq,ft,off) is used to fill two FP registers
 * from memory at `base`: ft receives the first element and fq the second
 * (see the "#a0,a1" style annotations at the call sites).
 * NOTE(review): presumably these are the Loongson gsLQC1/gsSQC1 quad-word FP
 * load/store instructions with `off` counted in 16-byte units - confirm
 * against the Loongson-3A manual. gsSQC1 is not used in the visible code.
 */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
/*
 * FETCH is a plain `ld`. The kernel always issues it as `FETCH $0,addr`, so
 * the loaded value is discarded; it is used only to touch memory ahead of
 * use (software prefetch).
 */
#define FETCH ld
#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"
/*
 * Integer registers holding the kernel arguments on entry. M, N and K are
 * copied to MCO/NCO/KCO and then reused as loop counters; A and B are the
 * running pointers into the current A and B panels.
 */
#define M $4
#define N $5
#define K $6
#define A $8
#define B $9
#define C $10
#define LDC $11
/* Saved start addresses of the A buffer and of the current B panel. */
#define AO $12
#define BO $13
/* I, J and L are defined but not referenced in the visible code. */
#define I $2
#define J $3
#define L $7
/* Write pointers for up to four consecutive columns of C. */
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17
/* Copies of the original K, M and N arguments. */
#define KCO $18
#define MCO $19
#define NCO $20
/*
 * Byte strides: SPANB = size of one B panel, SPANC = size of one panel of C
 * columns, SPANA = offset added to AO to obtain the A prefetch pointer.
 * PREB / PREA are the prefetch pointers used with FETCH.
 */
#define SPANB $21
#define SPANC $22
#define PREB $23
#define PREA $24
#define SPANA $25
/*
 * ALPHA arrives in $f15. Note that b7 (below) is also $f15, so the kernel
 * body keeps a copy of ALPHA at 152($sp) and reloads it before write-back.
 */
#define ALPHA $f15
/*
 * GPR numbers for the gsLQC1/gsSQC1 encoders: R8 is A ($8), R9 is B ($9).
 * R14..R17 match CO1..CO4 but are not referenced in the visible code.
 */
#define R8 8
#define R9 9
#define R14 14
#define R15 15
#define R16 16
#define R17 17
/*
 * Accumulators for the C tile: t<row><col>, rows 1..4 and columns 1..4.
 * They occupy $f16..$f31.
 */
#define t11 $f30
#define t21 $f31
#define t31 $f28
#define t41 $f29
#define t12 $f26
#define t22 $f27
#define t32 $f24
#define t42 $f25
#define t13 $f22
#define t23 $f23
#define t33 $f20
#define t43 $f21
#define t14 $f18
#define t24 $f19
#define t34 $f16
#define t44 $f17
/*
 * Staging registers for the existing C values during write-back. They reuse
 * $f0..$f14, i.e. the same registers as the a and b operands below, which are
 * dead by then. c44 is $f0 again (same as c11): it is loaded only after c11
 * has been consumed.
 */
#define c11 $f0
#define c21 $f1
#define c31 $f2
#define c41 $f3
#define c12 $f4
#define c22 $f5
#define c32 $f6
#define c42 $f7
#define c13 $f8
#define c23 $f9
#define c33 $f10
#define c43 $f11
#define c14 $f12
#define c24 $f13
#define c34 $f14
#define c44 $f0
/* Operands loaded from the A panel: two sets (a0..a3 / a4..a7). */
#define a0 $f0
#define a1 $f1
#define a2 $f2
#define a3 $f3
#define a4 $f4
#define a5 $f5
#define a6 $f6
#define a7 $f7
/* Operands loaded from the B panel: two sets (b0..b3 / b4..b7). */
#define b0 $f8
#define b1 $f9
#define b2 $f10
#define b3 $f11
#define b4 $f12
#define b5 $f13
#define b6 $f14
#define b7 $f15
/* FPR numbers for the gsLQC1/gsSQC1 encoders (Fn is the number of $fn). */
#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
#define F18 18
#define F17 17
#define F16 16
#define F15 15
#define F14 14
#define F13 13
#define F12 12
#define F11 11
#define F10 10
#define F9 9
#define F8 8
#define F7 7
#define F6 6
#define F5 5
#define F4 4
#define F3 3
#define F2 2
#define F1 1
#define F0 0
PROLOGUE
/*
 * GEMM micro-kernel for Loongson-3A:  C += ALPHA * A * B  on packed panels.
 *
 * Inputs (registers, see the #defines above):
 *   M ($4), N ($5), K ($6)  tile dimensions (rows of C, columns of C, depth)
 *   ALPHA ($f15)            scalar multiplier
 *   A ($8)                  packed A: for each 4-row block, 4 consecutive
 *                           elements per k step (then 2-row and 1-row tails)
 *   B ($9)                  packed B: for each 4-column panel, 4 consecutive
 *                           elements per k step (then 2- and 1-column tails)
 *   C ($10), LDC ($11)      output matrix and its column stride in elements
 *
 * Structure: N is processed in panels of 4, then 2, then 1 columns; inside a
 * panel M is processed in tiles of 4, then 2, then 1 rows; inside a tile K is
 * unrolled by 4 with tails for K&2 and K&1. Every tile ends with
 * C = C + acc * ALPHA.
 * NOTE(review): MADD d,c,x,y is assumed to compute d = c + x*y (macro from
 * common.h), and the element size is assumed to be 8 bytes: the paired loads
 * at offsets 0..7 are followed by pointer advances of 16*SIZE.
 *
 * The code is written for explicit branch delay slots: the instruction after
 * every branch/jump always executes. Delay slots are marked below.
 *
 * Stack frame (192 bytes, 16-byte aligned):
 *     0.. 48  $16-$22          96..112  $23-$25
 *    56.. 88  $f24-$f28       120..144  $f20-$f23
 *   152       spilled ALPHA ($f15 doubles as b7 in the inner loops)
 *   160..176  $f29-$f31
 *
 * The accumulators t41/t11/t21 live in $f29/$f30/$f31. The MIPS n64 ABI treats
 * $f24-$f31 as callee-saved (n32: the even registers $f20-$f30), so these
 * three must be preserved as well; the frame was grown from 160 to 192 bytes
 * to hold them. No stack-passed arguments are read, so the larger frame does
 * not affect argument access.
 */
    daddiu $sp, $sp, -192
    sd $16, 0($sp)
    sd $17, 8($sp)
    sd $18, 16($sp)
    sd $19, 24($sp)
    sd $20, 32($sp)
    sd $21, 40($sp)
    sd $22, 48($sp)
    ST $f24, 56($sp)
    ST $f25, 64($sp)
    ST $f26, 72($sp)
    ST $f27, 80($sp)
    ST $f28, 88($sp)
    sd $23, 96($sp)
    sd $24, 104($sp)
    sd $25, 112($sp)
    ST $f20,120($sp)
    ST $f21,128($sp)
    ST $f22,136($sp)
    ST $f23,144($sp)
    ST $f29,160($sp) # t41, callee-saved
    ST $f30,168($sp) # t11, callee-saved
    ST $f31,176($sp) # t21, callee-saved
    .align 5 # BACKUP
.L0_N4: # Loop N
    ST ALPHA,152($sp) # Backup ALPHA
    move MCO,M # Backup M
    move NCO,N # Backup N
    move KCO,K # Backup K
    move AO,A # Backup A_addr
    move BO,B # Backup B_addr
    dsll LDC,LDC,BASE_SHIFT # LDC in bytes
    dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*NR(4)*SIZE, bytes per 4-column B panel
    dsll SPANA,KCO,1+BASE_SHIFT # SPANA=KC*2*SIZE, offset of the A prefetch pointer
    dsra N,NCO,2 # N=NCO/4, number of 4-column panels
    beq N,$0,.L0_N2 # N=0,NCO<4
    dsll SPANC,LDC,2 # (delay slot) SPANC=LDC*4
/* ------------------------------------------------------------------ */
/* N = 4 panels                                                        */
/* ------------------------------------------------------------------ */
.L0_N4_Lb:
    move CO1,C # Set C
    dsra M,MCO,2 # M=MCO/4, number of 4-row tiles
    move A,AO # Reset A
    daddu CO2,CO1,LDC
    daddu CO3,CO2,LDC
    daddu PREB,BO,SPANB # PreB point next panelB
    daddu CO4,CO3,LDC
    beqz M,.L14_M2
    daddu PREA,AO,SPANA # (delay slot)
/* 4x4 tile: clear the 16 accumulators and preload the first k step. */
.L10:
    MTC $0,t11
    MOV t21,t11
    gsLQC1(R8,F1,F0,0) #a0,a1
    MOV t31,t11
    MOV t41,t11
    gsLQC1(R9,F9,F8,0) #b0,b1
    MOV t12,t11
    MOV t22,t11
    gsLQC1(R8,F3,F2,1) #a2,a3
    MOV t32,t11
    MOV t42,t11
    gsLQC1(R9,F11,F10,1) #b2,b3
    dsra K,KCO,2 # K=KCO/4, iterations of the 4x unrolled loop
    MOV t13,t11
    MOV t23,t11
    MOV t33,t11
    MOV t43,t11
    MOV t14,t11
    MOV t24,t11
    MOV t34,t11
    MOV t44,t11
    beqz K,.L15
    nop
/* Step 1 of 4: use set 0 (a0-a3,b0-b3) while loading set 1 (a4-a7,b4-b7). */
.L11: # N=M=K=4
    gsLQC1(R8,F5,F4,2) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R9,F13,F12,2) # R9=B
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    gsLQC1(R8,F7,F6,3)
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    gsLQC1(R9,F15,F14,3)
    MADD t32,t32,a2,b1
    MADD t42,t42,a3,b1
    FETCH $0,(PREB)
    MADD t13,t13,a0,b2
    MADD t23,t23,a1,b2
    MADD t14,t14,a0,b3
    MADD t24,t24,a1,b3
    FETCH $0,(PREA)
    MADD t33,t33,a2,b2
    MADD t43,t43,a3,b2
    MADD t34,t34,a2,b3
    MADD t44,t44,a3,b3
/* Step 2 of 4: use set 1 while loading set 0 for the third k step. */
.L12:
    gsLQC1(R8,F1,F0,4)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F9,F8,4)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
    gsLQC1(R8,F3,F2,5)
    MADD t31,t31,a6,b4
    MADD t41,t41,a7,b4
    gsLQC1(R9,F11,F10,5)
    MADD t32,t32,a6,b5
    MADD t42,t42,a7,b5
    FETCH $0,4*SIZE(PREB)
    MADD t13,t13,a4,b6
    MADD t23,t23,a5,b6
    MADD t14,t14,a4,b7
    MADD t24,t24,a5,b7
    FETCH $0,4*SIZE(PREA)
    MADD t33,t33,a6,b6
    MADD t43,t43,a7,b6
    MADD t34,t34,a6,b7
    MADD t44,t44,a7,b7
/* Step 3 of 4: use set 0, load set 1, then advance A and B past 4 k steps. */
.L13:
    gsLQC1(R8,F5,F4,6)
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R9,F13,F12,6)
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    gsLQC1(R8,F7,F6,7)
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    gsLQC1(R9,F15,F14,7)
    MADD t32,t32,a2,b1
    MADD t42,t42,a3,b1
    daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
    FETCH $0,8*SIZE(PREB)
    MADD t13,t13,a0,b2
    MADD t23,t23,a1,b2
    daddu B,B,16*SIZE
    MADD t14,t14,a0,b3
    MADD t24,t24,a1,b3
    FETCH $0,8*SIZE(PREA)
    MADD t33,t33,a2,b2
    MADD t43,t43,a3,b2
    MADD t34,t34,a2,b3
    MADD t44,t44,a3,b3
/* Step 4 of 4: use set 1 while preloading set 0 for the next iteration. */
.L14:
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F9,F8,0)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
    gsLQC1(R8,F3,F2,1)
    MADD t31,t31,a6,b4
    MADD t41,t41,a7,b4
    daddiu K,K,-1
    gsLQC1(R9,F11,F10,1)
    MADD t32,t32,a6,b5
    MADD t42,t42,a7,b5
    FETCH $0,12*SIZE(PREB)
    MADD t13,t13,a4,b6
    MADD t23,t23,a5,b6
    MADD t14,t14,a4,b7
    MADD t24,t24,a5,b7
    FETCH $0,12*SIZE(PREA)
    MADD t33,t33,a6,b6
    MADD t43,t43,a7,b6
    daddu PREB,PREB,16*SIZE
    MADD t34,t34,a6,b7
    daddu PREA,PREA,16*SIZE
    bnez K,.L11
    MADD t44,t44,a7,b7 # (delay slot)
.L15: # N=4 M=4 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L18
    nop
.L16:
    gsLQC1(R8,F5,F4,2) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R9,F13,F12,2) # R9=B
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    gsLQC1(R8,F7,F6,3)
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    gsLQC1(R9,F15,F14,3)
    MADD t32,t32,a2,b1
    MADD t42,t42,a3,b1
    daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
    FETCH $0,0(PREB)
    MADD t13,t13,a0,b2
    MADD t23,t23,a1,b2
    daddu B,B,8*SIZE
    MADD t14,t14,a0,b3
    MADD t24,t24,a1,b3
    FETCH $0,0(PREA)
    MADD t33,t33,a2,b2
    MADD t43,t43,a3,b2
    MADD t34,t34,a2,b3
    MADD t44,t44,a3,b3
.L17:
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F9,F8,0)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
    gsLQC1(R8,F3,F2,1)
    MADD t31,t31,a6,b4
    MADD t41,t41,a7,b4
    gsLQC1(R9,F11,F10,1)
    MADD t32,t32,a6,b5
    MADD t42,t42,a7,b5
    FETCH $0,4*SIZE(PREB)
    MADD t13,t13,a4,b6
    MADD t23,t23,a5,b6
    MADD t14,t14,a4,b7
    MADD t24,t24,a5,b7
    FETCH $0,4*SIZE(PREA)
    MADD t33,t33,a6,b6
    MADD t43,t43,a7,b6
    daddu PREB,PREB,8*SIZE
    MADD t34,t34,a6,b7
    MADD t44,t44,a7,b7
    daddu PREA,PREA,8*SIZE
.L18: # N=4, M=4, K=1
    and K,KCO,1
    beqz K,.L19 #
    LD ALPHA,152($sp) # (delay slot) reload ALPHA, $f15 was used as b7
    FETCH $0,0(PREB)
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    daddu B,B,4*SIZE
    FETCH $0,0(PREA)
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    daddu PREB,PREB,4*SIZE
    MADD t32,t32,a2,b1
    MADD t42,t42,a3,b1
    daddu PREA,PREA,4*SIZE
    MADD t13,t13,a0,b2
    MADD t23,t23,a1,b2
    MADD t14,t14,a0,b3
    MADD t24,t24,a1,b3
    MADD t33,t33,a2,b2
    MADD t43,t43,a3,b2
    MADD t34,t34,a2,b3
    MADD t44,t44,a3,b3
/* Write back the 4x4 tile: t = c + t*ALPHA, loads and stores interleaved. */
.L19: # Write Back
    LD c11,0(CO1) # Fetch 16 C
    LD c21,1*SIZE(CO1)
    LD c31,2*SIZE(CO1)
    LD c41,3*SIZE(CO1)
    LD c12,0(CO2)
    MADD t11,c11,t11,ALPHA
    LD c22,1*SIZE(CO2)
    MADD t21,c21,t21,ALPHA
    LD c32,2*SIZE(CO2)
    MADD t31,c31,t31,ALPHA
    LD c42,3*SIZE(CO2)
    MADD t41,c41,t41,ALPHA
    LD c13,0(CO3)
    MADD t12,c12,t12,ALPHA
    LD c23,1*SIZE(CO3)
    MADD t22,c22,t22,ALPHA
    LD c33,2*SIZE(CO3)
    MADD t32,c32,t32,ALPHA
    LD c43,3*SIZE(CO3)
    MADD t42,c42,t42,ALPHA
    LD c14,0(CO4)
    MADD t13,c13,t13,ALPHA
    LD c24,1*SIZE(CO4)
    MADD t23,c23,t23,ALPHA
    LD c34,2*SIZE(CO4)
    MADD t33,c33,t33,ALPHA
    LD c44,3*SIZE(CO4) # c44 reuses $f0, c11 is already consumed
    MADD t43,c43,t43,ALPHA
    ST t11,0(CO1)
    MADD t14,c14,t14,ALPHA
    ST t21,1*SIZE(CO1)
    MADD t24,c24,t24,ALPHA
    ST t31,2*SIZE(CO1)
    MADD t34,c34,t34,ALPHA
    ST t41,3*SIZE(CO1)
    MADD t44,c44,t44,ALPHA
    daddiu M,M,-1 # M--
    ST t12,0(CO2)
    ST t22,1*SIZE(CO2)
    ST t32,2*SIZE(CO2)
    ST t42,3*SIZE(CO2)
    ST t13,0(CO3)
    ST t23,1*SIZE(CO3)
    ST t33,2*SIZE(CO3)
    ST t43,3*SIZE(CO3)
    FETCH $0,4*SIZE(CO1)
    FETCH $0,4*SIZE(CO2)
    FETCH $0,4*SIZE(CO3)
    FETCH $0,4*SIZE(CO4)
    FETCH $0,8*SIZE(CO1)
    FETCH $0,8*SIZE(CO2)
    FETCH $0,8*SIZE(CO3)
    FETCH $0,8*SIZE(CO4)
    ST t14,0(CO4)
    daddu CO1,CO1,4*SIZE # COx += 4*8Byte
    ST t24,1*SIZE(CO4)
    daddu CO2,CO2,4*SIZE
    ST t34,2*SIZE(CO4)
    daddu CO3,CO3,4*SIZE
    ST t44,3*SIZE(CO4)
    move B,BO # Reset B
    daddu PREB,BO,SPANB
    bnez M,.L10 # M!=0
    daddu CO4,CO4,4*SIZE # (delay slot)
/* N = 4 panel, 2-row tail. */
.L14_M2:
    and M,MCO,2 # Remainder M = 2
    beqz M,.L14_M1
    nop
.L20:
    MTC $0,t11
    MOV t21,t11
    gsLQC1(R8,F1,F0,0) #a0,a1
    MOV t12,t11
    MOV t22,t11
    gsLQC1(R9,F9,F8,0) #b0,b1
    dsra K,KCO,2 # K=KCO/4
    MOV t13,t11
    gsLQC1(R9,F11,F10,1) #b2,b3
    MOV t23,t11
    MOV t14,t11
    MOV t24,t11
    beqz K,.L25
    nop
.L21: # N=4 m=2,=K=4
    gsLQC1(R8,F5,F4,1) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R9,F13,F12,2) # R9=B
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    gsLQC1(R9,F15,F14,3)
    MADD t13,t13,a0,b2
    MADD t23,t23,a1,b2
    gsLQC1(R8,F3,F2,2)
    MADD t14,t14,a0,b3
    MADD t24,t24,a1,b3
    gsLQC1(R9,F9,F8,4)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F11,F10,5)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
    gsLQC1(R8,F7,F6,3)
    MADD t13,t13,a4,b6
    MADD t23,t23,a5,b6
    MADD t14,t14,a4,b7
    MADD t24,t24,a5,b7
    gsLQC1(R9,F13,F12,6)
    MADD t11,t11,a2,b0
    MADD t21,t21,a3,b0
    daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
    gsLQC1(R9,F15,F14,7)
    MADD t12,t12,a2,b1
    MADD t22,t22,a3,b1
    daddiu K,K,-1
    gsLQC1(R8,F1,F0,0)
    MADD t13,t13,a2,b2
    MADD t23,t23,a3,b2
    daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE
    MADD t14,t14,a2,b3
    MADD t24,t24,a3,b3
    gsLQC1(R9,F9,F8,0)
    MADD t11,t11,a6,b4
    MADD t21,t21,a7,b4
    gsLQC1(R9,F11,F10,1)
    MADD t12,t12,a6,b5
    MADD t22,t22,a7,b5
    MADD t13,t13,a6,b6
    MADD t23,t23,a7,b6
    MADD t14,t14,a6,b7
    bnez K,.L21
    MADD t24,t24,a7,b7 # (delay slot)
.L25: # N=4 M=2 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L28
    nop
.L26:
    gsLQC1(R8,F5,F4,1) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R9,F13,F12,2) # R9=B
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
    gsLQC1(R9,F15,F14,3)
    MADD t13,t13,a0,b2
    MADD t23,t23,a1,b2
    daddu B,B,8*SIZE
    MADD t14,t14,a0,b3
    MADD t24,t24,a1,b3
.L27:
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F9,F8,0)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
    gsLQC1(R9,F11,F10,1)
    MADD t13,t13,a4,b6
    MADD t23,t23,a5,b6
    MADD t14,t14,a4,b7
    MADD t24,t24,a5,b7
.L28: # N=4, M=2, K=1
    and K,KCO,1
    beqz K,.L29 #
    LD ALPHA,152($sp) # (delay slot) Get ALPHA
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
    daddu B,B,4*SIZE
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    MADD t13,t13,a0,b2
    MADD t23,t23,a1,b2
    MADD t14,t14,a0,b3
    MADD t24,t24,a1,b3
.L29: # Write Back
    LD c11,0(CO1) # Fetch 8 C (2 rows x 4 columns)
    LD c21,1*SIZE(CO1)
    LD c12,0(CO2)
    LD c22,1*SIZE(CO2)
    LD c13,0(CO3)
    MADD t11,c11,t11,ALPHA
    LD c23,1*SIZE(CO3)
    MADD t21,c21,t21,ALPHA
    LD c14,0(CO4)
    MADD t12,c12,t12,ALPHA
    LD c24,1*SIZE(CO4)
    MADD t22,c22,t22,ALPHA
    ST t11,0(CO1)
    MADD t13,c13,t13,ALPHA
    ST t21,1*SIZE(CO1)
    MADD t23,c23,t23,ALPHA
    ST t12,0(CO2)
    MADD t14,c14,t14,ALPHA
    ST t22,1*SIZE(CO2)
    MADD t24,c24,t24,ALPHA
    ST t13,0(CO3)
    move B,BO # Reset B
    ST t23,1*SIZE(CO3)
    daddu CO1,CO1,2*SIZE # COx += 2*8Byte
    FETCH $0,0(CO1)
    FETCH $0,2*SIZE(CO2)
    FETCH $0,2*SIZE(CO3)
    FETCH $0,2*SIZE(CO4)
    ST t14,0(CO4)
    daddu CO2,CO2,2*SIZE
    ST t24,1*SIZE(CO4)
    daddu CO3,CO3,2*SIZE
    daddu CO4,CO4,2*SIZE
/* N = 4 panel, 1-row tail: a0..a3 hold four consecutive k values of A. */
.L14_M1:
    and M,MCO,1 # Remainder M = 1
    beqz M,.L0_N4_Loop # M = 0, finishing one panel B
    nop
.L30:
    gsLQC1(R8,F1,F0,0)
    dsra K,KCO,2 # K=KCO/4
    gsLQC1(R9,F9,F8,0) #b0,b1
    MTC $0,t11
    gsLQC1(R9,F11,F10,1) #b2,b3
    MOV t12,t11
    MOV t13,t11
    beqz K,.L35
    MOV t14,t11 # (delay slot)
.L31: # N=4 m=1,=K=4
    gsLQC1(R8,F3,F2,1)
    gsLQC1(R9,F13,F12,2) # R9=B
    MADD t11,t11,a0,b0
    MADD t12,t12,a0,b1
    gsLQC1(R9,F15,F14,3)
    MADD t13,t13,a0,b2
    MADD t14,t14,a0,b3
    gsLQC1(R9,F9,F8,4)
    MADD t11,t11,a1,b4
    MADD t12,t12,a1,b5
    gsLQC1(R9,F11,F10,5)
    MADD t13,t13,a1,b6
    MADD t14,t14,a1,b7
    daddiu K,K,-1
    gsLQC1(R9,F13,F12,6)
    MADD t11,t11,a2,b0
    MADD t12,t12,a2,b1
    daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=4*SIZE
    gsLQC1(R9,F15,F14,7)
    MADD t13,t13,a2,b2
    MADD t14,t14,a2,b3
    daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE
    gsLQC1(R8,F1,F0,0)
    gsLQC1(R9,F9,F8,0)
    MADD t11,t11,a3,b4
    MADD t12,t12,a3,b5
    gsLQC1(R9,F11,F10,1)
    MADD t13,t13,a3,b6
    bnez K,.L31
    MADD t14,t14,a3,b7 # (delay slot)
.L35: # N=4 M=1 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L38
    nop
.L36:
    gsLQC1(R9,F13,F12,2) # R9=B
    MADD t11,t11,a0,b0
    MADD t12,t12,a0,b1
    daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
    gsLQC1(R9,F15,F14,3)
    MADD t13,t13,a0,b2
    MADD t14,t14,a0,b3
    daddu B,B,8*SIZE
.L37:
    LD a0,0(A) # a1 still holds the second k value loaded earlier
    gsLQC1(R9,F9,F8,0)
    MADD t11,t11,a1,b4
    MADD t12,t12,a1,b5
    gsLQC1(R9,F11,F10,1)
    MADD t13,t13,a1,b6
    MADD t14,t14,a1,b7
.L38: # N=4, M=1, K=1
    and K,KCO,1
    beqz K,.L39 #
    LD ALPHA,152($sp) # (delay slot) Get ALPHA
    MADD t11,t11,a0,b0
    MADD t12,t12,a0,b1
    daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=8
    daddu B,B,4*SIZE
    MADD t13,t13,a0,b2
    MADD t14,t14,a0,b3
.L39: # Write Back
    LD c11,0(CO1) # Fetch 4 C (1 row x 4 columns)
    LD c12,0(CO2)
    LD c13,0(CO3)
    LD c14,0(CO4)
    MADD t11,c11,t11,ALPHA
    MADD t12,c12,t12,ALPHA
    MADD t13,c13,t13,ALPHA
    MADD t14,c14,t14,ALPHA
    ST t11,0(CO1)
    ST t12,0(CO2)
    ST t13,0(CO3)
    ST t14,0(CO4)
.L0_N4_Loop:
    daddu BO,BO,SPANB # BO point to next panel B
    daddiu N,N,-1 # N--
    daddu C,C,SPANC # C point to next panel C
    bnez N,.L0_N4_Lb # N!=0
    move B,BO # (delay slot) Set B
/* ------------------------------------------------------------------ */
/* N = 2 panel (at most one, taken when NCO & 2)                       */
/* ------------------------------------------------------------------ */
    .align 5
.L0_N2:
    and N,NCO,2 # Remainder N = 2
    beqz N,.L0_N1 # N=0,NCO<2
    dsll SPANC,LDC,1 # (delay slot) SPANC=LDC*2
.L0_N2_Lb:
    move CO1,C # Set C
    dsra M,MCO,2 # M=MCO/4
    dsll SPANB,KCO,1+BASE_SHIFT # SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4
    move A,AO # Reset A
    daddu CO2,CO1,LDC
    beqz M,.L12_M2
    daddu PREA,AO,SPANA # (delay slot)
.L40:
    MTC $0,t11
    MOV t21,t11
    gsLQC1(R8,F1,F0,0) #a0,a1
    MOV t31,t11
    MOV t41,t11
    gsLQC1(R9,F9,F8,0) #b0,b1
    dsra K,KCO,2 # K=KCO/4
    MOV t12,t11
    gsLQC1(R8,F3,F2,1) #a2,a3
    MOV t22,t11
    MOV t32,t11
    MOV t42,t11
    beqz K,.L45
    nop
.L41: # N=2,M=K=4
    gsLQC1(R8,F5,F4,2) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R9,F13,F12,1) # R9=B
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    gsLQC1(R8,F7,F6,3)
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    FETCH $0,(PREA)
    MADD t32,t32,a2,b1
    MADD t42,t42,a3,b1
.L42:
    gsLQC1(R8,F1,F0,4)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F11,F10,2)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
    gsLQC1(R8,F3,F2,5)
    MADD t31,t31,a6,b4
    MADD t41,t41,a7,b4
    FETCH $0,4*SIZE(PREA)
    MADD t32,t32,a6,b5
    MADD t42,t42,a7,b5
.L43:
    gsLQC1(R8,F5,F4,6)
    MADD t11,t11,a0,b2
    MADD t21,t21,a1,b2
    gsLQC1(R9,F15,F14,3)
    MADD t12,t12,a0,b3
    MADD t22,t22,a1,b3
    gsLQC1(R8,F7,F6,7)
    MADD t31,t31,a2,b2
    MADD t41,t41,a3,b2
    daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
    FETCH $0,8*SIZE(PREA)
    MADD t32,t32,a2,b3
    MADD t42,t42,a3,b3
    daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
.L44:
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a4,b6
    MADD t21,t21,a5,b6
    daddiu K,K,-1
    gsLQC1(R9,F9,F8,0)
    MADD t12,t12,a4,b7
    MADD t22,t22,a5,b7
    daddu PREA,PREA,16*SIZE
    gsLQC1(R8,F3,F2,1)
    MADD t31,t31,a6,b6
    MADD t41,t41,a7,b6
    FETCH $0,-4*SIZE(PREA) # PREA was already advanced, this is old PREA+12*SIZE
    MADD t32,t32,a6,b7
    bnez K,.L41
    MADD t42,t42,a7,b7 # (delay slot)
.L45: # N=2 M=4 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L48
    nop
.L46:
    gsLQC1(R8,F5,F4,2) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R9,F13,F12,1) # R9=B
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    gsLQC1(R8,F7,F6,3)
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
    FETCH $0,0(PREA)
    MADD t32,t32,a2,b1
    MADD t42,t42,a3,b1
    daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
.L47:
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F9,F8,0)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
    gsLQC1(R8,F3,F2,1)
    MADD t31,t31,a6,b4
    MADD t41,t41,a7,b4
    FETCH $0,4*SIZE(PREA)
    MADD t32,t32,a6,b5
    MADD t42,t42,a7,b5
    daddu PREA,PREA,8*SIZE
.L48: # N=2, M=4, K=1
    and K,KCO,1
    beqz K,.L49 #
    LD ALPHA,152($sp) # (delay slot) Get ALPHA
    FETCH $0,0(PREA)
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    daddu B,B,2*SIZE
    daddu PREA,PREA,4*SIZE
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    MADD t32,t32,a2,b1
    MADD t42,t42,a3,b1
.L49: # Write Back
    LD c11,0(CO1) # Fetch 8 C (4 rows x 2 columns)
    LD c21,1*SIZE(CO1)
    LD c31,2*SIZE(CO1)
    LD c41,3*SIZE(CO1)
    LD c12,0(CO2)
    MADD t11,c11,t11,ALPHA
    LD c22,1*SIZE(CO2)
    MADD t21,c21,t21,ALPHA
    LD c32,2*SIZE(CO2)
    MADD t31,c31,t31,ALPHA
    LD c42,3*SIZE(CO2)
    MADD t41,c41,t41,ALPHA
    ST t11,0(CO1)
    MADD t12,c12,t12,ALPHA
    ST t21,1*SIZE(CO1)
    MADD t22,c22,t22,ALPHA
    ST t31,2*SIZE(CO1)
    MADD t32,c32,t32,ALPHA
    ST t41,3*SIZE(CO1)
    MADD t42,c42,t42,ALPHA
    daddiu M,M,-1 # M--
    ST t12,0(CO2)
    ST t22,1*SIZE(CO2)
    ST t32,2*SIZE(CO2)
    ST t42,3*SIZE(CO2)
    FETCH $0,4*SIZE(CO1)
    FETCH $0,4*SIZE(CO2)
    FETCH $0,8*SIZE(CO1)
    FETCH $0,8*SIZE(CO2)
    daddu CO1,CO1,4*SIZE # COx += 4*8Byte
    daddu CO2,CO2,4*SIZE
    bnez M,.L40 # M!=0
    move B,BO # (delay slot) Reset B
/* N = 2 panel, 2-row tail. */
.L12_M2:
    and M,MCO,2 # Remainder M = 2
    beqz M,.L12_M1
    nop
.L50:
    dsra K,KCO,2 # K=KCO/4
    MTC $0,t11
    gsLQC1(R8,F1,F0,0) #a0,a1
    MOV t21,t11
    MOV t12,t11
    gsLQC1(R9,F9,F8,0) #b0,b1
    MOV t22,t11
    beqz K,.L55
    nop
.L51: # N=2 m=2,=K=4
    gsLQC1(R8,F5,F4,1) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R9,F13,F12,1) # R9=B
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    gsLQC1(R8,F3,F2,2)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F11,F10,2)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
    daddiu K,K,-1
    gsLQC1(R8,F7,F6,3)
    MADD t11,t11,a2,b2
    MADD t21,t21,a3,b2
    daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
    gsLQC1(R9,F15,F14,3)
    MADD t12,t12,a2,b3
    MADD t22,t22,a3,b3
    daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a6,b6
    MADD t21,t21,a7,b6
    gsLQC1(R9,F9,F8,0)
    MADD t12,t12,a6,b7
    bnez K,.L51
    MADD t22,t22,a7,b7 # (delay slot)
.L55: # N=2 M=2 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L58
    nop
.L56:
    gsLQC1(R8,F5,F4,1) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
    gsLQC1(R9,F13,F12,1) # R9=B
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
    daddu B,B,4*SIZE
.L57:
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F9,F8,0)
    MADD t12,t12,a4,b5
    MADD t22,t22,a5,b5
.L58: # N=2, M=2, K=1
    and K,KCO,1
    beqz K,.L59 #
    LD ALPHA,152($sp) # (delay slot) Get ALPHA
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
    daddu B,B,2*SIZE
    MADD t12,t12,a0,b1
    MADD t22,t22,a1,b1
.L59: # Write Back
    LD c11,0(CO1) # Fetch 4 C (2 rows x 2 columns)
    LD c21,1*SIZE(CO1)
    LD c12,0(CO2)
    LD c22,1*SIZE(CO2)
    MADD t11,c11,t11,ALPHA
    MADD t21,c21,t21,ALPHA
    MADD t12,c12,t12,ALPHA
    MADD t22,c22,t22,ALPHA
    ST t11,0(CO1)
    ST t21,1*SIZE(CO1)
    ST t12,0(CO2)
    move B,BO # Reset B
    ST t22,1*SIZE(CO2)
    daddu CO1,CO1,2*SIZE # COx += 2*8Byte
    daddu CO2,CO2,2*SIZE
    FETCH $0,0(CO1)
    FETCH $0,0(CO2)
/*
 * N = 2 panel, 1-row tail. The A pair loads target a0/a4 and a2/a6, so the
 * four consecutive k values of A end up in a0, a4, a2, a6 (in that order).
 */
.L12_M1:
    and M,MCO,1 # Remainder M = 1
    beqz M,.L0_N2_Loop # M = 0, finishing one panel B
    nop
.L60:
    dsra K,KCO,2 # K=KCO/4
    MTC $0,t11
    gsLQC1(R8,F4,F0,0)
    MOV t21,t11
    MOV t12,t11
    gsLQC1(R9,F9,F8,0) #b0,b1
    MOV t22,t11
    beqz K,.L65
    nop
.L61: # N=2 m=1,=K=4
    gsLQC1(R9,F13,F12,1) # R9=B
    MADD t11,t11,a0,b0
    MADD t12,t12,a0,b1
    gsLQC1(R9,F11,F10,2)
    MADD t11,t11,a4,b4
    MADD t12,t12,a4,b5
    daddiu K,K,-1
    gsLQC1(R8,F6,F2,1)
    MADD t11,t11,a2,b2
    gsLQC1(R9,F15,F14,3)
    MADD t12,t12,a2,b3
    daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
    gsLQC1(R8,F4,F0,0)
    daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
    gsLQC1(R9,F9,F8,0)
    MADD t11,t11,a6,b6
    bnez K,.L61
    MADD t12,t12,a6,b7 # (delay slot)
.L65: # N=2 M=1 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L68
    nop
.L66:
    gsLQC1(R9,F13,F12,1) # R9=B
    MADD t11,t11,a0,b0
    MADD t12,t12,a0,b1
    daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
    daddu B,B,4*SIZE
.L67:
    LD a0,0(A) # a4 still holds the second k value loaded earlier
    gsLQC1(R9,F9,F8,0)
    MADD t11,t11,a4,b4
    MADD t12,t12,a4,b5
.L68: # N=2, M=1, K=1
    and K,KCO,1
    beqz K,.L69 #
    LD ALPHA,152($sp) # (delay slot) Get ALPHA
    MADD t11,t11,a0,b0
    MADD t12,t12,a0,b1
    daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=8
    daddu B,B,2*SIZE
.L69: # Write Back
    LD c11,0(CO1) # Fetch 2 C (1 row x 2 columns)
    LD c12,0(CO2)
    MADD t11,c11,t11,ALPHA
    MADD t12,c12,t12,ALPHA
    ST t11,0(CO1)
    ST t12,0(CO2)
    move B,BO # Reset B
    daddu CO1,CO1,1*SIZE # COx += 1*8Byte
    daddu CO2,CO2,1*SIZE
    FETCH $0,0(CO1)
    FETCH $0,0(CO2)
.L0_N2_Loop:
    daddu BO,BO,SPANB # BO+=KC*2N
    move B,BO # Set B
    daddu C,C,SPANC # C+=LDC*2
/* ------------------------------------------------------------------ */
/* N = 1 panel (taken when NCO & 1)                                    */
/* The B pair loads target b0/b4 and b2/b6, so four consecutive k      */
/* values of B end up in b0, b4, b2, b6 (in that order).               */
/* ------------------------------------------------------------------ */
    .align 5
.L0_N1:
    and N,NCO,1 # Remainder N = 1
    beqz N,.L999 # N=0,NCO<1
    nop
    move CO1,C # Set C
    dsra M,MCO,2 # M=MCO/4
    move A,AO # Reset A
    beqz M,.L11_M2
    daddu PREA,AO,SPANA # (delay slot)
.L70:
    dsra K,KCO,2 # K=KCO/4
    gsLQC1(R9,F12,F8,0)
    MTC $0,t11
    gsLQC1(R8,F1,F0,0) #a0,a1
    MOV t21,t11
    gsLQC1(R8,F3,F2,1) #a2,a3
    MOV t31,t11
    beqz K,.L75
    MOV t41,t11 # (delay slot)
.L71: # N=1,M=K=4
    gsLQC1(R8,F5,F4,2) # R8=A
    gsLQC1(R8,F7,F6,3)
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    FETCH $0,(PREA)
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
.L72:
    gsLQC1(R9,F14,F10,1)
    gsLQC1(R8,F1,F0,4)
    gsLQC1(R8,F3,F2,5)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    FETCH $0,4*SIZE(PREA)
    MADD t31,t31,a6,b4
    MADD t41,t41,a7,b4
.L73:
    gsLQC1(R8,F5,F4,6)
    gsLQC1(R8,F7,F6,7)
    MADD t11,t11,a0,b2
    MADD t21,t21,a1,b2
    daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
    FETCH $0,8*SIZE(PREA)
    MADD t31,t31,a2,b2
    MADD t41,t41,a3,b2
    daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
.L74:
    gsLQC1(R9,F12,F8,0)
    gsLQC1(R8,F1,F0,0)
    daddu PREA,PREA,16*SIZE
    gsLQC1(R8,F3,F2,1)
    MADD t11,t11,a4,b6
    MADD t21,t21,a5,b6
    daddiu K,K,-1
    FETCH $0,-32(PREA) # hard-coded byte offset, 4 elements of 8 bytes back
    MADD t31,t31,a6,b6
    bnez K,.L71
    MADD t41,t41,a7,b6 # (delay slot)
.L75: # N=1 M=4 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L78
    nop
.L76:
    gsLQC1(R8,F5,F4,2) # R8=A
    gsLQC1(R8,F7,F6,3)
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
    FETCH $0,0(PREA)
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
.L77:
    LD b0,0(B) # b4 still holds the second k value loaded earlier
    gsLQC1(R8,F1,F0,0)
    gsLQC1(R8,F3,F2,1)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    FETCH $0,4*SIZE(PREA)
    MADD t31,t31,a6,b4
    MADD t41,t41,a7,b4
    daddu PREA,PREA,8*SIZE
.L78: # N=1, M=4, K=1
    and K,KCO,1
    beqz K,.L79 #
    LD ALPHA,152($sp) # (delay slot) Get ALPHA
    FETCH $0,0(PREA)
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
    MADD t31,t31,a2,b0
    MADD t41,t41,a3,b0
    daddu B,B,1*SIZE
    daddu PREA,PREA,4*SIZE
.L79: # Write Back
    LD c11,0(CO1) # Fetch 4 C (4 rows x 1 column)
    LD c21,1*SIZE(CO1)
    LD c31,2*SIZE(CO1)
    LD c41,3*SIZE(CO1)
    MADD t11,c11,t11,ALPHA
    MADD t21,c21,t21,ALPHA
    MADD t31,c31,t31,ALPHA
    MADD t41,c41,t41,ALPHA
    ST t11,0(CO1)
    ST t21,1*SIZE(CO1)
    ST t31,2*SIZE(CO1)
    ST t41,3*SIZE(CO1)
    daddiu M,M,-1 # M--
    FETCH $0,4*SIZE(CO1)
    daddu CO1,CO1,4*SIZE # COx += 4*8Byte
    bnez M,.L70 # M!=0
    move B,BO # (delay slot) Reset B
/* N = 1 panel, 2-row tail. */
.L11_M2:
    and M,MCO,2 # Remainder M = 2
    beqz M,.L11_M1
    nop
.L80:
    dsra K,KCO,2 # K=KCO/4
    gsLQC1(R9,F12,F8,0)
    MTC $0,t11
    gsLQC1(R8,F1,F0,0) #a0,a1
    MOV t21,t11
    beqz K,.L85
    nop
.L81: # N=1,M=2,K=4
    gsLQC1(R8,F5,F4,1) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    gsLQC1(R8,F3,F2,2)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
    gsLQC1(R9,F14,F10,1)
    daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
    gsLQC1(R8,F7,F6,3)
    MADD t11,t11,a2,b2
    MADD t21,t21,a3,b2
    daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
    gsLQC1(R9,F12,F8,0)
    daddiu K,K,-1
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a6,b6
    bnez K,.L81
    MADD t21,t21,a7,b6 # (delay slot)
.L85: # N=1 M=2 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L88
    nop
.L86:
    gsLQC1(R8,F5,F4,1) # R8=A
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
    LD b0,0(B)
    daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
    gsLQC1(R8,F1,F0,0)
    MADD t11,t11,a4,b4
    MADD t21,t21,a5,b4
.L88: # N=1, M=2, K=1
    and K,KCO,1
    beqz K,.L89 #
    LD ALPHA,152($sp) # (delay slot) Get ALPHA
    MADD t11,t11,a0,b0
    MADD t21,t21,a1,b0
    daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
    daddu B,B,1*SIZE
.L89: # Write Back
    LD c11,0(CO1) # Fetch 2 C (2 rows x 1 column)
    LD c21,1*SIZE(CO1)
    MADD t11,c11,t11,ALPHA
    MADD t21,c21,t21,ALPHA
    ST t11,0(CO1)
    ST t21,1*SIZE(CO1)
    FETCH $0,2*SIZE(CO1)
    daddu CO1,CO1,2*SIZE # COx += 2*8Byte
    move B,BO # Reset B
/* N = 1 panel, 1-row tail: a plain dot product over K. */
.L11_M1:
    and M,MCO,1 # Remainder M = 1
    beqz M,.L999 # M = 0, End
    nop
.L90:
    dsra K,KCO,2 # K=KCO/4
    gsLQC1(R8,F4,F0,0)
    gsLQC1(R9,F12,F8,0)
    beqz K,.L95
    MTC $0,t11 # (delay slot)
.L91: # N=1,M=1,K=4
    gsLQC1(R8,F6,F2,1)
    MADD t11,t11,a0,b0
    gsLQC1(R9,F14,F10,1)
    MADD t11,t11,a4,b4
    daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
    gsLQC1(R8,F4,F0,0)
    MADD t11,t11,a2,b2
    daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
    gsLQC1(R9,F12,F8,0)
    MADD t11,t11,a6,b6
    daddiu K,K,-1
    bnez K,.L91
    nop
.L95: # N=1 M=1 K=2
    and K,KCO,2 # k = KCO&2
    beqz K,.L98
    nop
.L96:
    MADD t11,t11,a0,b0
    MADD t11,t11,a4,b4
    daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
    daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
    LD b0,0(B)
    LD a0,0(A)
.L98: # N=1, M=1, K=1
    and K,KCO,1
    beqz K,.L99 #
    LD ALPHA,152($sp) # (delay slot) Get ALPHA
    MADD t11,t11,a0,b0
.L99: # Write Back
    LD c11,0(CO1) # Fetch 1 C
    MADD t11,c11,t11,ALPHA
    ST t11,0(CO1)
/* Epilogue: restore every register saved in the prologue and return. */
.L999: # End
    ld $16, 0($sp)
    ld $17, 8($sp)
    ld $18, 16($sp)
    ld $19, 24($sp)
    ld $20, 32($sp)
    ld $21, 40($sp)
    ld $22, 48($sp)
    LD $f24, 56($sp)
    LD $f25, 64($sp)
    LD $f26, 72($sp)
    LD $f27, 80($sp)
    LD $f28, 88($sp)
    ld $23, 96($sp)
    ld $24, 104($sp)
    ld $25, 112($sp)
    LD $f20,120($sp)
    LD $f21,128($sp)
    LD $f22,136($sp)
    LD $f23,144($sp)
    LD $f29,160($sp)
    LD $f30,168($sp)
    LD $f31,176($sp)
    j $31
    daddiu $sp, $sp, 192 # (delay slot) release the frame
EPILOGUE