From ac494c0d04e12c83b38cab845578b5c147696232 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 6 Apr 2011 10:36:44 +0000 Subject: [PATCH 01/42] New kernel in LOONGSON3A. --- kernel/mips64/gemm_kernel_loongson3a.S | 1631 ++++++++++++++++++++++++ 1 file changed, 1631 insertions(+) create mode 100644 kernel/mips64/gemm_kernel_loongson3a.S diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S new file mode 100644 index 000000000..d19d65469 --- /dev/null +++ b/kernel/mips64/gemm_kernel_loongson3a.S @@ -0,0 +1,1631 @@ +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define REALNAME ASMNAME +#define PROLOGUE \ + .text ;\ + .set mips64 ;\ + .align 5 ;\ + .globl REALNAME ;\ + .ent REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + .set noreorder ;\ + .set nomacro + +#define EPILOGUE \ + .set macro ;\ + .set reorder ;\ + .end REALNAME +#define BASE_SHIFT 3 +#define FETCH ld + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define SPANC $22 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 +#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 
+#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 $f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) + + + .align 5 # BACKUP +.L0_N4: # Loop N + sdc1 ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + move BO,B # Backup B_addr + + dsll LDC,LDC,3 # LDC*8Byte + dsll SPANB,KCO,5 # SPANB=KC*NR(4)*8Byte=KC*2^5 + + dsll SPANA,KCO,5 # SPANA = KCO*4mr*8Byte + dsra N,NCO,2 # N=NCO/2 + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANC,LDC,2 # SPANC=LDC*4 + +.L0_N4_Lb: + move CO1,C # Set C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + daddu CO2,CO1,LDC + + daddu CO3,CO2,LDC + daddu PREB,BO,SPANB # PreB point next panelB + + daddu CO4,CO3,LDC + beqz M,.L14_M2 + daddu PREA,AO,SPANA + +.L10: + 
dmtc1 $0,t11 + mov.d t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + mov.d t31,t11 + mov.d t41,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + mov.d t12,t11 + mov.d t22,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + + mov.d t32,t11 + mov.d t42,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + + dsra K,KCO,2 # K=KCO/2 + mov.d t13,t11 + + mov.d t23,t11 + mov.d t33,t11 + + mov.d t43,t11 + mov.d t14,t11 + + mov.d t24,t11 + mov.d t34,t11 + + mov.d t44,t11 + beqz K,.L15 + nop + +.L11: # N=M=K=4 + gsLQC1(R8,F5,F4,2) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) # R9=B + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + + gsLQC1(R9,F15,F14,3) + madd.d t32,t32,a2,b1 + madd.d t42,t42,a3,b1 + + FETCH $0,(PREB) + madd.d t13,t13,a0,b2 + madd.d t23,t23,a1,b2 + + madd.d t14,t14,a0,b3 + madd.d t24,t24,a1,b3 + + FETCH $0,(PREA) + madd.d t33,t33,a2,b2 + madd.d t43,t43,a3,b2 + + madd.d t34,t34,a2,b3 + madd.d t44,t44,a3,b3 + #load2 comp1 +.L12: + gsLQC1(R8,F1,F0,4) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,4) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,5) + madd.d t31,t31,a6,b4 + madd.d t41,t41,a7,b4 + + gsLQC1(R9,F11,F10,5) + madd.d t32,t32,a6,b5 + madd.d t42,t42,a7,b5 + + FETCH $0,32(PREB) + madd.d t13,t13,a4,b6 + madd.d t23,t23,a5,b6 + + madd.d t14,t14,a4,b7 + madd.d t24,t24,a5,b7 + + FETCH $0,32(PREA) + madd.d t33,t33,a6,b6 + madd.d t43,t43,a7,b6 + + madd.d t34,t34,a6,b7 + madd.d t44,t44,a7,b7 + +.L13: + gsLQC1(R8,F5,F4,6) + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,6) + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,7) + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + + gsLQC1(R9,F15,F14,7) + madd.d t32,t32,a2,b1 + madd.d t42,t42,a3,b1 + daddu A,A,128 # A+=4(mr)*4(kr)*8Byte=128 + + FETCH $0,64(PREB) + madd.d t13,t13,a0,b2 + madd.d t23,t23,a1,b2 + daddu B,B,128 + + madd.d t14,t14,a0,b3 + madd.d t24,t24,a1,b3 + + FETCH $0,64(PREA) + madd.d 
t33,t33,a2,b2 + madd.d t43,t43,a3,b2 + + madd.d t34,t34,a2,b3 + madd.d t44,t44,a3,b3 + +.L14: + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + madd.d t31,t31,a6,b4 + madd.d t41,t41,a7,b4 + daddiu K,K,-1 + + gsLQC1(R9,F11,F10,1) + madd.d t32,t32,a6,b5 + madd.d t42,t42,a7,b5 + + FETCH $0,96(PREB) + madd.d t13,t13,a4,b6 + madd.d t23,t23,a5,b6 + + madd.d t14,t14,a4,b7 + madd.d t24,t24,a5,b7 + + FETCH $0,96(PREA) + madd.d t33,t33,a6,b6 + madd.d t43,t43,a7,b6 + daddu PREB,PREB,128 + + madd.d t34,t34,a6,b7 + daddu PREA,PREA,128 + bnez K,.L11 + madd.d t44,t44,a7,b7 + +.L15: # N=4 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L18 + nop + +.L16: + gsLQC1(R8,F5,F4,2) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) # R9=B + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + + gsLQC1(R9,F15,F14,3) + madd.d t32,t32,a2,b1 + madd.d t42,t42,a3,b1 + daddu A,A,64 # A+=4(mr)*2(kr)*8Byte=64 + + FETCH $0,0(PREB) + madd.d t13,t13,a0,b2 + madd.d t23,t23,a1,b2 + daddu B,B,64 + + madd.d t14,t14,a0,b3 + madd.d t24,t24,a1,b3 + + FETCH $0,0(PREA) + madd.d t33,t33,a2,b2 + madd.d t43,t43,a3,b2 + + madd.d t34,t34,a2,b3 + madd.d t44,t44,a3,b3 + +.L17: + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + madd.d t31,t31,a6,b4 + madd.d t41,t41,a7,b4 + + gsLQC1(R9,F11,F10,1) + madd.d t32,t32,a6,b5 + madd.d t42,t42,a7,b5 + + FETCH $0,32(PREB) + madd.d t13,t13,a4,b6 + madd.d t23,t23,a5,b6 + + madd.d t14,t14,a4,b7 + madd.d t24,t24,a5,b7 + + FETCH $0,32(PREA) + madd.d t33,t33,a6,b6 + madd.d t43,t43,a7,b6 + daddu PREB,PREB,64 + + madd.d t34,t34,a6,b7 + madd.d t44,t44,a7,b7 + daddu PREA,PREA,64 + +.L18: # N=4, M=4, K=1 + and K,KCO,1 + beqz K,.L19 # + ldc1 ALPHA,152($sp) # Get ALPHA + + FETCH 
$0,0(PREB) + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu A,A,32 # A+=4(mr)*1(kr)*8Byte=32 + + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + daddu B,B,32 + + FETCH $0,0(PREA) + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + daddu PREB,PREB,32 + + madd.d t32,t32,a2,b1 + madd.d t42,t42,a3,b1 + daddu PREA,PREA,32 + + madd.d t13,t13,a0,b2 + madd.d t23,t23,a1,b2 + + madd.d t14,t14,a0,b3 + madd.d t24,t24,a1,b3 + + madd.d t33,t33,a2,b2 + madd.d t43,t43,a3,b2 + + madd.d t34,t34,a2,b3 + madd.d t44,t44,a3,b3 + +.L19: # Write Back + ldc1 c11,0(CO1) # Fetch 16 C + ldc1 c21,8(CO1) + ldc1 c31,16(CO1) + ldc1 c41,24(CO1) + + ldc1 c12,0(CO2) + madd.d t11,c11,t11,ALPHA + ldc1 c22,8(CO2) + madd.d t21,c21,t21,ALPHA + ldc1 c32,16(CO2) + madd.d t31,c31,t31,ALPHA + ldc1 c42,24(CO2) + madd.d t41,c41,t41,ALPHA + + ldc1 c13,0(CO3) + madd.d t12,c12,t12,ALPHA + ldc1 c23,8(CO3) + madd.d t22,c22,t22,ALPHA + ldc1 c33,16(CO3) + madd.d t32,c32,t32,ALPHA + ldc1 c43,24(CO3) + madd.d t42,c42,t42,ALPHA + + ldc1 c14,0(CO4) + madd.d t13,c13,t13,ALPHA + ldc1 c24,8(CO4) + madd.d t23,c23,t23,ALPHA + ldc1 c34,16(CO4) + madd.d t33,c33,t33,ALPHA + ldc1 c44,24(CO4) + madd.d t43,c43,t43,ALPHA + + sdc1 t11,0(CO1) + madd.d t14,c14,t14,ALPHA + sdc1 t21,8(CO1) + madd.d t24,c24,t24,ALPHA + sdc1 t31,16(CO1) + madd.d t34,c34,t34,ALPHA + sdc1 t41,24(CO1) + madd.d t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + sdc1 t12,0(CO2) + sdc1 t22,8(CO2) + sdc1 t32,16(CO2) + sdc1 t42,24(CO2) + + sdc1 t13,0(CO3) + sdc1 t23,8(CO3) + sdc1 t33,16(CO3) + sdc1 t43,24(CO3) + + FETCH $0,32(CO1) + FETCH $0,32(CO2) + FETCH $0,32(CO3) + FETCH $0,32(CO4) + + sdc1 t14,0(CO4) + daddu CO1,CO1,32 # COx += 4*8Byte + sdc1 t24,8(CO4) + daddu CO2,CO2,32 + sdc1 t34,16(CO4) + daddu CO3,CO3,32 + sdc1 t44,24(CO4) + move B,BO # Reset B + daddu PREB,BO,SPANB + bnez M,.L10 # M!=0 + daddu CO4,CO4,32 + + + +.L14_M2: + and M,MCO,2 # Remainder M = 2 + beqz M,.L14_M1 + nop + +.L20: + dmtc1 $0,t11 + mov.d t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + mov.d t12,t11 + 
mov.d t22,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + dsra K,KCO,2 # K=KCO/2 + mov.d t13,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + + mov.d t23,t11 + mov.d t14,t11 + + mov.d t24,t11 + beqz K,.L25 + nop + +.L21: # N=4 m=2,=K=4 + gsLQC1(R8,F5,F4,1) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) # R9=B + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + gsLQC1(R9,F15,F14,3) + madd.d t13,t13,a0,b2 + madd.d t23,t23,a1,b2 + + gsLQC1(R8,F3,F2,2) + madd.d t14,t14,a0,b3 + madd.d t24,t24,a1,b3 + + gsLQC1(R9,F9,F8,4) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F11,F10,5) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + + gsLQC1(R8,F7,F6,3) + madd.d t13,t13,a4,b6 + madd.d t23,t23,a5,b6 + + madd.d t14,t14,a4,b7 + madd.d t24,t24,a5,b7 + + gsLQC1(R9,F13,F12,6) + madd.d t11,t11,a2,b0 + madd.d t21,t21,a3,b0 + daddu A,A,64 # A+=2(mr)*4(kr)*8Byte=64 + + gsLQC1(R9,F15,F14,7) + madd.d t12,t12,a2,b1 + madd.d t22,t22,a3,b1 + daddiu K,K,-1 + + gsLQC1(R8,F1,F0,0) + madd.d t13,t13,a2,b2 + madd.d t23,t23,a3,b2 + daddu B,B,128 # B+=4(nr)*4(kr)*8Byte=128 + + madd.d t14,t14,a2,b3 + madd.d t24,t24,a3,b3 + + gsLQC1(R9,F9,F8,0) + madd.d t11,t11,a6,b4 + madd.d t21,t21,a7,b4 + + gsLQC1(R9,F11,F10,1) + madd.d t12,t12,a6,b5 + madd.d t22,t22,a7,b5 + + madd.d t13,t13,a6,b6 + madd.d t23,t23,a7,b6 + + madd.d t14,t14,a6,b7 + bnez K,.L21 + madd.d t24,t24,a7,b7 + +.L25: # N=4 M=2 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L28 + nop + +.L26: + gsLQC1(R8,F5,F4,1) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) # R9=B + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + daddu A,A,32 # A+=2(mr)*2(kr)*8Byte=32 + + gsLQC1(R9,F15,F14,3) + madd.d t13,t13,a0,b2 + madd.d t23,t23,a1,b2 + daddu B,B,64 + + madd.d t14,t14,a0,b3 + madd.d t24,t24,a1,b3 + +.L27: + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + + gsLQC1(R9,F11,F10,1) + madd.d t13,t13,a4,b6 + madd.d t23,t23,a5,b6 + + 
madd.d t14,t14,a4,b7 + madd.d t24,t24,a5,b7 + +.L28: # N=4, M=2, K=1 + and K,KCO,1 + beqz K,.L29 # + ldc1 ALPHA,152($sp) # Get ALPHA + + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu A,A,16 # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,32 + + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + madd.d t13,t13,a0,b2 + madd.d t23,t23,a1,b2 + + madd.d t14,t14,a0,b3 + madd.d t24,t24,a1,b3 + +.L29: # Write Back + ldc1 c11,0(CO1) # Fetch 8 C (2 rows x 4 columns) + ldc1 c21,8(CO1) + + ldc1 c12,0(CO2) + ldc1 c22,8(CO2) + + ldc1 c13,0(CO3) + madd.d t11,c11,t11,ALPHA + ldc1 c23,8(CO3) + madd.d t21,c21,t21,ALPHA + + ldc1 c14,0(CO4) + madd.d t12,c12,t12,ALPHA + ldc1 c24,8(CO4) + madd.d t22,c22,t22,ALPHA + + sdc1 t11,0(CO1) + madd.d t13,c13,t13,ALPHA + sdc1 t21,8(CO1) + madd.d t23,c23,t23,ALPHA + + sdc1 t12,0(CO2) + madd.d t14,c14,t14,ALPHA + sdc1 t22,8(CO2) + madd.d t24,c24,t24,ALPHA + + sdc1 t13,0(CO3) + move B,BO # Reset B + sdc1 t23,8(CO3) + daddu CO1,CO1,16 # COx += 2*8Byte + + FETCH $0,0(CO1) + FETCH $0,16(CO2) + FETCH $0,16(CO3) + FETCH $0,16(CO4) + + sdc1 t14,0(CO4) + daddu CO2,CO2,16 + sdc1 t24,8(CO4) + daddu CO3,CO3,16 + daddu CO4,CO4,16 + + + +.L14_M1: + and M,MCO,1 # Remainder M = 1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel B + nop + +.L30: + ldc1 a0,0(A) + dsra K,KCO,2 # K=KCO/4 + gsLQC1(R9,F9,F8,0) #b0,b1 + dmtc1 $0,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + mov.d t12,t11 + mov.d t13,t11 + beqz K,.L35 # KCO<4: skip the unrolled loop, go to the M=1 K-remainder code (was .L25, the M=2 path) + mov.d t14,t11 + +.L31: # N=4 m=1,=K=4 + ldc1 a1,8(A) + + gsLQC1(R9,F13,F12,2) # R9=B + madd.d t11,t11,a0,b0 + madd.d t12,t12,a0,b1 + + gsLQC1(R9,F15,F14,3) + madd.d t13,t13,a0,b2 + madd.d t14,t14,a0,b3 + + ldc1 a2,16(A) + + gsLQC1(R9,F9,F8,4) + madd.d t11,t11,a1,b4 + madd.d t12,t12,a1,b5 + + gsLQC1(R9,F11,F10,5) + madd.d t13,t13,a1,b6 + madd.d t14,t14,a1,b7 + + ldc1 a3,24(A) + daddiu K,K,-1 + + gsLQC1(R9,F13,F12,6) + madd.d t11,t11,a2,b0 + madd.d t12,t12,a2,b1 + daddu A,A,32 # A+=1(mr)*4(kr)*8Byte=32 + + gsLQC1(R9,F15,F14,7) + madd.d t13,t13,a2,b2 + madd.d t14,t14,a2,b3 + daddu B,B,128 #
B+=4(nr)*4(kr)*8Byte=128 + + ldc1 a0,0(A) + + gsLQC1(R9,F9,F8,0) + madd.d t11,t11,a3,b4 + madd.d t12,t12,a3,b5 + + gsLQC1(R9,F11,F10,1) + madd.d t13,t13,a3,b6 + bnez K,.L31 + madd.d t14,t14,a3,b7 + +.L35: # N=4 M=1 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L38 + nop + +.L36: + ldc1 a1,8(A) + + gsLQC1(R9,F13,F12,2) # R9=B + madd.d t11,t11,a0,b0 + madd.d t12,t12,a0,b1 + daddu A,A,16 # A+=1(mr)*2(kr)*8Byte=32 + + gsLQC1(R9,F15,F14,3) + madd.d t13,t13,a0,b2 + madd.d t14,t14,a0,b3 + daddu B,B,64 + + +.L37: + ldc1 a0,0(A) + + gsLQC1(R9,F9,F8,0) + madd.d t11,t11,a1,b4 + madd.d t12,t12,a1,b5 + + gsLQC1(R9,F11,F10,1) + madd.d t13,t13,a1,b6 + madd.d t14,t14,a1,b7 + +.L38: # N=4, M=1, K=1 + and K,KCO,1 + beqz K,.L39 # + ldc1 ALPHA,152($sp) # Get ALPHA + + madd.d t11,t11,a0,b0 + madd.d t12,t12,a0,b1 + daddu A,A,8 # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,32 + + madd.d t13,t13,a0,b2 + madd.d t14,t14,a0,b3 + +.L39: # Write Back + ldc1 c11,0(CO1) # Fetch 16 C + ldc1 c12,0(CO2) + ldc1 c13,0(CO3) + ldc1 c14,0(CO4) + + madd.d t11,c11,t11,ALPHA + madd.d t12,c12,t12,ALPHA + madd.d t13,c13,t13,ALPHA + madd.d t14,c14,t14,ALPHA + + sdc1 t11,0(CO1) + sdc1 t12,0(CO2) + sdc1 t13,0(CO3) + sdc1 t14,0(CO4) + + +.L0_N4_Loop: + daddu BO,BO,SPANB # BO point to next panel B + daddiu N,N,-1 # N-- + daddu C,C,SPANC # C pointe to next panel C + bnez N,.L0_N4_Lb # N!=0 + move B,BO # Set B + + + + .align 5 +.L0_N2: + and N,NCO,2 # Remainder N = 2 + beqz N,.L0_N1 # N=0,NCO<2 + dsll SPANC,LDC,1 # SPANC=LDC*2 + +.L0_N2_Lb: + move CO1,C # Set C + dsra M,MCO,2 # M=MCO/2 + + dsll SPANB,KCO,4 # SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4 + move A,AO # Reset A + + daddu CO2,CO1,LDC + beqz M,.L12_M2 + daddu PREA,AO,SPANA + +.L40: + dmtc1 $0,t11 + mov.d t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + mov.d t31,t11 + mov.d t41,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + dsra K,KCO,2 # K=KCO/2 + mov.d t12,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + + mov.d t22,t11 + mov.d t32,t11 + + mov.d t42,t11 + beqz K,.L45 + nop + +.L41: # N=2,M=K=4 + 
gsLQC1(R8,F5,F4,2) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) # R9=B + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + + FETCH $0,(PREA) + madd.d t32,t32,a2,b1 + madd.d t42,t42,a3,b1 + +.L42: + gsLQC1(R8,F1,F0,4) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F11,F10,2) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,5) + madd.d t31,t31,a6,b4 + madd.d t41,t41,a7,b4 + + FETCH $0,32(PREA) + madd.d t32,t32,a6,b5 + madd.d t42,t42,a7,b5 + +.L43: + gsLQC1(R8,F5,F4,6) + madd.d t11,t11,a0,b2 + madd.d t21,t21,a1,b2 + + gsLQC1(R9,F15,F14,3) + madd.d t12,t12,a0,b3 + madd.d t22,t22,a1,b3 + + gsLQC1(R8,F7,F6,7) + madd.d t31,t31,a2,b2 + madd.d t41,t41,a3,b2 + daddu B,B,64 # B+=2(nr)*4(kr)*8Byte=64 + + FETCH $0,64(PREA) + madd.d t32,t32,a2,b3 + madd.d t42,t42,a3,b3 + daddu A,A,128 # A+=4(mr)*4(kr)*8Byte=128 + +.L44: + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a4,b6 + madd.d t21,t21,a5,b6 + daddiu K,K,-1 + + gsLQC1(R9,F9,F8,0) + madd.d t12,t12,a4,b7 + madd.d t22,t22,a5,b7 + daddu PREA,PREA,128 + + gsLQC1(R8,F3,F2,1) + madd.d t31,t31,a6,b6 + madd.d t41,t41,a7,b6 + + FETCH $0,-32(PREA) + madd.d t32,t32,a6,b7 + bnez K,.L41 + madd.d t42,t42,a7,b7 + + +.L45: # N=2 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L48 + nop + +.L46: + gsLQC1(R8,F5,F4,2) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) # R9=B + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + daddu B,B,32 # B+=2(nr)*2(kr)*8Byte=32 + + FETCH $0,0(PREA) + madd.d t32,t32,a2,b1 + madd.d t42,t42,a3,b1 + daddu A,A,64 # A+=4(mr)*2(kr)*8Byte=64 + +.L47: + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + madd.d t31,t31,a6,b4 + madd.d t41,t41,a7,b4 + + FETCH $0,32(PREA) + madd.d t32,t32,a6,b5 + madd.d 
t42,t42,a7,b5 + daddu PREA,PREA,64 + + +.L48: # N=2, M=4, K=1 + and K,KCO,1 + beqz K,.L49 # + ldc1 ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu A,A,32 # A+=4(mr)*1(kr)*8Byte=32 + + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + daddu B,B,32 + daddu PREA,PREA,32 + + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + + madd.d t32,t32,a2,b1 + madd.d t42,t42,a3,b1 + +.L49: # Write Back + ldc1 c11,0(CO1) # Fetch 16 C + ldc1 c21,8(CO1) + ldc1 c31,16(CO1) + ldc1 c41,24(CO1) + + ldc1 c12,0(CO2) + madd.d t11,c11,t11,ALPHA + ldc1 c22,8(CO2) + madd.d t21,c21,t21,ALPHA + ldc1 c32,16(CO2) + madd.d t31,c31,t31,ALPHA + ldc1 c42,24(CO2) + madd.d t41,c41,t41,ALPHA + + sdc1 t11,0(CO1) + madd.d t12,c12,t12,ALPHA + sdc1 t21,8(CO1) + madd.d t22,c22,t22,ALPHA + sdc1 t31,16(CO1) + madd.d t32,c32,t32,ALPHA + sdc1 t41,24(CO1) + madd.d t42,c42,t42,ALPHA + daddiu M,M,-1 # M-- + + sdc1 t12,0(CO2) + sdc1 t22,8(CO2) + sdc1 t32,16(CO2) + sdc1 t42,24(CO2) + + FETCH $0,32(CO1) + FETCH $0,32(CO2) + + daddu CO1,CO1,32 # COx += 4*8Byte + daddu CO2,CO2,32 + bnez M,.L40 # M!=0 + move B,BO # Reset B + + +.L12_M2: + and M,MCO,2 # Remainder M = 2 + beqz M,.L12_M1 + nop + +.L50: + dsra K,KCO,2 # K=KCO/2 + dmtc1 $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + mov.d t21,t11 + mov.d t12,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + mov.d t22,t11 + beqz K,.L55 + nop + +.L51: # N=2 m=2,=K=4 + gsLQC1(R8,F5,F4,1) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) # R9=B + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + gsLQC1(R8,F3,F2,2) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F11,F10,2) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + daddiu K,K,-1 + + gsLQC1(R8,F7,F6,3) + madd.d t11,t11,a2,b2 + madd.d t21,t21,a3,b2 + daddu A,A,64 # A+=2(mr)*4(kr)*8Byte=64 + + gsLQC1(R9,F15,F14,3) + madd.d t12,t12,a2,b3 + madd.d t22,t22,a3,b3 + daddu B,B,64 # B+=2(nr)*4(kr)*8Byte=128 + + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a6,b6 + madd.d t21,t21,a7,b6 
+ + gsLQC1(R9,F9,F8,0) + madd.d t12,t12,a6,b7 + bnez K,.L51 + madd.d t22,t22,a7,b7 + +.L55: # N=2 M=2 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L58 + nop + +.L56: + gsLQC1(R8,F5,F4,1) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu A,A,32 # A+=2(mr)*2(kr)*8Byte=32 + + gsLQC1(R9,F13,F12,1) # R9=B + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + daddu B,B,32 + +.L57: + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + madd.d t12,t12,a4,b5 + madd.d t22,t22,a5,b5 + + +.L58: # N=2, M=2, K=1 + and K,KCO,1 + beqz K,.L59 # + ldc1 ALPHA,152($sp) # Get ALPHA + + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu A,A,16 # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,16 + + madd.d t12,t12,a0,b1 + madd.d t22,t22,a1,b1 + + +.L59: # Write Back + ldc1 c11,0(CO1) # Fetch 16 C + ldc1 c21,8(CO1) + ldc1 c12,0(CO2) + ldc1 c22,8(CO2) + + madd.d t11,c11,t11,ALPHA + madd.d t21,c21,t21,ALPHA + madd.d t12,c12,t12,ALPHA + madd.d t22,c22,t22,ALPHA + + sdc1 t11,0(CO1) + sdc1 t21,8(CO1) + sdc1 t12,0(CO2) + move B,BO # Reset B + sdc1 t22,8(CO2) + daddu CO1,CO1,16 # COx += 2*8Byte + daddu CO2,CO2,16 + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + + +.L12_M1: + and M,MCO,1 # Remainder M = 1 + beqz M,.L0_N2_Loop # M = 0, finishing one panel B + nop + +.L60: + dsra K,KCO,2 # K=KCO/2 + dmtc1 $0,t11 + ldc1 a0,0(A) + + mov.d t21,t11 + mov.d t12,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + mov.d t22,t11 + beqz K,.L65 + nop + +.L61: # N=2 m=1,=K=4 + ldc1 a4,8(A) + gsLQC1(R9,F13,F12,1) # R9=B + ldc1 a2,16(A) + madd.d t11,t11,a0,b0 + madd.d t12,t12,a0,b1 + + gsLQC1(R9,F11,F10,2) + madd.d t11,t11,a4,b4 + madd.d t12,t12,a4,b5 + daddiu K,K,-1 + + ldc1 a6,24(A) + madd.d t11,t11,a2,b2 + daddu A,A,32 # A+=1(mr)*4(kr)*8Byte=64 + + gsLQC1(R9,F15,F14,3) + madd.d t12,t12,a2,b3 + daddu B,B,64 # B+=2(nr)*4(kr)*8Byte=128 + + ldc1 a0,0(A) + gsLQC1(R9,F9,F8,0) + madd.d t11,t11,a6,b6 + bnez K,.L61 + madd.d t12,t12,a6,b7 + +.L65: # N=2 M=1 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L68 + nop + +.L66: 
+ ldc1 a4,8(A) + gsLQC1(R9,F13,F12,1) # R9=B + madd.d t11,t11,a0,b0 + madd.d t12,t12,a0,b1 + daddu A,A,16 # A+=1(mr)*2(kr)*8Byte=32 + daddu B,B,32 + +.L67: + ldc1 a0,0(A) + gsLQC1(R9,F9,F8,0) + madd.d t11,t11,a4,b4 + madd.d t12,t12,a4,b5 + + +.L68: # N=2, M=1, K=1 + and K,KCO,1 + beqz K,.L69 # + ldc1 ALPHA,152($sp) # Get ALPHA + + madd.d t11,t11,a0,b0 + madd.d t12,t12,a0,b1 + daddu A,A,8 # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,16 + + +.L69: # Write Back + ldc1 c11,0(CO1) # Fetch 16 C + ldc1 c12,0(CO2) + + madd.d t11,c11,t11,ALPHA + madd.d t12,c12,t12,ALPHA + + sdc1 t11,0(CO1) + move B,BO # Reset B + sdc1 t12,0(CO2) + daddu CO1,CO1,8 # COx += 2*8Byte + daddu CO2,CO2,8 + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + + +.L0_N2_Loop: + daddu BO,BO,SPANB # BO+=KC*2N + move B,BO # Set B + daddu C,C,SPANC # C+=LDC*2 + + + + .align 5 +.L0_N1: + and N,NCO,1 # Remainder N = 1 + beqz N,.L999 # N=0,NCO<1 + nop + + move CO1,C # Set C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + beqz M,.L11_M2 + daddu PREA,AO,SPANA + + +.L70: + dsra K,KCO,2 # K=KCO/2 + ldc1 b0,0(B) + dmtc1 $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + mov.d t21,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + mov.d t31,t11 + beqz K,.L75 + mov.d t41,t11 + +.L71: # N=1,M=K=4 + ldc1 b4,8(B) + gsLQC1(R8,F5,F4,2) # R8=A + gsLQC1(R8,F7,F6,3) + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + FETCH $0,(PREA) + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + + +.L72: + ldc1 b2,16(B) + gsLQC1(R8,F1,F0,4) + gsLQC1(R8,F3,F2,5) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + FETCH $0,32(PREA) + madd.d t31,t31,a6,b4 + madd.d t41,t41,a7,b4 + + +.L73: + ldc1 b6,24(B) + gsLQC1(R8,F5,F4,6) + gsLQC1(R8,F7,F6,7) + madd.d t11,t11,a0,b2 + madd.d t21,t21,a1,b2 + daddu B,B,32 # B+=1(nr)*4(kr)*8Byte=64 + + FETCH $0,64(PREA) + madd.d t31,t31,a2,b2 + madd.d t41,t41,a3,b2 + daddu A,A,128 # A+=4(mr)*4(kr)*8Byte=128 + +.L74: + ldc1 b0,0(B) + gsLQC1(R8,F1,F0,0) + daddu PREA,PREA,128 + gsLQC1(R8,F3,F2,1) + madd.d t11,t11,a4,b6 + madd.d t21,t21,a5,b6 + daddiu K,K,-1 + + 
FETCH $0,-32(PREA) + madd.d t31,t31,a6,b6 + bnez K,.L71 + madd.d t41,t41,a7,b6 + + + +.L75: # N=2 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L78 + nop + +.L76: + ldc1 b4,8(B) + gsLQC1(R8,F5,F4,2) # R8=A + gsLQC1(R8,F7,F6,3) + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu B,B,16 # B+=1(nr)*2(kr)*8Byte=32 + + FETCH $0,0(PREA) + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + daddu A,A,64 # A+=4(mr)*2(kr)*8Byte=64 + +.L77: + ldc1 b0,0(B) + gsLQC1(R8,F1,F0,0) + gsLQC1(R8,F3,F2,1) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + FETCH $0,32(PREA) + madd.d t31,t31,a6,b4 + madd.d t41,t41,a7,b4 + daddu PREA,PREA,64 + + + +.L78: # N=2, M=4, K=1 + and K,KCO,1 + beqz K,.L79 # + ldc1 ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu A,A,32 # A+=4(mr)*1(kr)*8Byte=32 + + madd.d t31,t31,a2,b0 + madd.d t41,t41,a3,b0 + daddu B,B,8 + daddu PREA,PREA,32 + + +.L79: # Write Back + ldc1 c11,0(CO1) # Fetch 16 C + ldc1 c21,8(CO1) + ldc1 c31,16(CO1) + ldc1 c41,24(CO1) + + madd.d t11,c11,t11,ALPHA + madd.d t21,c21,t21,ALPHA + madd.d t31,c31,t31,ALPHA + madd.d t41,c41,t41,ALPHA + + sdc1 t11,0(CO1) + sdc1 t21,8(CO1) + sdc1 t31,16(CO1) + sdc1 t41,24(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,32(CO1) + daddu CO1,CO1,32 # COx += 4*8Byte + bnez M,.L70 # M!=0 + move B,BO # Reset B + + + +.L11_M2: + and M,MCO,2 # Remainder M = 2 + beqz M,.L11_M1 + nop + +.L80: + dsra K,KCO,2 # K=KCO/2 + ldc1 b0,0(B) + dmtc1 $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + mov.d t21,t11 + beqz K,.L85 + nop + +.L81: # N=1,M=2,K=4 + ldc1 b4,8(B) + gsLQC1(R8,F5,F4,1) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + + ldc1 b2,16(B) + gsLQC1(R8,F3,F2,2) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + ldc1 b6,24(B) + daddu B,B,32 # B+=1(nr)*4(kr)*8Byte=32 + + gsLQC1(R8,F7,F6,3) + madd.d t11,t11,a2,b2 + madd.d t21,t21,a3,b2 + daddu A,A,64 # A+=2(mr)*4(kr)*8Byte=64 + + ldc1 b0,0(B) + daddiu K,K,-1 + + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a6,b6 + bnez K,.L81 + madd.d 
t21,t21,a7,b6 + + +.L85: # N=2 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L88 + nop + +.L86: + ldc1 b4,8(B) + daddu B,B,16 # B+=1(nr)*2(kr)*8Byte=16 + + gsLQC1(R8,F5,F4,1) # R8=A + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu A,A,32 # A+=2(mr)*2(kr)*8Byte=32 + + ldc1 b0,0(B) + gsLQC1(R8,F1,F0,0) + madd.d t11,t11,a4,b4 + madd.d t21,t21,a5,b4 + + +.L88: # N=2, M=4, K=1 + and K,KCO,1 + beqz K,.L89 # + ldc1 ALPHA,152($sp) # Get ALPHA + + madd.d t11,t11,a0,b0 + madd.d t21,t21,a1,b0 + daddu A,A,16 # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,8 + + +.L89: # Write Back + ldc1 c11,0(CO1) # Fetch 16 C + ldc1 c21,8(CO1) + + madd.d t11,c11,t11,ALPHA + madd.d t21,c21,t21,ALPHA + + sdc1 t11,0(CO1) + sdc1 t21,8(CO1) + + FETCH $0,16(CO1) + daddu CO1,CO1,16 # COx += 2*8Byte + move B,BO # Reset B + + +.L11_M1: + and M,MCO,1 # Remainder M = 1 + beqz M,.L999 # M = 0, End + nop + +.L90: + dsra K,KCO,2 # K=KCO/2 + ldc1 b0,0(B) + ldc1 a0,0(A) + beqz K,.L95 + dmtc1 $0,t11 + +.L91: # N=1,M=1,K=4 + ldc1 b4,8(B) + ldc1 a4,8(A) + ldc1 b2,16(B) + ldc1 a2,16(A) + ldc1 b6,24(B) + ldc1 a6,24(A) + + madd.d t11,t11,a0,b0 + madd.d t11,t11,a4,b4 + daddu B,B,32 # B+=1(nr)*4(kr)*8Byte=32 + daddu A,A,32 # A+=1(mr)*4(kr)*8Byte=32 + madd.d t11,t11,a2,b2 + madd.d t11,t11,a6,b6 + daddiu K,K,-1 + + ldc1 b0,0(B) + bnez K,.L91 + ldc1 a0,0(A) + + +.L95: # N=2 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L98 + nop + +.L96: + ldc1 b4,8(B) + ldc1 a4,8(A) + + madd.d t11,t11,a0,b0 + madd.d t11,t11,a4,b4 + daddu B,B,16 # B+=1(nr)*2(kr)*8Byte=16 + daddu A,A,16 # A+=1(mr)*2(kr)*8Byte=32 + + ldc1 b0,0(B) + ldc1 a0,0(A) + + +.L98: # N=2, M=4, K=1 + and K,KCO,1 + beqz K,.L99 # + ldc1 ALPHA,152($sp) # Get ALPHA + madd.d t11,t11,a0,b0 + + +.L99: # Write Back + ldc1 c11,0(CO1) # Fetch 16 C + madd.d t11,c11,t11,ALPHA + sdc1 t11,0(CO1) + + + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 
$f26, 72($sp) + ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE From 782205a693dd8617508e0e20e0bc2b7ac8130f2b Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 6 Apr 2011 10:38:34 +0000 Subject: [PATCH 02/42] Add dgemm compiler Options in KERNEL.LOONGSON3A. --- kernel/mips64/KERNEL.LOONGSON3A | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index b295070d9..e149019aa 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -1,2 +1,10 @@ SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S + +DGEMMKERNEL = gemm_kernel_loongson3a.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + + From 1aa9a298e1fa3ed41e02892b15d31a15e197eff6 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 6 Apr 2011 10:39:31 +0000 Subject: [PATCH 03/42] Change BLOCK SIZE of LOONGSON3A TARGET. --- param.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/param.h b/param.h index 8fcd19358..0038f9029 100644 --- a/param.h +++ b/param.h @@ -1482,25 +1482,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + #define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 108 -#define DGEMM_DEFAULT_P 112 +#define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 108 #define ZGEMM_DEFAULT_P 112 #define SGEMM_DEFAULT_Q 288 -#define DGEMM_DEFAULT_Q 144 +#define DGEMM_DEFAULT_Q 112 #define CGEMM_DEFAULT_Q 144 #define ZGEMM_DEFAULT_Q 72 #define SGEMM_DEFAULT_R 2000 -#define DGEMM_DEFAULT_R 2000 +#define DGEMM_DEFAULT_R 1000 #define CGEMM_DEFAULT_R 2000 #define ZGEMM_DEFAULT_R 2000 From 921e040b150ffe2edf9e6e28141480b8add6c26c Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 11 Apr 2011 21:46:48 +0000 Subject: [PATCH 04/42] Changed default page size to 16KB on Loongson 3A. --- common_mips64.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common_mips64.h b/common_mips64.h index 7c7a70ba5..acea79011 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -220,6 +220,11 @@ REALNAME: ;\ #define BUFFER_SIZE ( 8 << 20) +#if defined(LOONGSON3A) +#define PAGESIZE (16UL << 10) +#define FIXED_PAGESIZE (16UL << 10) +#endif + #ifndef PAGESIZE #define PAGESIZE (64UL << 10) #endif From ab9e4ce3519908ae29126e7b0c5192fa3c25db10 Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 11 Apr 2011 22:17:57 +0000 Subject: [PATCH 05/42] Adjust kc size from 112 to 116 . 
--- kernel/mips64/gemm_kernel_loongson3a.S | 1631 ------------------------ param.h | 4 +- 2 files changed, 2 insertions(+), 1633 deletions(-) delete mode 100644 kernel/mips64/gemm_kernel_loongson3a.S diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S deleted file mode 100644 index d19d65469..000000000 --- a/kernel/mips64/gemm_kernel_loongson3a.S +++ /dev/null @@ -1,1631 +0,0 @@ -#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) -#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) -#define REALNAME ASMNAME -#define PROLOGUE \ - .text ;\ - .set mips64 ;\ - .align 5 ;\ - .globl REALNAME ;\ - .ent REALNAME ;\ - .type REALNAME, @function ;\ -REALNAME: ;\ - .set noreorder ;\ - .set nomacro - -#define EPILOGUE \ - .set macro ;\ - .set reorder ;\ - .end REALNAME -#define BASE_SHIFT 3 -#define FETCH ld - -#define M $4 -#define N $5 -#define K $6 -#define A $8 -#define B $9 -#define C $10 -#define LDC $11 - -#define AO $12 -#define BO $13 - -#define I $2 -#define J $3 -#define L $7 - -#define CO1 $14 -#define CO2 $15 -#define CO3 $16 -#define CO4 $17 - -#define KCO $18 -#define MCO $19 -#define NCO $20 - -#define SPANB $21 -#define SPANC $22 -#define PREB $23 -#define PREA $24 -#define SPANA $25 - -#define ALPHA $f15 - -#define R8 8 -#define R9 9 -#define R14 14 -#define R15 15 -#define R16 16 -#define R17 17 - -#define t11 $f30 -#define t21 $f31 -#define t31 $f28 -#define t41 $f29 - -#define t12 $f26 -#define t22 $f27 -#define t32 $f24 -#define t42 $f25 - -#define t13 $f22 -#define t23 $f23 -#define t33 $f20 -#define t43 $f21 - -#define t14 $f18 -#define t24 $f19 -#define t34 $f16 -#define t44 $f17 - -#define c11 $f0 -#define c21 $f1 -#define c31 $f2 -#define c41 $f3 - -#define c12 $f4 -#define c22 $f5 -#define c32 $f6 -#define c42 $f7 - -#define c13 $f8 -#define c23 $f9 -#define c33 $f10 -#define c43 $f11 - -#define c14 $f12 -#define c24 $f13 
-#define c34 $f14 -#define c44 $f0 - -#define a0 $f0 -#define a1 $f1 -#define a2 $f2 -#define a3 $f3 -#define a4 $f4 -#define a5 $f5 -#define a6 $f6 -#define a7 $f7 -#define b0 $f8 -#define b1 $f9 -#define b2 $f10 -#define b3 $f11 -#define b4 $f12 -#define b5 $f13 -#define b6 $f14 -#define b7 $f15 - -#define F31 31 -#define F30 30 -#define F29 29 -#define F28 28 -#define F27 27 -#define F26 26 -#define F25 25 -#define F24 24 -#define F23 23 -#define F22 22 -#define F21 21 -#define F20 20 -#define F19 19 -#define F18 18 -#define F17 17 -#define F16 16 -#define F15 15 -#define F14 14 -#define F13 13 -#define F12 12 -#define F11 11 -#define F10 10 -#define F9 9 -#define F8 8 -#define F7 7 -#define F6 6 -#define F5 5 -#define F4 4 -#define F3 3 -#define F2 2 -#define F1 1 -#define F0 0 - - PROLOGUE - - daddiu $sp, $sp, -160 - sd $16, 0($sp) - sd $17, 8($sp) - sd $18, 16($sp) - sd $19, 24($sp) - sd $20, 32($sp) - sd $21, 40($sp) - sd $22, 48($sp) - sdc1 $f24, 56($sp) - sdc1 $f25, 64($sp) - sdc1 $f26, 72($sp) - sdc1 $f27, 80($sp) - sdc1 $f28, 88($sp) - sd $23, 96($sp) - sd $24, 104($sp) - sd $25, 112($sp) - sdc1 $f20,120($sp) - sdc1 $f21,128($sp) - sdc1 $f22,136($sp) - sdc1 $f23,144($sp) - - - .align 5 # BACKUP -.L0_N4: # Loop N - sdc1 ALPHA,152($sp) # Backup ALPHA - move MCO,M # Backup M - - move NCO,N # Backup N - move KCO,K # Backup K - - move AO,A # Backup A_addr - move BO,B # Backup B_addr - - dsll LDC,LDC,3 # LDC*8Byte - dsll SPANB,KCO,5 # SPANB=KC*NR(4)*8Byte=KC*2^5 - - dsll SPANA,KCO,5 # SPANA = KCO*4mr*8Byte - dsra N,NCO,2 # N=NCO/2 - beq N,$0,.L0_N2 # N=0,NCO<4 - dsll SPANC,LDC,2 # SPANC=LDC*4 - -.L0_N4_Lb: - move CO1,C # Set C - dsra M,MCO,2 # M=MCO/2 - - move A,AO # Reset A - daddu CO2,CO1,LDC - - daddu CO3,CO2,LDC - daddu PREB,BO,SPANB # PreB point next panelB - - daddu CO4,CO3,LDC - beqz M,.L14_M2 - daddu PREA,AO,SPANA - -.L10: - dmtc1 $0,t11 - mov.d t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - - mov.d t31,t11 - mov.d t41,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - - mov.d 
t12,t11 - mov.d t22,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 - - mov.d t32,t11 - mov.d t42,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 - - dsra K,KCO,2 # K=KCO/2 - mov.d t13,t11 - - mov.d t23,t11 - mov.d t33,t11 - - mov.d t43,t11 - mov.d t14,t11 - - mov.d t24,t11 - mov.d t34,t11 - - mov.d t44,t11 - beqz K,.L15 - nop - -.L11: # N=M=K=4 - gsLQC1(R8,F5,F4,2) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - gsLQC1(R9,F13,F12,2) # R9=B - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - gsLQC1(R8,F7,F6,3) - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - - gsLQC1(R9,F15,F14,3) - madd.d t32,t32,a2,b1 - madd.d t42,t42,a3,b1 - - FETCH $0,(PREB) - madd.d t13,t13,a0,b2 - madd.d t23,t23,a1,b2 - - madd.d t14,t14,a0,b3 - madd.d t24,t24,a1,b3 - - FETCH $0,(PREA) - madd.d t33,t33,a2,b2 - madd.d t43,t43,a3,b2 - - madd.d t34,t34,a2,b3 - madd.d t44,t44,a3,b3 - #load2 comp1 -.L12: - gsLQC1(R8,F1,F0,4) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - gsLQC1(R9,F9,F8,4) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - - gsLQC1(R8,F3,F2,5) - madd.d t31,t31,a6,b4 - madd.d t41,t41,a7,b4 - - gsLQC1(R9,F11,F10,5) - madd.d t32,t32,a6,b5 - madd.d t42,t42,a7,b5 - - FETCH $0,32(PREB) - madd.d t13,t13,a4,b6 - madd.d t23,t23,a5,b6 - - madd.d t14,t14,a4,b7 - madd.d t24,t24,a5,b7 - - FETCH $0,32(PREA) - madd.d t33,t33,a6,b6 - madd.d t43,t43,a7,b6 - - madd.d t34,t34,a6,b7 - madd.d t44,t44,a7,b7 - -.L13: - gsLQC1(R8,F5,F4,6) - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - gsLQC1(R9,F13,F12,6) - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - gsLQC1(R8,F7,F6,7) - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - - gsLQC1(R9,F15,F14,7) - madd.d t32,t32,a2,b1 - madd.d t42,t42,a3,b1 - daddu A,A,128 # A+=4(mr)*4(kr)*8Byte=128 - - FETCH $0,64(PREB) - madd.d t13,t13,a0,b2 - madd.d t23,t23,a1,b2 - daddu B,B,128 - - madd.d t14,t14,a0,b3 - madd.d t24,t24,a1,b3 - - FETCH $0,64(PREA) - madd.d t33,t33,a2,b2 - madd.d t43,t43,a3,b2 - - madd.d t34,t34,a2,b3 - madd.d t44,t44,a3,b3 - -.L14: - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a4,b4 
- madd.d t21,t21,a5,b4 - - gsLQC1(R9,F9,F8,0) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - - gsLQC1(R8,F3,F2,1) - madd.d t31,t31,a6,b4 - madd.d t41,t41,a7,b4 - daddiu K,K,-1 - - gsLQC1(R9,F11,F10,1) - madd.d t32,t32,a6,b5 - madd.d t42,t42,a7,b5 - - FETCH $0,96(PREB) - madd.d t13,t13,a4,b6 - madd.d t23,t23,a5,b6 - - madd.d t14,t14,a4,b7 - madd.d t24,t24,a5,b7 - - FETCH $0,96(PREA) - madd.d t33,t33,a6,b6 - madd.d t43,t43,a7,b6 - daddu PREB,PREB,128 - - madd.d t34,t34,a6,b7 - daddu PREA,PREA,128 - bnez K,.L11 - madd.d t44,t44,a7,b7 - -.L15: # N=4 M=4 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L18 - nop - -.L16: - gsLQC1(R8,F5,F4,2) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - gsLQC1(R9,F13,F12,2) # R9=B - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - gsLQC1(R8,F7,F6,3) - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - - gsLQC1(R9,F15,F14,3) - madd.d t32,t32,a2,b1 - madd.d t42,t42,a3,b1 - daddu A,A,64 # A+=4(mr)*2(kr)*8Byte=64 - - FETCH $0,0(PREB) - madd.d t13,t13,a0,b2 - madd.d t23,t23,a1,b2 - daddu B,B,64 - - madd.d t14,t14,a0,b3 - madd.d t24,t24,a1,b3 - - FETCH $0,0(PREA) - madd.d t33,t33,a2,b2 - madd.d t43,t43,a3,b2 - - madd.d t34,t34,a2,b3 - madd.d t44,t44,a3,b3 - -.L17: - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - gsLQC1(R9,F9,F8,0) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - - gsLQC1(R8,F3,F2,1) - madd.d t31,t31,a6,b4 - madd.d t41,t41,a7,b4 - - gsLQC1(R9,F11,F10,1) - madd.d t32,t32,a6,b5 - madd.d t42,t42,a7,b5 - - FETCH $0,32(PREB) - madd.d t13,t13,a4,b6 - madd.d t23,t23,a5,b6 - - madd.d t14,t14,a4,b7 - madd.d t24,t24,a5,b7 - - FETCH $0,32(PREA) - madd.d t33,t33,a6,b6 - madd.d t43,t43,a7,b6 - daddu PREB,PREB,64 - - madd.d t34,t34,a6,b7 - madd.d t44,t44,a7,b7 - daddu PREA,PREA,64 - -.L18: # N=4, M=4, K=1 - and K,KCO,1 - beqz K,.L19 # - ldc1 ALPHA,152($sp) # Get ALPHA - - FETCH $0,0(PREB) - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu A,A,32 # A+=4(mr)*1(kr)*8Byte=32 - - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - 
daddu B,B,32 - - FETCH $0,0(PREA) - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - daddu PREB,PREB,32 - - madd.d t32,t32,a2,b1 - madd.d t42,t42,a3,b1 - daddu PREA,PREA,32 - - madd.d t13,t13,a0,b2 - madd.d t23,t23,a1,b2 - - madd.d t14,t14,a0,b3 - madd.d t24,t24,a1,b3 - - madd.d t33,t33,a2,b2 - madd.d t43,t43,a3,b2 - - madd.d t34,t34,a2,b3 - madd.d t44,t44,a3,b3 - -.L19: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - ldc1 c21,8(CO1) - ldc1 c31,16(CO1) - ldc1 c41,24(CO1) - - ldc1 c12,0(CO2) - madd.d t11,c11,t11,ALPHA - ldc1 c22,8(CO2) - madd.d t21,c21,t21,ALPHA - ldc1 c32,16(CO2) - madd.d t31,c31,t31,ALPHA - ldc1 c42,24(CO2) - madd.d t41,c41,t41,ALPHA - - ldc1 c13,0(CO3) - madd.d t12,c12,t12,ALPHA - ldc1 c23,8(CO3) - madd.d t22,c22,t22,ALPHA - ldc1 c33,16(CO3) - madd.d t32,c32,t32,ALPHA - ldc1 c43,24(CO3) - madd.d t42,c42,t42,ALPHA - - ldc1 c14,0(CO4) - madd.d t13,c13,t13,ALPHA - ldc1 c24,8(CO4) - madd.d t23,c23,t23,ALPHA - ldc1 c34,16(CO4) - madd.d t33,c33,t33,ALPHA - ldc1 c44,24(CO4) - madd.d t43,c43,t43,ALPHA - - sdc1 t11,0(CO1) - madd.d t14,c14,t14,ALPHA - sdc1 t21,8(CO1) - madd.d t24,c24,t24,ALPHA - sdc1 t31,16(CO1) - madd.d t34,c34,t34,ALPHA - sdc1 t41,24(CO1) - madd.d t44,c44,t44,ALPHA - daddiu M,M,-1 # M-- - - sdc1 t12,0(CO2) - sdc1 t22,8(CO2) - sdc1 t32,16(CO2) - sdc1 t42,24(CO2) - - sdc1 t13,0(CO3) - sdc1 t23,8(CO3) - sdc1 t33,16(CO3) - sdc1 t43,24(CO3) - - FETCH $0,32(CO1) - FETCH $0,32(CO2) - FETCH $0,32(CO3) - FETCH $0,32(CO4) - - sdc1 t14,0(CO4) - daddu CO1,CO1,32 # COx += 4*8Byte - sdc1 t24,8(CO4) - daddu CO2,CO2,32 - sdc1 t34,16(CO4) - daddu CO3,CO3,32 - sdc1 t44,24(CO4) - move B,BO # Reset B - daddu PREB,BO,SPANB - bnez M,.L10 # M!=0 - daddu CO4,CO4,32 - - - -.L14_M2: - and M,MCO,2 # Remainder M = 2 - beqz M,.L14_M1 - nop - -.L20: - dmtc1 $0,t11 - mov.d t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - - mov.d t12,t11 - mov.d t22,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - - dsra K,KCO,2 # K=KCO/2 - mov.d t13,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 - - mov.d t23,t11 - mov.d 
t14,t11 - - mov.d t24,t11 - beqz K,.L25 - nop - -.L21: # N=4 m=2,=K=4 - gsLQC1(R8,F5,F4,1) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - gsLQC1(R9,F13,F12,2) # R9=B - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - gsLQC1(R9,F15,F14,3) - madd.d t13,t13,a0,b2 - madd.d t23,t23,a1,b2 - - gsLQC1(R8,F3,F2,2) - madd.d t14,t14,a0,b3 - madd.d t24,t24,a1,b3 - - gsLQC1(R9,F9,F8,4) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - gsLQC1(R9,F11,F10,5) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - - gsLQC1(R8,F7,F6,3) - madd.d t13,t13,a4,b6 - madd.d t23,t23,a5,b6 - - madd.d t14,t14,a4,b7 - madd.d t24,t24,a5,b7 - - gsLQC1(R9,F13,F12,6) - madd.d t11,t11,a2,b0 - madd.d t21,t21,a3,b0 - daddu A,A,64 # A+=2(mr)*4(kr)*8Byte=64 - - gsLQC1(R9,F15,F14,7) - madd.d t12,t12,a2,b1 - madd.d t22,t22,a3,b1 - daddiu K,K,-1 - - gsLQC1(R8,F1,F0,0) - madd.d t13,t13,a2,b2 - madd.d t23,t23,a3,b2 - daddu B,B,128 # B+=4(nr)*4(kr)*8Byte=128 - - madd.d t14,t14,a2,b3 - madd.d t24,t24,a3,b3 - - gsLQC1(R9,F9,F8,0) - madd.d t11,t11,a6,b4 - madd.d t21,t21,a7,b4 - - gsLQC1(R9,F11,F10,1) - madd.d t12,t12,a6,b5 - madd.d t22,t22,a7,b5 - - madd.d t13,t13,a6,b6 - madd.d t23,t23,a7,b6 - - madd.d t14,t14,a6,b7 - bnez K,.L21 - madd.d t24,t24,a7,b7 - -.L25: # N=4 M=2 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L28 - nop - -.L26: - gsLQC1(R8,F5,F4,1) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - gsLQC1(R9,F13,F12,2) # R9=B - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - daddu A,A,32 # A+=2(mr)*2(kr)*8Byte=32 - - gsLQC1(R9,F15,F14,3) - madd.d t13,t13,a0,b2 - madd.d t23,t23,a1,b2 - daddu B,B,64 - - madd.d t14,t14,a0,b3 - madd.d t24,t24,a1,b3 - -.L27: - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - gsLQC1(R9,F9,F8,0) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - - gsLQC1(R9,F11,F10,1) - madd.d t13,t13,a4,b6 - madd.d t23,t23,a5,b6 - - madd.d t14,t14,a4,b7 - madd.d t24,t24,a5,b7 - -.L28: # N=4, M=2, K=1 - and K,KCO,1 - beqz K,.L29 # - ldc1 ALPHA,152($sp) # Get ALPHA - - madd.d 
t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu A,A,16 # A+=2(mr)*1(kr)*8Byte=16 - daddu B,B,32 - - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - madd.d t13,t13,a0,b2 - madd.d t23,t23,a1,b2 - - madd.d t14,t14,a0,b3 - madd.d t24,t24,a1,b3 - -.L29: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - ldc1 c21,8(CO1) - - ldc1 c12,0(CO2) - ldc1 c22,8(CO2) - - ldc1 c13,0(CO3) - madd.d t11,c11,t11,ALPHA - ldc1 c23,8(CO3) - madd.d t21,c21,t21,ALPHA - - ldc1 c14,0(CO4) - madd.d t12,c12,t12,ALPHA - ldc1 c24,8(CO4) - madd.d t22,c22,t22,ALPHA - - sdc1 t11,0(CO1) - madd.d t13,c13,t13,ALPHA - sdc1 t21,8(CO1) - madd.d t23,c23,t23,ALPHA - - sdc1 t12,0(CO2) - madd.d t14,c14,t14,ALPHA - sdc1 t22,8(CO2) - madd.d t24,c24,t24,ALPHA - - sdc1 t13,0(CO3) - move B,BO # Reset B - sdc1 t23,8(CO3) - daddu CO1,CO1,16 # COx += 2*8Byte - - FETCH $0,0(CO1) - FETCH $0,16(CO2) - FETCH $0,16(CO3) - FETCH $0,16(CO4) - - sdc1 t14,0(CO4) - daddu CO2,CO2,16 - sdc1 t24,8(CO4) - daddu CO3,CO3,16 - daddu CO4,CO4,16 - - - -.L14_M1: - and M,MCO,1 # Remainder M = 1 - beqz M,.L0_N4_Loop # M = 0, finishing one panel B - nop - -.L30: - ldc1 a0,0(A) - dsra K,KCO,2 # K=KCO/2 - gsLQC1(R9,F9,F8,0) #b0,b1 - dmtc1 $0,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 - mov.d t12,t11 - mov.d t13,t11 - beqz K,.L25 - mov.d t14,t11 - -.L31: # N=4 m=1,=K=4 - ldc1 a1,8(A) - - gsLQC1(R9,F13,F12,2) # R9=B - madd.d t11,t11,a0,b0 - madd.d t12,t12,a0,b1 - - gsLQC1(R9,F15,F14,3) - madd.d t13,t13,a0,b2 - madd.d t14,t14,a0,b3 - - ldc1 a2,16(A) - - gsLQC1(R9,F9,F8,4) - madd.d t11,t11,a1,b4 - madd.d t12,t12,a1,b5 - - gsLQC1(R9,F11,F10,5) - madd.d t13,t13,a1,b6 - madd.d t14,t14,a1,b7 - - ldc1 a3,24(A) - daddiu K,K,-1 - - gsLQC1(R9,F13,F12,6) - madd.d t11,t11,a2,b0 - madd.d t12,t12,a2,b1 - daddu A,A,32 # A+=1(mr)*4(kr)*8Byte=64 - - gsLQC1(R9,F15,F14,7) - madd.d t13,t13,a2,b2 - madd.d t14,t14,a2,b3 - daddu B,B,128 # B+=4(nr)*4(kr)*8Byte=128 - - ldc1 a0,0(A) - - gsLQC1(R9,F9,F8,0) - madd.d t11,t11,a3,b4 - madd.d t12,t12,a3,b5 - - gsLQC1(R9,F11,F10,1) - madd.d 
t13,t13,a3,b6 - bnez K,.L31 - madd.d t14,t14,a3,b7 - -.L35: # N=4 M=1 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L38 - nop - -.L36: - ldc1 a1,8(A) - - gsLQC1(R9,F13,F12,2) # R9=B - madd.d t11,t11,a0,b0 - madd.d t12,t12,a0,b1 - daddu A,A,16 # A+=1(mr)*2(kr)*8Byte=32 - - gsLQC1(R9,F15,F14,3) - madd.d t13,t13,a0,b2 - madd.d t14,t14,a0,b3 - daddu B,B,64 - - -.L37: - ldc1 a0,0(A) - - gsLQC1(R9,F9,F8,0) - madd.d t11,t11,a1,b4 - madd.d t12,t12,a1,b5 - - gsLQC1(R9,F11,F10,1) - madd.d t13,t13,a1,b6 - madd.d t14,t14,a1,b7 - -.L38: # N=4, M=1, K=1 - and K,KCO,1 - beqz K,.L39 # - ldc1 ALPHA,152($sp) # Get ALPHA - - madd.d t11,t11,a0,b0 - madd.d t12,t12,a0,b1 - daddu A,A,8 # A+=1(mr)*1(kr)*8Byte=16 - daddu B,B,32 - - madd.d t13,t13,a0,b2 - madd.d t14,t14,a0,b3 - -.L39: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - ldc1 c12,0(CO2) - ldc1 c13,0(CO3) - ldc1 c14,0(CO4) - - madd.d t11,c11,t11,ALPHA - madd.d t12,c12,t12,ALPHA - madd.d t13,c13,t13,ALPHA - madd.d t14,c14,t14,ALPHA - - sdc1 t11,0(CO1) - sdc1 t12,0(CO2) - sdc1 t13,0(CO3) - sdc1 t14,0(CO4) - - -.L0_N4_Loop: - daddu BO,BO,SPANB # BO point to next panel B - daddiu N,N,-1 # N-- - daddu C,C,SPANC # C pointe to next panel C - bnez N,.L0_N4_Lb # N!=0 - move B,BO # Set B - - - - .align 5 -.L0_N2: - and N,NCO,2 # Remainder N = 2 - beqz N,.L0_N1 # N=0,NCO<2 - dsll SPANC,LDC,1 # SPANC=LDC*2 - -.L0_N2_Lb: - move CO1,C # Set C - dsra M,MCO,2 # M=MCO/2 - - dsll SPANB,KCO,4 # SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4 - move A,AO # Reset A - - daddu CO2,CO1,LDC - beqz M,.L12_M2 - daddu PREA,AO,SPANA - -.L40: - dmtc1 $0,t11 - mov.d t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - - mov.d t31,t11 - mov.d t41,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - - dsra K,KCO,2 # K=KCO/2 - mov.d t12,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 - - mov.d t22,t11 - mov.d t32,t11 - - mov.d t42,t11 - beqz K,.L45 - nop - -.L41: # N=2,M=K=4 - gsLQC1(R8,F5,F4,2) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - gsLQC1(R9,F13,F12,1) # R9=B - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - 
gsLQC1(R8,F7,F6,3) - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - - FETCH $0,(PREA) - madd.d t32,t32,a2,b1 - madd.d t42,t42,a3,b1 - -.L42: - gsLQC1(R8,F1,F0,4) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - gsLQC1(R9,F11,F10,2) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - - gsLQC1(R8,F3,F2,5) - madd.d t31,t31,a6,b4 - madd.d t41,t41,a7,b4 - - FETCH $0,32(PREA) - madd.d t32,t32,a6,b5 - madd.d t42,t42,a7,b5 - -.L43: - gsLQC1(R8,F5,F4,6) - madd.d t11,t11,a0,b2 - madd.d t21,t21,a1,b2 - - gsLQC1(R9,F15,F14,3) - madd.d t12,t12,a0,b3 - madd.d t22,t22,a1,b3 - - gsLQC1(R8,F7,F6,7) - madd.d t31,t31,a2,b2 - madd.d t41,t41,a3,b2 - daddu B,B,64 # B+=2(nr)*4(kr)*8Byte=64 - - FETCH $0,64(PREA) - madd.d t32,t32,a2,b3 - madd.d t42,t42,a3,b3 - daddu A,A,128 # A+=4(mr)*4(kr)*8Byte=128 - -.L44: - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a4,b6 - madd.d t21,t21,a5,b6 - daddiu K,K,-1 - - gsLQC1(R9,F9,F8,0) - madd.d t12,t12,a4,b7 - madd.d t22,t22,a5,b7 - daddu PREA,PREA,128 - - gsLQC1(R8,F3,F2,1) - madd.d t31,t31,a6,b6 - madd.d t41,t41,a7,b6 - - FETCH $0,-32(PREA) - madd.d t32,t32,a6,b7 - bnez K,.L41 - madd.d t42,t42,a7,b7 - - -.L45: # N=2 M=4 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L48 - nop - -.L46: - gsLQC1(R8,F5,F4,2) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - gsLQC1(R9,F13,F12,1) # R9=B - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - gsLQC1(R8,F7,F6,3) - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - daddu B,B,32 # B+=2(nr)*2(kr)*8Byte=32 - - FETCH $0,0(PREA) - madd.d t32,t32,a2,b1 - madd.d t42,t42,a3,b1 - daddu A,A,64 # A+=4(mr)*2(kr)*8Byte=64 - -.L47: - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - gsLQC1(R9,F9,F8,0) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - - gsLQC1(R8,F3,F2,1) - madd.d t31,t31,a6,b4 - madd.d t41,t41,a7,b4 - - FETCH $0,32(PREA) - madd.d t32,t32,a6,b5 - madd.d t42,t42,a7,b5 - daddu PREA,PREA,64 - - -.L48: # N=2, M=4, K=1 - and K,KCO,1 - beqz K,.L49 # - ldc1 ALPHA,152($sp) # Get ALPHA - - FETCH $0,0(PREA) - madd.d 
t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu A,A,32 # A+=4(mr)*1(kr)*8Byte=32 - - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - daddu B,B,32 - daddu PREA,PREA,32 - - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - - madd.d t32,t32,a2,b1 - madd.d t42,t42,a3,b1 - -.L49: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - ldc1 c21,8(CO1) - ldc1 c31,16(CO1) - ldc1 c41,24(CO1) - - ldc1 c12,0(CO2) - madd.d t11,c11,t11,ALPHA - ldc1 c22,8(CO2) - madd.d t21,c21,t21,ALPHA - ldc1 c32,16(CO2) - madd.d t31,c31,t31,ALPHA - ldc1 c42,24(CO2) - madd.d t41,c41,t41,ALPHA - - sdc1 t11,0(CO1) - madd.d t12,c12,t12,ALPHA - sdc1 t21,8(CO1) - madd.d t22,c22,t22,ALPHA - sdc1 t31,16(CO1) - madd.d t32,c32,t32,ALPHA - sdc1 t41,24(CO1) - madd.d t42,c42,t42,ALPHA - daddiu M,M,-1 # M-- - - sdc1 t12,0(CO2) - sdc1 t22,8(CO2) - sdc1 t32,16(CO2) - sdc1 t42,24(CO2) - - FETCH $0,32(CO1) - FETCH $0,32(CO2) - - daddu CO1,CO1,32 # COx += 4*8Byte - daddu CO2,CO2,32 - bnez M,.L40 # M!=0 - move B,BO # Reset B - - -.L12_M2: - and M,MCO,2 # Remainder M = 2 - beqz M,.L12_M1 - nop - -.L50: - dsra K,KCO,2 # K=KCO/2 - dmtc1 $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - - mov.d t21,t11 - mov.d t12,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - - mov.d t22,t11 - beqz K,.L55 - nop - -.L51: # N=2 m=2,=K=4 - gsLQC1(R8,F5,F4,1) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - gsLQC1(R9,F13,F12,1) # R9=B - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - gsLQC1(R8,F3,F2,2) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - gsLQC1(R9,F11,F10,2) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - daddiu K,K,-1 - - gsLQC1(R8,F7,F6,3) - madd.d t11,t11,a2,b2 - madd.d t21,t21,a3,b2 - daddu A,A,64 # A+=2(mr)*4(kr)*8Byte=64 - - gsLQC1(R9,F15,F14,3) - madd.d t12,t12,a2,b3 - madd.d t22,t22,a3,b3 - daddu B,B,64 # B+=2(nr)*4(kr)*8Byte=128 - - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a6,b6 - madd.d t21,t21,a7,b6 - - gsLQC1(R9,F9,F8,0) - madd.d t12,t12,a6,b7 - bnez K,.L51 - madd.d t22,t22,a7,b7 - -.L55: # N=2 M=2 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L58 - nop - 
-.L56: - gsLQC1(R8,F5,F4,1) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu A,A,32 # A+=2(mr)*2(kr)*8Byte=32 - - gsLQC1(R9,F13,F12,1) # R9=B - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - daddu B,B,32 - -.L57: - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - gsLQC1(R9,F9,F8,0) - madd.d t12,t12,a4,b5 - madd.d t22,t22,a5,b5 - - -.L58: # N=2, M=2, K=1 - and K,KCO,1 - beqz K,.L59 # - ldc1 ALPHA,152($sp) # Get ALPHA - - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu A,A,16 # A+=2(mr)*1(kr)*8Byte=16 - daddu B,B,16 - - madd.d t12,t12,a0,b1 - madd.d t22,t22,a1,b1 - - -.L59: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - ldc1 c21,8(CO1) - ldc1 c12,0(CO2) - ldc1 c22,8(CO2) - - madd.d t11,c11,t11,ALPHA - madd.d t21,c21,t21,ALPHA - madd.d t12,c12,t12,ALPHA - madd.d t22,c22,t22,ALPHA - - sdc1 t11,0(CO1) - sdc1 t21,8(CO1) - sdc1 t12,0(CO2) - move B,BO # Reset B - sdc1 t22,8(CO2) - daddu CO1,CO1,16 # COx += 2*8Byte - daddu CO2,CO2,16 - - FETCH $0,0(CO1) - FETCH $0,0(CO2) - - -.L12_M1: - and M,MCO,1 # Remainder M = 1 - beqz M,.L0_N2_Loop # M = 0, finishing one panel B - nop - -.L60: - dsra K,KCO,2 # K=KCO/2 - dmtc1 $0,t11 - ldc1 a0,0(A) - - mov.d t21,t11 - mov.d t12,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - - mov.d t22,t11 - beqz K,.L65 - nop - -.L61: # N=2 m=1,=K=4 - ldc1 a4,8(A) - gsLQC1(R9,F13,F12,1) # R9=B - ldc1 a2,16(A) - madd.d t11,t11,a0,b0 - madd.d t12,t12,a0,b1 - - gsLQC1(R9,F11,F10,2) - madd.d t11,t11,a4,b4 - madd.d t12,t12,a4,b5 - daddiu K,K,-1 - - ldc1 a6,24(A) - madd.d t11,t11,a2,b2 - daddu A,A,32 # A+=1(mr)*4(kr)*8Byte=64 - - gsLQC1(R9,F15,F14,3) - madd.d t12,t12,a2,b3 - daddu B,B,64 # B+=2(nr)*4(kr)*8Byte=128 - - ldc1 a0,0(A) - gsLQC1(R9,F9,F8,0) - madd.d t11,t11,a6,b6 - bnez K,.L61 - madd.d t12,t12,a6,b7 - -.L65: # N=2 M=1 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L68 - nop - -.L66: - ldc1 a4,8(A) - gsLQC1(R9,F13,F12,1) # R9=B - madd.d t11,t11,a0,b0 - madd.d t12,t12,a0,b1 - daddu A,A,16 # A+=1(mr)*2(kr)*8Byte=32 - daddu B,B,32 - 
-.L67: - ldc1 a0,0(A) - gsLQC1(R9,F9,F8,0) - madd.d t11,t11,a4,b4 - madd.d t12,t12,a4,b5 - - -.L68: # N=2, M=1, K=1 - and K,KCO,1 - beqz K,.L69 # - ldc1 ALPHA,152($sp) # Get ALPHA - - madd.d t11,t11,a0,b0 - madd.d t12,t12,a0,b1 - daddu A,A,8 # A+=1(mr)*1(kr)*8Byte=16 - daddu B,B,16 - - -.L69: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - ldc1 c12,0(CO2) - - madd.d t11,c11,t11,ALPHA - madd.d t12,c12,t12,ALPHA - - sdc1 t11,0(CO1) - move B,BO # Reset B - sdc1 t12,0(CO2) - daddu CO1,CO1,8 # COx += 2*8Byte - daddu CO2,CO2,8 - - FETCH $0,0(CO1) - FETCH $0,0(CO2) - - -.L0_N2_Loop: - daddu BO,BO,SPANB # BO+=KC*2N - move B,BO # Set B - daddu C,C,SPANC # C+=LDC*2 - - - - .align 5 -.L0_N1: - and N,NCO,1 # Remainder N = 1 - beqz N,.L999 # N=0,NCO<1 - nop - - move CO1,C # Set C - dsra M,MCO,2 # M=MCO/2 - - move A,AO # Reset A - beqz M,.L11_M2 - daddu PREA,AO,SPANA - - -.L70: - dsra K,KCO,2 # K=KCO/2 - ldc1 b0,0(B) - dmtc1 $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - mov.d t21,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 - mov.d t31,t11 - beqz K,.L75 - mov.d t41,t11 - -.L71: # N=1,M=K=4 - ldc1 b4,8(B) - gsLQC1(R8,F5,F4,2) # R8=A - gsLQC1(R8,F7,F6,3) - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - FETCH $0,(PREA) - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - - -.L72: - ldc1 b2,16(B) - gsLQC1(R8,F1,F0,4) - gsLQC1(R8,F3,F2,5) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - FETCH $0,32(PREA) - madd.d t31,t31,a6,b4 - madd.d t41,t41,a7,b4 - - -.L73: - ldc1 b6,24(B) - gsLQC1(R8,F5,F4,6) - gsLQC1(R8,F7,F6,7) - madd.d t11,t11,a0,b2 - madd.d t21,t21,a1,b2 - daddu B,B,32 # B+=1(nr)*4(kr)*8Byte=64 - - FETCH $0,64(PREA) - madd.d t31,t31,a2,b2 - madd.d t41,t41,a3,b2 - daddu A,A,128 # A+=4(mr)*4(kr)*8Byte=128 - -.L74: - ldc1 b0,0(B) - gsLQC1(R8,F1,F0,0) - daddu PREA,PREA,128 - gsLQC1(R8,F3,F2,1) - madd.d t11,t11,a4,b6 - madd.d t21,t21,a5,b6 - daddiu K,K,-1 - - FETCH $0,-32(PREA) - madd.d t31,t31,a6,b6 - bnez K,.L71 - madd.d t41,t41,a7,b6 - - - -.L75: # N=2 M=4 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L78 - 
nop - -.L76: - ldc1 b4,8(B) - gsLQC1(R8,F5,F4,2) # R8=A - gsLQC1(R8,F7,F6,3) - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu B,B,16 # B+=1(nr)*2(kr)*8Byte=32 - - FETCH $0,0(PREA) - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - daddu A,A,64 # A+=4(mr)*2(kr)*8Byte=64 - -.L77: - ldc1 b0,0(B) - gsLQC1(R8,F1,F0,0) - gsLQC1(R8,F3,F2,1) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - FETCH $0,32(PREA) - madd.d t31,t31,a6,b4 - madd.d t41,t41,a7,b4 - daddu PREA,PREA,64 - - - -.L78: # N=2, M=4, K=1 - and K,KCO,1 - beqz K,.L79 # - ldc1 ALPHA,152($sp) # Get ALPHA - - FETCH $0,0(PREA) - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu A,A,32 # A+=4(mr)*1(kr)*8Byte=32 - - madd.d t31,t31,a2,b0 - madd.d t41,t41,a3,b0 - daddu B,B,8 - daddu PREA,PREA,32 - - -.L79: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - ldc1 c21,8(CO1) - ldc1 c31,16(CO1) - ldc1 c41,24(CO1) - - madd.d t11,c11,t11,ALPHA - madd.d t21,c21,t21,ALPHA - madd.d t31,c31,t31,ALPHA - madd.d t41,c41,t41,ALPHA - - sdc1 t11,0(CO1) - sdc1 t21,8(CO1) - sdc1 t31,16(CO1) - sdc1 t41,24(CO1) - daddiu M,M,-1 # M-- - - FETCH $0,32(CO1) - daddu CO1,CO1,32 # COx += 4*8Byte - bnez M,.L70 # M!=0 - move B,BO # Reset B - - - -.L11_M2: - and M,MCO,2 # Remainder M = 2 - beqz M,.L11_M1 - nop - -.L80: - dsra K,KCO,2 # K=KCO/2 - ldc1 b0,0(B) - dmtc1 $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - mov.d t21,t11 - beqz K,.L85 - nop - -.L81: # N=1,M=2,K=4 - ldc1 b4,8(B) - gsLQC1(R8,F5,F4,1) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - - ldc1 b2,16(B) - gsLQC1(R8,F3,F2,2) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - ldc1 b6,24(B) - daddu B,B,32 # B+=1(nr)*4(kr)*8Byte=32 - - gsLQC1(R8,F7,F6,3) - madd.d t11,t11,a2,b2 - madd.d t21,t21,a3,b2 - daddu A,A,64 # A+=2(mr)*4(kr)*8Byte=64 - - ldc1 b0,0(B) - daddiu K,K,-1 - - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a6,b6 - bnez K,.L81 - madd.d t21,t21,a7,b6 - - -.L85: # N=2 M=4 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L88 - nop - -.L86: - ldc1 b4,8(B) - daddu B,B,16 # B+=1(nr)*2(kr)*8Byte=16 - - 
gsLQC1(R8,F5,F4,1) # R8=A - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu A,A,32 # A+=2(mr)*2(kr)*8Byte=32 - - ldc1 b0,0(B) - gsLQC1(R8,F1,F0,0) - madd.d t11,t11,a4,b4 - madd.d t21,t21,a5,b4 - - -.L88: # N=2, M=4, K=1 - and K,KCO,1 - beqz K,.L89 # - ldc1 ALPHA,152($sp) # Get ALPHA - - madd.d t11,t11,a0,b0 - madd.d t21,t21,a1,b0 - daddu A,A,16 # A+=2(mr)*1(kr)*8Byte=16 - daddu B,B,8 - - -.L89: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - ldc1 c21,8(CO1) - - madd.d t11,c11,t11,ALPHA - madd.d t21,c21,t21,ALPHA - - sdc1 t11,0(CO1) - sdc1 t21,8(CO1) - - FETCH $0,16(CO1) - daddu CO1,CO1,16 # COx += 2*8Byte - move B,BO # Reset B - - -.L11_M1: - and M,MCO,1 # Remainder M = 1 - beqz M,.L999 # M = 0, End - nop - -.L90: - dsra K,KCO,2 # K=KCO/2 - ldc1 b0,0(B) - ldc1 a0,0(A) - beqz K,.L95 - dmtc1 $0,t11 - -.L91: # N=1,M=1,K=4 - ldc1 b4,8(B) - ldc1 a4,8(A) - ldc1 b2,16(B) - ldc1 a2,16(A) - ldc1 b6,24(B) - ldc1 a6,24(A) - - madd.d t11,t11,a0,b0 - madd.d t11,t11,a4,b4 - daddu B,B,32 # B+=1(nr)*4(kr)*8Byte=32 - daddu A,A,32 # A+=1(mr)*4(kr)*8Byte=32 - madd.d t11,t11,a2,b2 - madd.d t11,t11,a6,b6 - daddiu K,K,-1 - - ldc1 b0,0(B) - bnez K,.L91 - ldc1 a0,0(A) - - -.L95: # N=2 M=4 K=2 - and K,KCO,2 # k = KCO&2 - beqz K,.L98 - nop - -.L96: - ldc1 b4,8(B) - ldc1 a4,8(A) - - madd.d t11,t11,a0,b0 - madd.d t11,t11,a4,b4 - daddu B,B,16 # B+=1(nr)*2(kr)*8Byte=16 - daddu A,A,16 # A+=1(mr)*2(kr)*8Byte=32 - - ldc1 b0,0(B) - ldc1 a0,0(A) - - -.L98: # N=2, M=4, K=1 - and K,KCO,1 - beqz K,.L99 # - ldc1 ALPHA,152($sp) # Get ALPHA - madd.d t11,t11,a0,b0 - - -.L99: # Write Back - ldc1 c11,0(CO1) # Fetch 16 C - madd.d t11,c11,t11,ALPHA - sdc1 t11,0(CO1) - - - - -.L999: # End - ld $16, 0($sp) - ld $17, 8($sp) - ld $18, 16($sp) - ld $19, 24($sp) - ld $20, 32($sp) - ld $21, 40($sp) - ld $22, 48($sp) - ldc1 $f24, 56($sp) - ldc1 $f25, 64($sp) - ldc1 $f26, 72($sp) - ldc1 $f27, 80($sp) - ldc1 $f28, 88($sp) - ld $23, 96($sp) - ld $24, 104($sp) - ld $25, 112($sp) - ldc1 $f20,120($sp) - ldc1 $f21,128($sp) 
- ldc1 $f22,136($sp) - ldc1 $f23,144($sp) - - j $31 - daddiu $sp, $sp, 160 - - EPILOGUE diff --git a/param.h b/param.h index 0038f9029..417165652 100644 --- a/param.h +++ b/param.h @@ -1492,12 +1492,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 108 -#define DGEMM_DEFAULT_P 32 +#define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 108 #define ZGEMM_DEFAULT_P 112 #define SGEMM_DEFAULT_Q 288 -#define DGEMM_DEFAULT_Q 112 +#define DGEMM_DEFAULT_Q 116 #define CGEMM_DEFAULT_Q 144 #define ZGEMM_DEFAULT_Q 72 From ecd4c1f3d9cecc6b3382ab1a00cbe542edf99404 Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 11 Apr 2011 22:46:36 +0000 Subject: [PATCH 06/42] Modify prefetching C. --- kernel/mips64/gemm_kernel_loongson3a.S | 1606 ++++++++++++++++++++++++ 1 file changed, 1606 insertions(+) create mode 100644 kernel/mips64/gemm_kernel_loongson3a.S diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S new file mode 100644 index 000000000..c93e2e4a5 --- /dev/null +++ b/kernel/mips64/gemm_kernel_loongson3a.S @@ -0,0 +1,1606 @@ +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define FETCH ld + +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define SPANC $22 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 
+#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 +#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 $f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) + + + .align 5 # BACKUP +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + move BO,B # Backup B_addr + + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*NR(4)*8Byte=KC*2^5 + + dsll 
SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*4mr*8Byte + dsra N,NCO,2 # N=NCO/2 + + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANC,LDC,2 # SPANC=LDC*4 + +.L0_N4_Lb: + move CO1,C # Set C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + daddu CO2,CO1,LDC + + daddu CO3,CO2,LDC + daddu PREB,BO,SPANB # PreB point next panelB + + daddu CO4,CO3,LDC + beqz M,.L14_M2 + daddu PREA,AO,SPANA + +.L10: + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t31,t11 + MOV t41,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + MOV t12,t11 + MOV t22,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + + MOV t32,t11 + MOV t42,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + + dsra K,KCO,2 # K=KCO/2 + MOV t13,t11 + + MOV t23,t11 + MOV t33,t11 + + MOV t43,t11 + MOV t14,t11 + + MOV t24,t11 + MOV t34,t11 + + MOV t44,t11 + beqz K,.L15 + nop + +.L11: # N=M=K=4 + gsLQC1(R8,F5,F4,2) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) # R9=B + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + gsLQC1(R9,F15,F14,3) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + + FETCH $0,(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + FETCH $0,(PREA) + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + #load2 comp1 +.L12: + gsLQC1(R8,F1,F0,4) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,4) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,5) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + + gsLQC1(R9,F11,F10,5) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + +.L13: + gsLQC1(R8,F5,F4,6) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,6) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,7) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + 
gsLQC1(R9,F15,F14,7) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + + FETCH $0,8*SIZE(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + daddu B,B,16*SIZE + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + FETCH $0,8*SIZE(PREA) + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L14: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddiu K,K,-1 + + gsLQC1(R9,F11,F10,1) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + + FETCH $0,12*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + FETCH $0,12*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREB,PREB,16*SIZE + + MADD t34,t34,a6,b7 + daddu PREA,PREA,16*SIZE + bnez K,.L11 + MADD t44,t44,a7,b7 + +.L15: # N=4 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L18 + nop + +.L16: + gsLQC1(R8,F5,F4,2) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) # R9=B + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + gsLQC1(R9,F15,F14,3) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + + FETCH $0,0(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + daddu B,B,8*SIZE + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + FETCH $0,0(PREA) + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L17: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + + gsLQC1(R9,F11,F10,1) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 
+ MADD t43,t43,a7,b6 + daddu PREB,PREB,8*SIZE + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + daddu PREA,PREA,8*SIZE + +.L18: # N=4, M=4, K=1 + and K,KCO,1 + beqz K,.L19 # + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREB) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE + + FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu PREB,PREB,4*SIZE + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu PREA,PREA,4*SIZE + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L19: # Write Back + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + LD c13,0(CO3) + MADD t12,c12,t12,ALPHA + LD c23,1*SIZE(CO3) + MADD t22,c22,t22,ALPHA + LD c33,2*SIZE(CO3) + MADD t32,c32,t32,ALPHA + LD c43,3*SIZE(CO3) + MADD t42,c42,t42,ALPHA + + LD c14,0(CO4) + MADD t13,c13,t13,ALPHA + LD c24,1*SIZE(CO4) + MADD t23,c23,t23,ALPHA + LD c34,2*SIZE(CO4) + MADD t33,c33,t33,ALPHA + LD c44,3*SIZE(CO4) + MADD t43,c43,t43,ALPHA + + ST t11,0(CO1) + MADD t14,c14,t14,ALPHA + ST t21,1*SIZE(CO1) + MADD t24,c24,t24,ALPHA + ST t31,2*SIZE(CO1) + MADD t34,c34,t34,ALPHA + ST t41,3*SIZE(CO1) + MADD t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + ST t13,0(CO3) + ST t23,1*SIZE(CO3) + ST t33,2*SIZE(CO3) + ST t43,3*SIZE(CO3) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + FETCH $0,8*SIZE(CO3) + FETCH $0,8*SIZE(CO4) + + ST t14,0(CO4) + daddu CO1,CO1,4*SIZE # COx += 4*8Byte + ST t24,1*SIZE(CO4) + 
daddu CO2,CO2,4*SIZE + ST t34,2*SIZE(CO4) + daddu CO3,CO3,4*SIZE + ST t44,3*SIZE(CO4) + move B,BO # Reset B + daddu PREB,BO,SPANB + bnez M,.L10 # M!=0 + daddu CO4,CO4,4*SIZE + + + +.L14_M2: + and M,MCO,2 # Remainder M = 2 + beqz M,.L14_M1 + nop + +.L20: + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t12,t11 + MOV t22,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + dsra K,KCO,2 # K=KCO/2 + MOV t13,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + + MOV t23,t11 + MOV t14,t11 + + MOV t24,t11 + beqz K,.L25 + nop + +.L21: # N=4 m=2,=K=4 + gsLQC1(R8,F5,F4,1) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) # R9=B + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R9,F15,F14,3) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + gsLQC1(R8,F3,F2,2) + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + gsLQC1(R9,F9,F8,4) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F11,F10,5) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F7,F6,3) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + gsLQC1(R9,F13,F12,6) + MADD t11,t11,a2,b0 + MADD t21,t21,a3,b0 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + + gsLQC1(R9,F15,F14,7) + MADD t12,t12,a2,b1 + MADD t22,t22,a3,b1 + daddiu K,K,-1 + + gsLQC1(R8,F1,F0,0) + MADD t13,t13,a2,b2 + MADD t23,t23,a3,b2 + daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE + + MADD t14,t14,a2,b3 + MADD t24,t24,a3,b3 + + gsLQC1(R9,F9,F8,0) + MADD t11,t11,a6,b4 + MADD t21,t21,a7,b4 + + gsLQC1(R9,F11,F10,1) + MADD t12,t12,a6,b5 + MADD t22,t22,a7,b5 + + MADD t13,t13,a6,b6 + MADD t23,t23,a7,b6 + + MADD t14,t14,a6,b7 + bnez K,.L21 + MADD t24,t24,a7,b7 + +.L25: # N=4 M=2 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L28 + nop + +.L26: + gsLQC1(R8,F5,F4,1) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) # R9=B + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + + gsLQC1(R9,F15,F14,3) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + daddu B,B,8*SIZE + + MADD 
t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L27: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R9,F11,F10,1) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + +.L28: # N=4, M=2, K=1 + and K,KCO,1 + beqz K,.L29 # + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,4*SIZE + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L29: # Write Back + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + LD c13,0(CO3) + MADD t11,c11,t11,ALPHA + LD c23,1*SIZE(CO3) + MADD t21,c21,t21,ALPHA + + LD c14,0(CO4) + MADD t12,c12,t12,ALPHA + LD c24,1*SIZE(CO4) + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + MADD t13,c13,t13,ALPHA + ST t21,1*SIZE(CO1) + MADD t23,c23,t23,ALPHA + + ST t12,0(CO2) + MADD t14,c14,t14,ALPHA + ST t22,1*SIZE(CO2) + MADD t24,c24,t24,ALPHA + + ST t13,0(CO3) + move B,BO # Reset B + ST t23,1*SIZE(CO3) + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + + FETCH $0,0(CO1) + FETCH $0,2*SIZE(CO2) + FETCH $0,2*SIZE(CO3) + FETCH $0,2*SIZE(CO4) + + ST t14,0(CO4) + daddu CO2,CO2,2*SIZE + ST t24,1*SIZE(CO4) + daddu CO3,CO3,2*SIZE + daddu CO4,CO4,2*SIZE + + + +.L14_M1: + and M,MCO,1 # Remainder M = 1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel B + nop + +.L30: + gsLQC1(R8,F1,F0,0) + dsra K,KCO,2 # K=KCO/2 + gsLQC1(R9,F9,F8,0) #b0,b1 + MTC $0,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + MOV t12,t11 + MOV t13,t11 + beqz K,.L35 + MOV t14,t11 + +.L31: # N=4 m=1,=K=4 + gsLQC1(R8,F3,F2,1) + gsLQC1(R9,F13,F12,2) # R9=B + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + + gsLQC1(R9,F15,F14,3) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + + gsLQC1(R9,F9,F8,4) + MADD t11,t11,a1,b4 + MADD t12,t12,a1,b5 + + gsLQC1(R9,F11,F10,5) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + daddiu K,K,-1 + + 
gsLQC1(R9,F13,F12,6) + MADD t11,t11,a2,b0 + MADD t12,t12,a2,b1 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=8*SIZE + + gsLQC1(R9,F15,F14,7) + MADD t13,t13,a2,b2 + MADD t14,t14,a2,b3 + daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE + + gsLQC1(R8,F1,F0,0) + gsLQC1(R9,F9,F8,0) + MADD t11,t11,a3,b4 + MADD t12,t12,a3,b5 + + gsLQC1(R9,F11,F10,1) + MADD t13,t13,a3,b6 + bnez K,.L31 + MADD t14,t14,a3,b7 + +.L35: # N=4 M=1 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L38 + nop + +.L36: + gsLQC1(R9,F13,F12,2) # R9=B + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + gsLQC1(R9,F15,F14,3) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + daddu B,B,8*SIZE + + +.L37: + LD a0,0(A) + + gsLQC1(R9,F9,F8,0) + MADD t11,t11,a1,b4 + MADD t12,t12,a1,b5 + + gsLQC1(R9,F11,F10,1) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + +.L38: # N=4, M=1, K=1 + and K,KCO,1 + beqz K,.L39 # + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,4*SIZE + + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + +.L39: # Write Back + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + LD c13,0(CO3) + LD c14,0(CO4) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + MADD t13,c13,t13,ALPHA + MADD t14,c14,t14,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + ST t13,0(CO3) + ST t14,0(CO4) + + +.L0_N4_Loop: + daddu BO,BO,SPANB # BO point to next panel B + daddiu N,N,-1 # N-- + daddu C,C,SPANC # C pointe to next panel C + bnez N,.L0_N4_Lb # N!=0 + move B,BO # Set B + + + + .align 5 +.L0_N2: + and N,NCO,2 # Remainder N = 2 + beqz N,.L0_N1 # N=0,NCO<2 + dsll SPANC,LDC,1 # SPANC=LDC*2 + +.L0_N2_Lb: + move CO1,C # Set C + dsra M,MCO,2 # M=MCO/2 + + dsll SPANB,KCO,1+BASE_SHIFT # SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4 + move A,AO # Reset A + + daddu CO2,CO1,LDC + beqz M,.L12_M2 + daddu PREA,AO,SPANA + +.L40: + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t31,t11 + MOV t41,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + dsra K,KCO,2 # K=KCO/2 + 
MOV t12,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + + MOV t22,t11 + MOV t32,t11 + + MOV t42,t11 + beqz K,.L45 + nop + +.L41: # N=2,M=K=4 + gsLQC1(R8,F5,F4,2) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) # R9=B + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + FETCH $0,(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L42: + gsLQC1(R8,F1,F0,4) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F11,F10,2) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,5) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + +.L43: + gsLQC1(R8,F5,F4,6) + MADD t11,t11,a0,b2 + MADD t21,t21,a1,b2 + + gsLQC1(R9,F15,F14,3) + MADD t12,t12,a0,b3 + MADD t22,t22,a1,b3 + + gsLQC1(R8,F7,F6,7) + MADD t31,t31,a2,b2 + MADD t41,t41,a3,b2 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b3 + MADD t42,t42,a3,b3 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L44: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b6 + MADD t21,t21,a5,b6 + daddiu K,K,-1 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b7 + MADD t22,t22,a5,b7 + daddu PREA,PREA,16*SIZE + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b6 + MADD t41,t41,a7,b6 + + FETCH $0,-4*SIZE(PREA) + MADD t32,t32,a6,b7 + bnez K,.L41 + MADD t42,t42,a7,b7 + + +.L45: # N=2 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L48 + nop + +.L46: + gsLQC1(R8,F5,F4,2) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) # R9=B + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L47: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b4 + 
MADD t41,t41,a7,b4 + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,8*SIZE + + +.L48: # N=2, M=4, K=1 + and K,KCO,1 + beqz K,.L49 # + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,2*SIZE + daddu PREA,PREA,4*SIZE + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L49: # Write Back + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + MADD t12,c12,t12,ALPHA + ST t21,1*SIZE(CO1) + MADD t22,c22,t22,ALPHA + ST t31,2*SIZE(CO1) + MADD t32,c32,t32,ALPHA + ST t41,3*SIZE(CO1) + MADD t42,c42,t42,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + + daddu CO1,CO1,4*SIZE # COx += 4*8Byte + daddu CO2,CO2,4*SIZE + bnez M,.L40 # M!=0 + move B,BO # Reset B + + +.L12_M2: + and M,MCO,2 # Remainder M = 2 + beqz M,.L12_M1 + nop + +.L50: + dsra K,KCO,2 # K=KCO/2 + MTC $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t21,t11 + MOV t12,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + MOV t22,t11 + beqz K,.L55 + nop + +.L51: # N=2 m=2,=K=4 + gsLQC1(R8,F5,F4,1) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) # R9=B + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F3,F2,2) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F11,F10,2) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + daddiu K,K,-1 + + gsLQC1(R8,F7,F6,3) + MADD t11,t11,a2,b2 + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + + gsLQC1(R9,F15,F14,3) + MADD t12,t12,a2,b3 + MADD 
t22,t22,a3,b3 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a6,b6 + MADD t21,t21,a7,b6 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a6,b7 + bnez K,.L51 + MADD t22,t22,a7,b7 + +.L55: # N=2 M=2 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L58 + nop + +.L56: + gsLQC1(R8,F5,F4,1) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + + gsLQC1(R9,F13,F12,1) # R9=B + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE + +.L57: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + +.L58: # N=2, M=2, K=1 + and K,KCO,1 + beqz K,.L59 # + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + +.L59: # Write Back + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t12,c12,t12,ALPHA + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t12,0(CO2) + move B,BO # Reset B + ST t22,1*SIZE(CO2) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + daddu CO2,CO2,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + + +.L12_M1: + and M,MCO,1 # Remainder M = 1 + beqz M,.L0_N2_Loop # M = 0, finishing one panel B + nop + +.L60: + dsra K,KCO,2 # K=KCO/2 + MTC $0,t11 + gsLQC1(R8,F4,F0,0) + + MOV t21,t11 + MOV t12,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + MOV t22,t11 + beqz K,.L65 + nop + +.L61: # N=2 m=1,=K=4 + gsLQC1(R9,F13,F12,1) # R9=B + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + + gsLQC1(R9,F11,F10,2) + MADD t11,t11,a4,b4 + MADD t12,t12,a4,b5 + daddiu K,K,-1 + + gsLQC1(R8,F6,F2,1) + MADD t11,t11,a2,b2 + + gsLQC1(R9,F15,F14,3) + MADD t12,t12,a2,b3 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + gsLQC1(R8,F4,F0,0) + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + gsLQC1(R9,F9,F8,0) + MADD t11,t11,a6,b6 + bnez K,.L61 + 
MADD t12,t12,a6,b7 + +.L65: # N=2 M=1 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L68 + nop + +.L66: + gsLQC1(R9,F13,F12,1) # R9=B + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + daddu B,B,4*SIZE + +.L67: + LD a0,0(A) + gsLQC1(R9,F9,F8,0) + MADD t11,t11,a4,b4 + MADD t12,t12,a4,b5 + + +.L68: # N=2, M=1, K=1 + and K,KCO,1 + beqz K,.L69 # + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + +.L69: # Write Back + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + move B,BO # Reset B + + daddu CO1,CO1,1*SIZE # COx += 2*8Byte + daddu CO2,CO2,1*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + + +.L0_N2_Loop: + daddu BO,BO,SPANB # BO+=KC*2N + move B,BO # Set B + daddu C,C,SPANC # C+=LDC*2 + + + + .align 5 +.L0_N1: + and N,NCO,1 # Remainder N = 1 + beqz N,.L999 # N=0,NCO<1 + nop + + move CO1,C # Set C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + beqz M,.L11_M2 + daddu PREA,AO,SPANA + + +.L70: + dsra K,KCO,2 # K=KCO/2 + gsLQC1(R9,F12,F8,0) + MTC $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + MOV t21,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + MOV t31,t11 + beqz K,.L75 + MOV t41,t11 + +.L71: # N=1,M=K=4 + gsLQC1(R8,F5,F4,2) # R8=A + gsLQC1(R8,F7,F6,3) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + FETCH $0,(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + +.L72: + gsLQC1(R9,F14,F10,1) + gsLQC1(R8,F1,F0,4) + gsLQC1(R8,F3,F2,5) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + FETCH $0,4*SIZE(PREA) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + +.L73: + gsLQC1(R8,F5,F4,6) + gsLQC1(R8,F7,F6,7) + MADD t11,t11,a0,b2 + MADD t21,t21,a1,b2 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + FETCH $0,8*SIZE(PREA) + MADD t31,t31,a2,b2 + MADD t41,t41,a3,b2 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L74: + gsLQC1(R9,F12,F8,0) + gsLQC1(R8,F1,F0,0) + daddu PREA,PREA,16*SIZE + gsLQC1(R8,F3,F2,1) + MADD 
t11,t11,a4,b6 + MADD t21,t21,a5,b6 + daddiu K,K,-1 + + FETCH $0,-32(PREA) + MADD t31,t31,a6,b6 + bnez K,.L71 + MADD t41,t41,a7,b6 + + +.L75: # N=2 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L78 + nop + +.L76: + gsLQC1(R8,F5,F4,2) # R8=A + gsLQC1(R8,F7,F6,3) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L77: + LD b0,0(B) + gsLQC1(R8,F1,F0,0) + gsLQC1(R8,F3,F2,1) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + FETCH $0,4*SIZE(PREA) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddu PREA,PREA,8*SIZE + + + +.L78: # N=2, M=4, K=1 + and K,KCO,1 + beqz K,.L79 # + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,1*SIZE + daddu PREA,PREA,4*SIZE + + +.L79: # Write Back + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t31,c31,t31,ALPHA + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,4*SIZE(CO1) + + daddu CO1,CO1,4*SIZE # COx += 4*8Byte + bnez M,.L70 # M!=0 + move B,BO # Reset B + + + +.L11_M2: + and M,MCO,2 # Remainder M = 2 + beqz M,.L11_M1 + nop + +.L80: + dsra K,KCO,2 # K=KCO/2 + gsLQC1(R9,F12,F8,0) + MTC $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + MOV t21,t11 + beqz K,.L85 + nop + +.L81: # N=1,M=2,K=4 + gsLQC1(R8,F5,F4,1) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R8,F3,F2,2) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F14,F10,1) + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + gsLQC1(R8,F7,F6,3) + MADD t11,t11,a2,b2 + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + + gsLQC1(R9,F12,F8,0) + daddiu K,K,-1 + + gsLQC1(R8,F1,F0,0) + MADD 
t11,t11,a6,b6 + bnez K,.L81 + MADD t21,t21,a7,b6 + + +.L85: # N=2 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L88 + nop + +.L86: + gsLQC1(R8,F5,F4,1) # R8=A + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + + LD b0,0(B) + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + +.L88: # N=2, M=4, K=1 + and K,KCO,1 + beqz K,.L89 # + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,1*SIZE + + +.L89: # Write Back + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + + FETCH $0,2*SIZE(CO1) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + move B,BO # Reset B + + +.L11_M1: + and M,MCO,1 # Remainder M = 1 + beqz M,.L999 # M = 0, End + nop + +.L90: + dsra K,KCO,2 # K=KCO/2 + gsLQC1(R8,F4,F0,0) + gsLQC1(R9,F12,F8,0) + beqz K,.L95 + MTC $0,t11 + +.L91: # N=1,M=1,K=4 + gsLQC1(R8,F6,F2,1) + MADD t11,t11,a0,b0 + gsLQC1(R9,F14,F10,1) + MADD t11,t11,a4,b4 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + + gsLQC1(R8,F4,F0,0) + MADD t11,t11,a2,b2 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + gsLQC1(R9,F12,F8,0) + MADD t11,t11,a6,b6 + daddiu K,K,-1 + bnez K,.L91 + nop + +.L95: # N=2 M=4 K=2 + and K,KCO,2 # k = KCO&2 + beqz K,.L98 + nop + +.L96: + MADD t11,t11,a0,b0 + MADD t11,t11,a4,b4 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + LD b0,0(B) + LD a0,0(A) + + +.L98: # N=2, M=4, K=1 + and K,KCO,1 + beqz K,.L99 # + LD ALPHA,152($sp) # Get ALPHA + MADD t11,t11,a0,b0 + + +.L99: # Write Back + LD c11,0(CO1) # Fetch 16 C + MADD t11,c11,t11,ALPHA + ST t11,0(CO1) + + + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD 
$f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE From 921caefa5695a8e79b3b8ce239fd92345e563b45 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 15 Apr 2011 21:56:25 +0000 Subject: [PATCH 07/42] Increased handling trmm part, no edge handling. Test size(M and N) must be a multiple of 4 . --- kernel/mips64/gemm_kernel_loongson3a.S | 288 +++++++++++++++++++++++-- 1 file changed, 268 insertions(+), 20 deletions(-) diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S index c93e2e4a5..389b38f46 100644 --- a/kernel/mips64/gemm_kernel_loongson3a.S +++ b/kernel/mips64/gemm_kernel_loongson3a.S @@ -17,10 +17,6 @@ #define AO $12 #define BO $13 -#define I $2 -#define J $3 -#define L $7 - #define CO1 $14 #define CO2 $15 #define CO3 $16 @@ -31,13 +27,18 @@ #define NCO $20 #define SPANB $21 -#define SPANC $22 #define PREB $23 #define PREA $24 #define SPANA $25 #define ALPHA $f15 +#if defined(TRMMKERNEL) +#define OFFSET $2 +#define KK $3 +#define TEMP $7 +#endif + #define R8 8 #define R9 9 #define R14 14 @@ -164,20 +165,26 @@ ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M +#if defined(TRMMKERNEL) + ld OFFSET,160($sp) # +#endif + move NCO,N # Backup N move KCO,K # Backup K +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET +#endif + move AO,A # Backup A_addr - move BO,B # Backup B_addr + dsra N,NCO,2 # N=NCO/2 dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*NR(4)*8Byte=KC*2^5 - dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*4mr*8Byte - dsra N,NCO,2 # N=NCO/2 - + move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 - dsll SPANC,LDC,2 # SPANC=LDC*4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*4mr*8Byte .L0_N4_Lb: move CO1,C # Set C @@ -189,11 +196,27 @@ daddu CO3,CO2,LDC daddu PREB,BO,SPANB # PreB point next panelB +#if defined(TRMMKERNEL) && defined(LEFT) + move 
KK,OFFSET +#endif + daddu CO4,CO3,LDC - beqz M,.L14_M2 daddu PREA,AO,SPANA + + beqz M,.L14_M2 + daddu C,CO4,LDC .L10: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K,KK,2 + BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 @@ -210,6 +233,48 @@ MOV t42,t11 gsLQC1(R9,F11,F10,1) #b2,b3 + MOV t13,t11 + MOV t23,t11 + + MOV t33,t11 + MOV t43,t11 + + MOV t14,t11 + MOV t24,t11 + + MOV t34,t11 + MOV t44,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK # temp = kco - kk +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 4 +#endif + + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L15 + nop + +#else + MTC $0,t11 # gemm part + move B,BO + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t31,t11 + MOV t41,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + MOV t12,t11 + MOV t22,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + + MOV t32,t11 + MOV t42,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + dsra K,KCO,2 # K=KCO/2 MOV t13,t11 @@ -225,7 +290,9 @@ MOV t44,t11 beqz K,.L15 nop - +#endif + + .align 5 .L11: # N=M=K=4 gsLQC1(R8,F5,F4,2) # R8=A MADD t11,t11,a0,b0 @@ -357,7 +424,13 @@ MADD t44,t44,a7,b7 .L15: # N=4 M=4 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + andi K,TEMP, 2 +#endif + nop + beqz K,.L18 nop @@ -428,7 +501,13 @@ daddu PREA,PREA,8*SIZE .L18: # N=4, M=4, K=1 - and K,KCO,1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP, 1 +#endif + NOP + beqz K,.L19 # LD ALPHA,152($sp) # Get ALPHA @@ -463,7 +542,8 @@ MADD t44,t44,a3,b3 .L19: # Write Back - LD c11,0(CO1) # Fetch 16 C +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write part Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -532,11 +612,80 @@ ST t34,2*SIZE(CO4) daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) - move B,BO # Reset B daddu PREB,BO,SPANB bnez M,.L10 # M!=0 daddu CO4,CO4,4*SIZE +#else + MUL t11, 
ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + MUL t32, ALPHA, t32 + MUL t42, ALPHA, t42 + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + MUL t13, ALPHA, t13 + MUL t23, ALPHA, t23 + MUL t33, ALPHA, t33 + MUL t43, ALPHA, t43 + + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t33, 2 * SIZE(CO3) + ST t43, 3 * SIZE(CO3) + + MUL t14, ALPHA, t14 + MUL t24, ALPHA, t24 + MUL t34, ALPHA, t34 + MUL t44, ALPHA, t44 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + + daddiu M,M,-1 # M-- + + daddiu CO4,CO4, 4 * SIZE # trmm part write back + daddiu CO3,CO3, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP, -4 +#else + daddiu TEMP,TEMP, -4 +#endif + + dsll K,TEMP,2 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK,4 +#endif + bnez M,.L10 # M!=0 + nop +#endif + .L14_M2: @@ -545,6 +694,46 @@ nop .L20: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K,KK,1 + BASE_SHIFT #mr=2 so KK*2 + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t12,t11 + MOV t22,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + dsra K,KCO,2 # K=KCO/2 + MOV t13,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + + MOV t23,t11 + MOV t14,t11 + MOV t24,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP,KK,2 +#else + daddiu TEMP,KK,4 # not sure +#endif + dsra K,TEMP,2 + beqz K,.L25 + nop + +#else + 
move B,BO # gemm part MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 @@ -563,6 +752,7 @@ MOV t24,t11 beqz K,.L25 nop +#endif .L21: # N=4 m=2,=K=4 gsLQC1(R8,F5,F4,1) # R8=A @@ -630,7 +820,11 @@ MADD t24,t24,a7,b7 .L25: # N=4 M=2 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + and K,TEMP,2 +#endif beqz K,.L28 nop @@ -669,7 +863,11 @@ MADD t24,t24,a5,b7 .L28: # N=4, M=2, K=1 +#ifndef TRMMKERNEL and K,KCO,1 +#else + and K,TEMP,1 +#endif beqz K,.L29 # LD ALPHA,152($sp) # Get ALPHA @@ -688,7 +886,8 @@ MADD t24,t24,a1,b3 .L29: # Write Back - LD c11,0(CO1) # Fetch 16 C +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C LD c21,1*SIZE(CO1) LD c12,0(CO2) @@ -730,6 +929,56 @@ daddu CO3,CO3,2*SIZE daddu CO4,CO4,2*SIZE +#else + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + MUL t13, ALPHA, t13 + MUL t23, ALPHA, t23 + + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + + MUL t14, ALPHA, t14 + MUL t24, ALPHA, t24 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP,-2 +#else + daddiu TEMP,TEMP,-4 +#endif + + dsll K,TEMP,1 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif .L14_M1: @@ -848,7 +1097,6 @@ .L0_N4_Loop: daddu BO,BO,SPANB # BO point to next panel B daddiu N,N,-1 # N-- - daddu C,C,SPANC # C pointe to next panel C bnez N,.L0_N4_Lb # N!=0 move B,BO # Set B @@ -858,7 +1106,7 @@ .L0_N2: and N,NCO,2 # Remainder N = 2 beqz N,.L0_N1 # N=0,NCO<2 - dsll SPANC,LDC,1 # SPANC=LDC*2 + nop .L0_N2_Lb: move CO1,C # Set C @@ -868,8 +1116,9 @@ move A,AO # Reset A daddu CO2,CO1,LDC - beqz 
M,.L12_M2 daddu PREA,AO,SPANA + beqz M,.L12_M2 + daddu C,CO2,LDC .L40: MTC $0,t11 @@ -1284,7 +1533,6 @@ .L0_N2_Loop: daddu BO,BO,SPANB # BO+=KC*2N move B,BO # Set B - daddu C,C,SPANC # C+=LDC*2 From 9320933520a2c4876567a7e7b1f7aca1208117d7 Mon Sep 17 00:00:00 2001 From: traz Date: Sun, 17 Apr 2011 20:26:49 +0000 Subject: [PATCH 08/42] Completely dtrmm function. --- kernel/mips64/gemm_kernel_loongson3a.S | 540 ++++++++++++++++++++++++- 1 file changed, 522 insertions(+), 18 deletions(-) diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S index 389b38f46..a785c3e0d 100644 --- a/kernel/mips64/gemm_kernel_loongson3a.S +++ b/kernel/mips64/gemm_kernel_loongson3a.S @@ -3,6 +3,7 @@ #define FETCH ld #define REALNAME ASMNAME + #define ASSEMBLER #include "common.h" @@ -713,7 +714,6 @@ MOV t22,t11 gsLQC1(R9,F9,F8,0) #b0,b1 - dsra K,KCO,2 # K=KCO/2 MOV t13,t11 gsLQC1(R9,F11,F10,1) #b2,b3 @@ -987,6 +987,37 @@ nop .L30: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K,KK, 0 + BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + + gsLQC1(R8,F1,F0,0) + gsLQC1(R9,F9,F8,0) #b0,b1 + MTC $0,t11 + gsLQC1(R9,F11,F10,1) #b2,b3 + MOV t12,t11 + MOV t13,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra K,TEMP, 2 + + beqz K,.L35 + MOV t14,t11 +#else + move B,BO gsLQC1(R8,F1,F0,0) dsra K,KCO,2 # K=KCO/2 gsLQC1(R9,F9,F8,0) #b0,b1 @@ -994,8 +1025,10 @@ gsLQC1(R9,F11,F10,1) #b2,b3 MOV t12,t11 MOV t13,t11 + dsra K,KCO,2 beqz K,.L35 MOV t14,t11 +#endif .L31: # N=4 m=1,=K=4 gsLQC1(R8,F3,F2,1) @@ -1037,7 +1070,11 @@ MADD t14,t14,a3,b7 .L35: # N=4 M=1 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + and K,TEMP,2 +#endif beqz K,.L38 nop @@ -1065,7 +1102,11 @@ MADD t14,t14,a1,b7 .L38: # N=4, M=1, K=1 
+#ifndef TRMMKERNEL and K,KCO,1 +#else + andi K,TEMP,1 +#endif beqz K,.L39 # LD ALPHA,152($sp) # Get ALPHA @@ -1078,6 +1119,7 @@ MADD t14,t14,a0,b3 .L39: # Write Back +#ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) LD c13,0(CO3) @@ -1092,13 +1134,46 @@ ST t12,0(CO2) ST t13,0(CO3) ST t14,0(CO4) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + MUL t13, ALPHA, t13 + MUL t14, ALPHA, t14 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll K,TEMP, 0 + BASE_SHIFT + dsll TEMP,TEMP, 2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif .L0_N4_Loop: - daddu BO,BO,SPANB # BO point to next panel B daddiu N,N,-1 # N-- +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK,4 +#endif bnez N,.L0_N4_Lb # N!=0 - move B,BO # Set B + move BO,B # Set B @@ -1111,7 +1186,11 @@ .L0_N2_Lb: move CO1,C # Set C dsra M,MCO,2 # M=MCO/2 - + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + dsll SPANB,KCO,1+BASE_SHIFT # SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4 move A,AO # Reset A @@ -1121,6 +1200,16 @@ daddu C,CO2,LDC .L40: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K,KK, 2 + BASE_SHIFT # mr=4 + dsll TEMP, KK,1 + BASE_SHIFT # nr=2 + + daddu A,A,K + daddu B,BO,TEMP +#endif MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 @@ -1129,6 +1218,33 @@ MOV t41,t11 gsLQC1(R9,F9,F8,0) #b0,b1 + MOV t12,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + + MOV t22,t11 + MOV t32,t11 + + MOV t42,t11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L45 + nop 
+#else + move B,BO + MTC $0,t11 # gemm part + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t31,t11 + MOV t41,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + dsra K,KCO,2 # K=KCO/2 MOV t12,t11 gsLQC1(R8,F3,F2,1) #a2,a3 @@ -1139,6 +1255,7 @@ MOV t42,t11 beqz K,.L45 nop +#endif .L41: # N=2,M=K=4 gsLQC1(R8,F5,F4,2) # R8=A @@ -1215,7 +1332,11 @@ .L45: # N=2 M=4 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + andi K,TEMP,2 +#endif beqz K,.L48 nop @@ -1258,7 +1379,11 @@ .L48: # N=2, M=4, K=1 +#ifndef TRMMKERNEL and K,KCO,1 +#else + andi K,TEMP,1 +#endif beqz K,.L49 # LD ALPHA,152($sp) # Get ALPHA @@ -1279,7 +1404,8 @@ MADD t42,t42,a3,b1 .L49: # Write Back - LD c11,0(CO1) # Fetch 16 C +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -1315,10 +1441,57 @@ FETCH $0,8*SIZE(CO2) daddu CO1,CO1,4*SIZE # COx += 4*8Byte - daddu CO2,CO2,4*SIZE bnez M,.L40 # M!=0 - move B,BO # Reset B + daddu CO2,CO2,4*SIZE +#else + daddiu M,M,-1 + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE + + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + MUL t32, ALPHA, t32 + MUL t42, ALPHA, t42 + + ST t11, -4 * SIZE(CO1) + ST t21, -3 * SIZE(CO1) + ST t31, -2 * SIZE(CO1) + ST t41, -1 * SIZE(CO1) + + ST t12, -4 * SIZE(CO2) + ST t22, -3 * SIZE(CO2) + ST t32, -2 * SIZE(CO2) + ST t42, -1 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K,TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + + bnez M,.L40 + nop +#endif .L12_M2: and M,MCO,2 # Remainder M = 2 @@ -1326,6 +1499,37 @@ nop .L50: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + 
dsll K, KK, 1 + BASE_SHIFT #mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 + + daddu A, A, K + daddu B, BO, TEMP +#endif + MTC $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t21,t11 + MOV t12,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + MOV t22,t11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L55 + nop + +#else + move B,BO dsra K,KCO,2 # K=KCO/2 MTC $0,t11 gsLQC1(R8,F1,F0,0) #a0,a1 @@ -1337,6 +1541,7 @@ MOV t22,t11 beqz K,.L55 nop +#endif .L51: # N=2 m=2,=K=4 gsLQC1(R8,F5,F4,1) # R8=A @@ -1376,7 +1581,12 @@ MADD t22,t22,a7,b7 .L55: # N=2 M=2 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + andi K,TEMP,2 +#endif + NOP beqz K,.L58 nop @@ -1402,7 +1612,11 @@ .L58: # N=2, M=2, K=1 +#ifndef TRMMKERNEL and K,KCO,1 +#else + and K, TEMP, 1 +#endif beqz K,.L59 # LD ALPHA,152($sp) # Get ALPHA @@ -1416,7 +1630,8 @@ .L59: # Write Back - LD c11,0(CO1) # Fetch 16 C +#ifndef TRMMKERNEL + LD c11,0(CO1) # write gemm part back Fetch 16 C LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) @@ -1429,7 +1644,6 @@ ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t12,0(CO2) - move B,BO # Reset B ST t22,1*SIZE(CO2) daddu CO1,CO1,2*SIZE # COx += 2*8Byte @@ -1437,6 +1651,44 @@ FETCH $0,0(CO1) FETCH $0,0(CO2) +#else + daddiu M, M, -1 + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) + ST t12, -2 * SIZE(CO2) + ST t22, -1 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + FETCH $0,0(CO1) + FETCH $0,0(CO2) + 
+#endif .L12_M1: @@ -1445,8 +1697,39 @@ nop .L60: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + MTC $0,t11 + gsLQC1(R8,F4,F0,0) + + MOV t21,t11 + MOV t12,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + MOV t22,t11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L65 + nop + +#else dsra K,KCO,2 # K=KCO/2 MTC $0,t11 + move B,BO # Reset B gsLQC1(R8,F4,F0,0) MOV t21,t11 @@ -1456,6 +1739,7 @@ MOV t22,t11 beqz K,.L65 nop +#endif .L61: # N=2 m=1,=K=4 gsLQC1(R9,F13,F12,1) # R9=B @@ -1483,7 +1767,11 @@ MADD t12,t12,a6,b7 .L65: # N=2 M=1 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + and K,TEMP,2 +#endif beqz K,.L68 nop @@ -1502,7 +1790,11 @@ .L68: # N=2, M=1, K=1 +#ifndef TRMMKERNEL and K,KCO,1 +#else + and K,TEMP,1 +#endif beqz K,.L69 # LD ALPHA,152($sp) # Get ALPHA @@ -1513,6 +1805,7 @@ .L69: # Write Back +#ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) @@ -1521,19 +1814,47 @@ ST t11,0(CO1) ST t12,0(CO2) - move B,BO # Reset B daddu CO1,CO1,1*SIZE # COx += 2*8Byte daddu CO2,CO2,1*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddu CO1,CO1,1*SIZE # COx += 2*8Byte + daddu CO2,CO2,1*SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif .L0_N2_Loop: - daddu BO,BO,SPANB # BO+=KC*2N - move B,BO # Set B - +#if defined(TRMMKERNEL) && 
!defined(LEFT) + daddiu KK, KK, 2 +#endif + move BO, B .align 5 @@ -1544,13 +1865,45 @@ move CO1,C # Set C dsra M,MCO,2 # M=MCO/2 - + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move A,AO # Reset A beqz M,.L11_M2 daddu PREA,AO,SPANA .L70: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AO, K + daddu B, BO, TEMP +#endif + gsLQC1(R9,F12,F8,0) + MTC $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + MOV t21,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + MOV t31,t11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L75 + MOV t41,t11 +#else + move B, BO dsra K,KCO,2 # K=KCO/2 gsLQC1(R9,F12,F8,0) MTC $0,t11 @@ -1560,6 +1913,8 @@ MOV t31,t11 beqz K,.L75 MOV t41,t11 +#endif + .L71: # N=1,M=K=4 gsLQC1(R8,F5,F4,2) # R8=A @@ -1610,7 +1965,11 @@ .L75: # N=2 M=4 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + and K,TEMP,2 +#endif beqz K,.L78 nop @@ -1641,7 +2000,11 @@ .L78: # N=2, M=4, K=1 +#ifndef TRMMKERNEL and K,KCO,1 +#else + and K,TEMP,1 +#endif beqz K,.L79 # LD ALPHA,152($sp) # Get ALPHA @@ -1657,6 +2020,7 @@ .L79: # Write Back +#ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) @@ -1677,7 +2041,42 @@ daddu CO1,CO1,4*SIZE # COx += 4*8Byte bnez M,.L70 # M!=0 - move B,BO # Reset B + nop +#else + daddiu M,M,-1 # M-- + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + + daddu CO1,CO1,4*SIZE # COx += 4*8Byte +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 2 
+ BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A,K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L70 # M!=0 + nop +#endif @@ -1687,6 +2086,33 @@ nop .L80: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + + gsLQC1(R9,F12,F8,0) + MTC $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + MOV t21,t11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L85 + nop +#else + move B, BO dsra K,KCO,2 # K=KCO/2 gsLQC1(R9,F12,F8,0) MTC $0,t11 @@ -1694,6 +2120,7 @@ MOV t21,t11 beqz K,.L85 nop +#endif .L81: # N=1,M=2,K=4 gsLQC1(R8,F5,F4,1) # R8=A @@ -1722,7 +2149,12 @@ .L85: # N=2 M=4 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + andi K,TEMP,2 +#endif + beqz K,.L88 nop @@ -1741,7 +2173,12 @@ .L88: # N=2, M=4, K=1 +#ifndef TRMMKERNEL and K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L89 # LD ALPHA,152($sp) # Get ALPHA @@ -1752,6 +2189,7 @@ .L89: # Write Back +#ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) @@ -1764,7 +2202,34 @@ FETCH $0,2*SIZE(CO1) daddu CO1,CO1,2*SIZE # COx += 2*8Byte - move B,BO # Reset B +#else + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + .L11_M1: @@ -1772,12 +2237,39 @@ beqz M,.L999 # M = 0, End nop -.L90: +.L90: +#if 
defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + gsLQC1(R8,F4,F0,0) + MTC $0,t11 + gsLQC1(R9,F12,F8,0) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra K, TEMP, 2 + beqz K,.L95 + nop + +#else + move B, BO dsra K,KCO,2 # K=KCO/2 gsLQC1(R8,F4,F0,0) gsLQC1(R9,F12,F8,0) beqz K,.L95 MTC $0,t11 +#endif .L91: # N=1,M=1,K=4 gsLQC1(R8,F6,F2,1) @@ -1798,7 +2290,11 @@ nop .L95: # N=2 M=4 K=2 +#ifndef TRMMKERNEL and K,KCO,2 # k = KCO&2 +#else + andi K,TEMP,2 +#endif beqz K,.L98 nop @@ -1813,18 +2309,26 @@ .L98: # N=2, M=4, K=1 +#ifndef TRMMKERNEL and K,KCO,1 +#else + andi K,TEMP,1 +#endif beqz K,.L99 # LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 .L99: # Write Back +#ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C MADD t11,c11,t11,ALPHA ST t11,0(CO1) +#else + MUL t11, ALPHA, t11 - + ST t11, 0 * SIZE(CO1) +#endif .L999: # End From 417b8ec792ad423c629208010886e5cfac696af3 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 6 May 2011 17:03:35 +0800 Subject: [PATCH 09/42] Added openblas_set_num_threads for Fortran. --- Changelog.txt | 1 + driver/others/Makefile | 4 ++- driver/others/openblas_set_num_threads.c | 45 ++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 driver/others/openblas_set_num_threads.c diff --git a/Changelog.txt b/Changelog.txt index 4f83fdf97..0a9f182fa 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -15,6 +15,7 @@ common: * Provided an error message when the arch is not supported.(Refs issue #19 on github) * Fixed issue #23. Fixed a bug of f_check script about generating link flags. + * Added openblas_set_num_threads for Fortran. 
x86/x86_64: * diff --git a/driver/others/Makefile b/driver/others/Makefile index bc5de3848..ab0e2fea0 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -6,7 +6,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP -COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) ifndef NO_AFFINITY COMMONOBJS += init.$(SUFFIX) endif @@ -100,6 +100,8 @@ memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c + blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c new file mode 100644 index 000000000..7ca3b7114 --- /dev/null +++ b/driver/others/openblas_set_num_threads.c @@ -0,0 +1,45 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +#ifdef SMP_SERVER +#ifdef OS_LINUX + +extern void openblas_set_num_threads(int num_threads) ; + +void NAME(int* num_threads){ + openblas_set_num_threads(*num_threads); +} + +#endif +#endif From 5a991b71492eecd68f5f7eb58278f6f0d58b3c40 Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 9 May 2011 17:28:20 +0000 Subject: [PATCH 10/42] Fixed #24 drmm error on Loongson3A --- kernel/mips64/gemm_kernel_loongson3a.S | 43 ++++++++++++++------------ 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S index a785c3e0d..9df66c0d7 100644 --- a/kernel/mips64/gemm_kernel_loongson3a.S +++ b/kernel/mips64/gemm_kernel_loongson3a.S @@ -164,19 +164,12 @@ .align 5 # BACKUP .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M -#if defined(TRMMKERNEL) - ld OFFSET,160($sp) # -#endif - move NCO,N # Backup N move KCO,K # Backup K -#if 
defined(TRMMKERNEL) && !defined(LEFT) - neg KK,OFFSET -#endif - move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 @@ -184,6 +177,15 @@ dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*NR(4)*8Byte=KC*2^5 move BO,B # Backup B_addr + +#if defined(TRMMKERNEL) + LDARG OFFSET,160($sp) # +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET # right +#endif + beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*4mr*8Byte @@ -197,13 +199,13 @@ daddu CO3,CO2,LDC daddu PREB,BO,SPANB # PreB point next panelB -#if defined(TRMMKERNEL) && defined(LEFT) - move KK,OFFSET -#endif - daddu CO4,CO3,LDC daddu PREA,AO,SPANA - + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK,OFFSET # left +#endif + beqz M,.L14_M2 daddu C,CO4,LDC @@ -212,12 +214,13 @@ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO #else - dsll K,KK,2 + BASE_SHIFT + dsll K,KK,2 + BASE_SHIFT # KK no data part dsll TEMP,KK,2 + BASE_SHIFT - daddu A,A,K + daddu A,A,K # move A B to data part daddu B,BO,TEMP #endif + MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 @@ -676,11 +679,11 @@ dsll K,TEMP,2 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT - daddu A,A,K - daddu B,B,TEMP + daddu A,A,K # mov A to the end of panel Ai + daddu B,B,TEMP # mov B to the end of panel Bj #endif -#ifdef LEFT +#ifdef LEFT # right control by N loop daddiu KK, KK,4 #endif bnez M,.L10 # M!=0 @@ -1158,7 +1161,7 @@ dsll TEMP,TEMP, 2 + BASE_SHIFT daddu A,A,K - daddu B,BO,TEMP + daddu B,B,TEMP #endif #ifdef LEFT @@ -1883,7 +1886,7 @@ dsll K, KK, 2 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT - daddu AO, AO, K + daddu A, A, K daddu B, BO, TEMP #endif gsLQC1(R9,F12,F8,0) From d2f351d819b8899c7c26817b1c9dcfdb07299054 Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 9 May 2011 17:31:58 +0000 Subject: [PATCH 11/42] Modify dtrsm compiler options --- kernel/mips64/KERNEL | 11 +++++++++++ kernel/mips64/KERNEL.LOONGSON3A | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/kernel/mips64/KERNEL 
b/kernel/mips64/KERNEL index dd0d2cfea..f6615bf01 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -96,10 +96,21 @@ STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S STRSMKERNEL_RT = trsm_kernel_RT.S +ifndef DTRSMKERNEL_LN DTRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef DTRSMKERNEL_LT DTRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RN DTRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RT DTRSMKERNEL_RT = trsm_kernel_RT.S +endif CTRSMKERNEL_LN = ztrsm_kernel_LT.S CTRSMKERNEL_LT = ztrsm_kernel_LT.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index e149019aa..0e387c032 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -8,3 +8,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +DTRSMKERNEL_LN = trsm_kernel_LN_loongson3a.S +DTRSMKERNEL_LT = trsm_kernel_LT_loongson3a.S +DTRSMKERNEL_RN = trsm_kernel_RN_loongson3a.S +DTRSMKERNEL_RT = trsm_kernel_RT_loongson3a.S From 432c309f63736ab3ac7205cbbbf294070d6d4d93 Mon Sep 17 00:00:00 2001 From: traz Date: Tue, 10 May 2011 12:48:43 +0000 Subject: [PATCH 12/42] Finish dtrsm_kernel_Lx.S on Loongson3A. 
--- kernel/mips64/trsm_kernel_LN_loongson3a.S | 1938 +++++++++++++++++++++ kernel/mips64/trsm_kernel_LT_loongson3a.S | 1783 +++++++++++++++++++ 2 files changed, 3721 insertions(+) create mode 100644 kernel/mips64/trsm_kernel_LN_loongson3a.S create mode 100644 kernel/mips64/trsm_kernel_LT_loongson3a.S diff --git a/kernel/mips64/trsm_kernel_LN_loongson3a.S b/kernel/mips64/trsm_kernel_LN_loongson3a.S new file mode 100644 index 000000000..aba86fbce --- /dev/null +++ b/kernel/mips64/trsm_kernel_LN_loongson3a.S @@ -0,0 +1,1938 @@ +#define REALNAME ASMNAME + +#define ASSEMBLER +#include "common.h" + + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 + +#define t11 $f16 +#define t21 $f17 +#define t31 $f18 +#define t41 $f19 + +#define t12 $f20 +#define t22 $f21 +#define t32 $f22 +#define t42 $f23 + +#define t13 $f24 +#define t23 $f25 +#define t33 $f26 +#define t43 $f27 + +#define t14 $f28 +#define t24 $f29 +#define t34 $f30 +#define t44 $f31 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + # LN compute 
from bottom to top + LDARG OFFSET, 144($sp) + dsll LDC, LDC, BASE_SHIFT # ldc + + mult M, K + mflo TEMP # TEMP=MC*KC + + dsll TEMP, TEMP, BASE_SHIFT + daddu A, A, TEMP # A move to the end of sa + + dsll TEMP, M, BASE_SHIFT + daddu C, C, TEMP # C+=MC + + dsra J, N, 2 # j = nc/4 + blez J, .L30 + nop + +.L10: # nr=4 + daddiu J, J, -1 + move CO1, C + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai + move AORIG, A # reset A + + daddu C, CO4, LDC # fixed pointer C, the write back address + + andi I, M, 1 # mr=2,nr=4 + blez I, .L50 + nop + + dsll TEMP, K, BASE_SHIFT # mr=1 + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai + + dsll L, KK, BASE_SHIFT # mr=1 + dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + MOV t13, t11 # mr=2 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + + LD b1, 0 * SIZE(BO) # get 4b + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L55 + nop + + + .align 3 +.L52: + LD a5, 1 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + LD a3, 2 * SIZE(AO) + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t12, t12, a5, b6 + MADD t13, t13, a5, b7 + MADD t14, t14, a5, b8 + + LD a7, 3 * SIZE(AO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * 
SIZE(BO) + + MADD t11, t11, a3, b1 # 3rd compute + MADD t12, t12, a3, b2 + MADD t13, t13, a3, b3 + MADD t14, t14, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 # 4th compute + MADD t12, t12, a7, b6 + MADD t13, t13, a7, b7 + MADD t14, t14, a7, b8 + + daddiu L, L, -1 + bgtz L, .L52 + nop + + + .align 3 +.L55: + andi L, TEMP, 3 + blez L, .L58 + nop + + .align 3 +.L56: + MADD t11, t11, a1, b1 # 3rd compute + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L56 + nop + + +.L58: # deal with the triangular part + daddiu TEMP, KK, -1 + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L # Ao point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + MUL t12, b3, t12 + MUL t13, b3, t13 + MUL t14, b3, t14 + + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + + + daddiu KK, KK, -1 # the length of rectangular data part increases by 1 + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + + +.L50: + andi I, M, 2 # mr=2,nr=4 + blez I, .L20 + nop + + dsll TEMP, 
K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + MOV t13, t11 # mr=2 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + + LD b1, 0 * SIZE(BO) # get 4b + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L25 + nop + + + .align 3 +.L22: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 # 3rd compute + MADD t21, t21, a4, b1 + MADD t12, t12, a3, b2 + MADD t22, t22, a4, b2 + MADD t13, t13, a3, b3 + MADD t23, t23, a4, b3 + MADD t14, t14, a3, b4 + MADD t24, t24, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 # 4th compute + MADD t21, t21, a8, b5 + MADD t12, 
t12, a7, b6 + MADD t22, t22, a8, b6 + MADD t13, t13, a7, b7 + MADD t23, t23, a8, b7 + MADD t14, t14, a7, b8 + MADD t24, t24, a8, b8 + + daddiu L, L, -1 + bgtz L, .L22 + nop + + + .align 3 +.L25: + andi L, TEMP, 3 + blez L, .L28 + nop + + .align 3 +.L26: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L26 + nop + + +.L28: # deal with the triangular part + daddiu TEMP, KK, -2 + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L # Ao point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + SUB t21, b5, t21 + SUB t22, b6, t22 + SUB t23, b7, t23 + SUB t24, b8, t24 + + + LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b2, 2 * SIZE(AO) + MUL t21, b1, t21 + MUL t22, b1, t22 + MUL t23, b1, t23 + MUL t24, b1, t24 + NMSUB t11, t11, b2, t21 + NMSUB t12, t12, b2, t22 + NMSUB t13, t13, b2, t23 + NMSUB t14, t14, b2, t24 + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + MUL t12, b3, t12 + MUL t13, b3, t13 + MUL t14, b3, t14 + + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE + daddiu CO3, CO3, -2 * SIZE + daddiu CO4, CO4, -2 * SIZE + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + ST t21, 4 * SIZE(BO) + ST t22, 5 * SIZE(BO) + ST t23, 6 * SIZE(BO) + ST t24, 7 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) 
+ ST t22, 1 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + + + daddiu KK, KK, -2 # the length of rectangular data part increases by 2 + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + +.L20: + dsra I, M, 2 # I=MC/4 + blez I, .L29 + nop + +.L11: # mr=4 + dsll TEMP, K, 2 + BASE_SHIFT # TEMP=KC*MR*data_Byte + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai + dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai + dsll TEMP, KK, 2 + BASE_SHIFT # KK*NR*data_Byte + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(BO) # get 4b + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MOV t13, t11 # clear result registers + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L15 + nop + + .align 3 +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * 
SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a5, b5 # 4th compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 + + daddiu L, L, -1 + bgtz L, .L12 + nop + + + .align 3 +.L15: + andi L, TEMP, 3 + blez L, .L18 + nop + + .align 3 +.L16: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, 
a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L16 + nop + + +.L18: # deal with the triangular data part of panel Ai + daddiu TEMP, KK, -4 # + + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L # AO point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B + LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + + LD b5, 4 * SIZE(BO) # sb store in row major + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t21, b5, t21 + SUB t22, b6, t22 + SUB t23, b7, t23 + SUB t24, b8, t24 + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + SUB t31, b1, t31 + SUB t32, b2, t32 + SUB t33, b3, t33 + SUB t34, b4, t34 + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + SUB t41, b5, t41 + SUB t42, b6, t42 + SUB t43, b7, t43 + SUB t44, b8, t44 + + + LD b1, 15 * SIZE(AO) + LD b2, 14 * SIZE(AO) + LD b4, 13 * SIZE(AO) + LD b7, 12 * SIZE(AO) + + MUL t41, b1, t41 + MUL t42, b1, t42 + MUL t43, b1, t43 + MUL t44, b1, t44 + NMSUB t31, t31, b2, t41 + NMSUB t32, t32, b2, t42 + NMSUB t33, t33, b2, t43 + NMSUB t34, t34, b2, t44 + NMSUB t21, t21, b4, t41 + NMSUB t22, t22, b4, t42 + NMSUB t23, t23, b4, t43 + NMSUB t24, t24, b4, t44 + NMSUB t11, t11, b7, t41 + NMSUB t12, 
t12, b7, t42 + NMSUB t13, t13, b7, t43 + NMSUB t14, t14, b7, t44 + + + + LD b3, 10 * SIZE(AO) + LD b5, 9 * SIZE(AO) + LD b8, 8 * SIZE(AO) + MUL t31, b3, t31 + MUL t32, b3, t32 + MUL t33, b3, t33 + MUL t34, b3, t34 + NMSUB t21, t21, b5, t31 + NMSUB t22, t22, b5, t32 + NMSUB t23, t23, b5, t33 + NMSUB t24, t24, b5, t34 + NMSUB t11, t11, b8, t31 + NMSUB t12, t12, b8, t32 + NMSUB t13, t13, b8, t33 + NMSUB t14, t14, b8, t34 + + + + LD b6, 5 * SIZE(AO) + LD b1, 4 * SIZE(AO) + MUL t21, b6, t21 + MUL t22, b6, t22 + MUL t23, b6, t23 + MUL t24, b6, t24 + NMSUB t11, t11, b1, t21 + NMSUB t12, t12, b1, t22 + NMSUB t13, t13, b1, t23 + NMSUB t14, t14, b1, t24 + + + + LD b2, 0 * SIZE(AO) + MUL t11, b2, t11 + MUL t12, b2, t12 + MUL t13, b2, t13 + MUL t14, b2, t14 + + daddiu CO1, CO1, -4 * SIZE # modify + daddiu CO2, CO2, -4 * SIZE + daddiu CO3, CO3, -4 * SIZE + daddiu CO4, CO4, -4 * SIZE + + + ST t11, 0 * SIZE(BO) # update packed B + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + ST t21, 4 * SIZE(BO) + ST t22, 5 * SIZE(BO) + ST t23, 6 * SIZE(BO) + ST t24, 7 * SIZE(BO) + ST t31, 8 * SIZE(BO) + ST t32, 9 * SIZE(BO) + ST t33, 10 * SIZE(BO) + ST t34, 11 * SIZE(BO) + ST t41, 12 * SIZE(BO) + ST t42, 13 * SIZE(BO) + ST t43, 14 * SIZE(BO) + ST t44, 15 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t33, 2 * SIZE(CO3) + ST t43, 3 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + + + daddiu KK, KK, -4 # KC-KK is the length of the rectangular data part, LN compute from bottom to top so KK-=4 + daddiu I, I, -1 + + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + MOV t12, a1 + MOV t22, a1 + MOV t32, a1 + MOV t42, a1 + bgtz I, .L11 + nop + + .align 3 + +.L29: + dsll TEMP, K, 2 
+ BASE_SHIFT + daddu B, B, TEMP # B point to next Bj + + bgtz J, .L10 + nop + + + .align 3 +.L30: + andi J, N, 2 # nr=2 + blez J, .L70 + nop + + move CO1, C + daddu CO2, C, LDC + + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + daddu KK, M, OFFSET + move AORIG, A # reset A + + daddu C, CO2, LDC # fixed + + andi I, M, 1 # mr=1 + blez I, .L60 + nop + + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of every panel of Ai + + dsll L, KK, BASE_SHIFT # mr=1 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + + daddu AO, AORIG, L # AO point to rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + MOV t12, t11 # clear result registers + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L65 + nop + + + .align 3 +.L62: + LD a5, 1 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t12, t12, a1, b2 + + LD a3, 2 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2nd compute + MADD t12, t12, a5, b6 + + LD a7, 3 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t12, t12, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t12, t12, a7, b8 + + daddiu L, L, -1 + bgtz L, .L62 + nop + + .align 3 + +.L65: + andi L, TEMP, 3 # L = (K - KK) % 4 leftover k-steps + blez L, .L68 + nop + + .align 3 +.L66: + MADD t11, t11, a1, b1 # remainder loop: one k-step per pass + MADD t21, t21, a2, b1 # NOTE(review): a2 is never loaded on this mr=1 path and t21/t22 are not stored at .L68 - looks like dead work, confirm + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + daddiu AO, AO, 1 * SIZE # AO += mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L66 + nop + +.L68: + daddiu TEMP, KK, -1 # mr=1 + + dsll L, TEMP, 
BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L # Ao point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + MUL t12, b3, t12 + + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + + daddiu KK, KK, -1 + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + + + +.L60: + andi I, M, 2 + blez I, .L40 + nop + + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai + + dsll L, KK, 1 + BASE_SHIFT # mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + + daddu AO, AORIG, L # AO point to rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + + MOV t12, t11 # clear result registers + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L45 + nop + + + .align 3 +.L42: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t21, t21, a4, b3 + MADD t12, t12, a3, b4 + MADD t22, t22, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t21, t21, a8, b7 + MADD t12, 
t12, a7, b8 + MADD t22, t22, a8, b8 + + daddiu L, L, -1 + bgtz L, .L42 + nop + + .align 3 + +.L45: + andi L, TEMP, 3 + blez L, .L48 + nop + + .align 3 +.L46: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L46 + nop + +.L48: + daddiu TEMP, KK, -2 + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L # Ao point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t21, b3, t21 + SUB t22, b4, t22 + + LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b2, 2 * SIZE(AO) + MUL t21, b1, t21 + MUL t22, b1, t22 + NMSUB t11, t11, b2, t21 + NMSUB t12, t12, b2, t22 + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + MUL t12, b3, t12 + + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE + + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t21, 2 * SIZE(BO) + ST t22, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + + daddiu KK, KK, -2 + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + +.L40: + dsra I, M, 2 # I = mc/4 + blez I, .L49 + nop + +.L31: + dsll TEMP, K, 2 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai + dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # 
get 4a + + LD b1, 0 * SIZE(BO) # get 4b + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L35 + nop + + + .align 3 +.L32: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b3 # 3rd compute + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + MADD t12, t12, a1, b4 + MADD t22, t22, a2, b4 + MADD t32, t32, a3, b4 + MADD t42, t42, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a5, b7 # 4th compute + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + MADD t12, t12, a5, b8 + MADD t22, t22, a6, b8 + MADD t32, t32, a7, b8 + MADD t42, t42, a8, b8 + + daddiu L, L, -1 + bgtz L, .L32 + nop + + + .align 3 + +.L35: + andi L, TEMP, 3 + blez L, .L38 + nop + + .align 3 +.L36: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 2 * SIZE # BO += 
2nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L36 + nop + + +.L38: # + daddiu TEMP, KK, -4 + dsll L, TEMP, 2 + BASE_SHIFT # mr=4 + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AORIG, L # AO point to the triangular data part + daddu BO, B, TEMP + + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t21, b3, t21 + SUB t22, b4, t22 + SUB t31, b5, t31 + SUB t32, b6, t32 + SUB t41, b7, t41 + SUB t42, b8, t42 + + + LD b1, 15 * SIZE(AO) + LD b2, 14 * SIZE(AO) + LD b4, 13 * SIZE(AO) + LD b7, 12 * SIZE(AO) + + MUL t41, b1, t41 + MUL t42, b1, t42 + NMSUB t31, t31, b2, t41 + NMSUB t32, t32, b2, t42 + NMSUB t21, t21, b4, t41 + NMSUB t22, t22, b4, t42 + NMSUB t11, t11, b7, t41 + NMSUB t12, t12, b7, t42 + + + + LD b3, 10 * SIZE(AO) + LD b5, 9 * SIZE(AO) + LD b8, 8 * SIZE(AO) + MUL t31, b3, t31 + MUL t32, b3, t32 + NMSUB t21, t21, b5, t31 + NMSUB t22, t22, b5, t32 + NMSUB t11, t11, b8, t31 + NMSUB t12, t12, b8, t32 + + + + LD b6, 5 * SIZE(AO) + LD b1, 4 * SIZE(AO) + MUL t21, b6, t21 + MUL t22, b6, t22 + NMSUB t11, t11, b1, t21 + NMSUB t12, t12, b1, t22 + + + LD b2, 0 * SIZE(AO) + MUL t11, b2, t11 + MUL t12, b2, t12 + + daddiu CO1, CO1, -4 * SIZE + daddiu CO2, CO2, -4 * SIZE + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t21, 2 * SIZE(BO) + ST t22, 3 * SIZE(BO) + ST t31, 4 * SIZE(BO) + ST t32, 5 * SIZE(BO) + ST t41, 6 * SIZE(BO) + ST t42, 7 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + + daddiu KK, KK, -4 + + MTC $0, t11 + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + daddiu I, I, -1 + bgtz I, .L31 + nop + + + + 
.align 3 +.L49: + dsll TEMP, K, 1 + BASE_SHIFT # nr=2 + daddu B, B, TEMP + + .align 3 + +.L70: + andi J, N, 1 # nr=1 + blez J, .L999 # END + nop + + move CO1, C + + daddu KK, M, OFFSET + move AORIG, A # reset A + + andi I, M, 1 # mr=1 + blez I, .L90 + NOP + + MTC $0, t11 + + dsll TEMP, K, BASE_SHIFT # mr=1 + dsubu AORIG, AORIG, TEMP + + dsll L, KK, BASE_SHIFT + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, L + + dsubu TEMP, K, KK + + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L95 + nop + + .align 3 +.L92: + LD a5, 1 * SIZE(AO) + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + + LD a3, 2 * SIZE(AO) + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2nd compute + + LD a7, 3 * SIZE(AO) + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + + daddiu L, L, -1 + bgtz L, .L92 + nop + + .align 3 + +.L95: + andi L, TEMP, 3 # L = (K - KK) % 4 leftover k-steps + blez L, .L98 + nop + + .align 3 +.L96: + MADD t11, t11, a1, b1 # remainder loop: one k-step per pass + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L96 + nop + + +.L98: + daddiu TEMP, KK, -1 # mr=1 + dsll TEMP, TEMP, BASE_SHIFT + + daddu AO, AORIG, TEMP # AO point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + + SUB t11, b1, t11 + + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + + daddiu CO1, CO1, -1 * SIZE + + ST t11, 0 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + + daddiu KK, KK, -1 + + +.L90: + andi I, M, 2 + blez I, .L80 + NOP + + MTC $0, t11 + MOV t21, t11 # clear result registers + + dsll TEMP, K, 1+BASE_SHIFT # mr=2 + dsubu AORIG, AORIG, TEMP + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, 
TEMP + + dsubu TEMP, K, KK + + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L85 + nop + + .align 3 +.L82: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t21, t21, a4, b3 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t21, t21, a8, b7 + + daddiu L, L, -1 + bgtz L, .L82 + nop + + .align 3 + +.L85: + andi L, TEMP, 3 + blez L, .L88 + nop + + .align 3 +.L86: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L86 + nop + + +.L88: + daddiu TEMP, KK, -2 # mr=2 + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu AO, AORIG, L # AO point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + + LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b2, 2 * SIZE(AO) + MUL t21, b1, t21 + NMSUB t11, t11, b2, t21 + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + + daddiu CO1, CO1, -2 * SIZE + + ST t11, 0 * SIZE(BO) + ST t21, 1 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + + daddiu KK, KK, -2 + + + .align 3 +.L80: + dsra I, M, 2 + blez I, .L89 + nop + +.L71: + dsll TEMP, K, 2 + BASE_SHIFT # mr=4 + dsubu AORIG, AORIG, TEMP + + dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll TEMP, KK, 0 + BASE_SHIFT # nr=1 + + daddu AO, AORIG, L # AO point to 
the rectangular + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(BO) # get 4b + + dsra L, TEMP, 2 + blez L, .L75 + nop # reset B + + .align 3 +.L72: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a1, b3 # 3rd compute + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a5, b7 # 4th compute + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + daddiu L, L, -1 + bgtz L, .L72 + nop + + .align 3 + +.L75: + andi L, TEMP, 3 + blez L, .L78 + nop + + .align 3 +.L76: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L76 + nop + +.L78: + daddiu TEMP, KK, -4 # mr=4 + + dsll L, TEMP, 2 + BASE_SHIFT # mr=4 + dsll TEMP, TEMP, 0 + 
BASE_SHIFT # nr=1 + daddu AO, AORIG, L # AO point to the triangular + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD b1, 15 * SIZE(AO) + LD b2, 14 * SIZE(AO) + LD b4, 13 * SIZE(AO) + LD b7, 12 * SIZE(AO) + MUL t41, b1, t41 + NMSUB t31, t31, b2, t41 + NMSUB t21, t21, b4, t41 + NMSUB t11, t11, b7, t41 + + + + LD b3, 10 * SIZE(AO) + LD b5, 9 * SIZE(AO) + LD b8, 8 * SIZE(AO) + MUL t31, b3, t31 + NMSUB t21, t21, b5, t31 + NMSUB t11, t11, b8, t31 + + + + LD b6, 5 * SIZE(AO) + LD b1, 4 * SIZE(AO) + MUL t21, b6, t21 + NMSUB t11, t11, b1, t21 + + + + LD b2, 0 * SIZE(AO) + MUL t11, b2, t11 + + daddiu CO1, CO1, -4 * SIZE + + ST t11, 0 * SIZE(BO) + ST t21, 1 * SIZE(BO) + ST t31, 2 * SIZE(BO) + ST t41, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + + daddiu KK, KK, -4 + daddiu I, I, -1 + bgtz I, .L71 + nop + + + .align 3 +.L89: + dsll TEMP, K, BASE_SHIFT # nr=1 + daddu B, B, TEMP + + + + .align 3 + +.L999: + LDARG $16, 0($sp) # restore the registers saved at entry + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) # NOTE(review): offset 112 is also the $25 slot loaded just above - the two slots overlap when __64BIT__ is undefined, confirm frame layout + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 # pop the 144-byte frame (presumably in the delay slot of the jump - confirm noreorder is in effect) + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_LT_loongson3a.S b/kernel/mips64/trsm_kernel_LT_loongson3a.S new file mode 100644 index 000000000..4114d94ef --- /dev/null +++ b/kernel/mips64/trsm_kernel_LT_loongson3a.S @@ -0,0 +1,1783 @@ +#define REALNAME ASMNAME + +#define ASSEMBLER +#include "common.h" + + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 
+ +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 + +#define t11 $f16 +#define t21 $f17 +#define t31 $f18 +#define t41 $f19 + +#define t12 $f20 +#define t22 $f21 +#define t32 $f22 +#define t42 $f23 + +#define t13 $f24 +#define t23 $f25 +#define t33 $f26 +#define t43 $f27 + +#define t14 $f28 +#define t24 $f29 +#define t34 $f30 +#define t44 $f31 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 # open a 144-byte save frame + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) # NOTE(review): t24/t34/t44 are $f29-$f31, which this kernel writes but does not save here - confirm against the target ABI + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) # NOTE(review): stores to offset 112, the slot SDARG $25 just wrote - overlap when __64BIT__ is undefined, confirm + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + # LT compute from left to right, top to bottom + LDARG OFFSET, 144($sp) # OFFSET is read from just above this 144-byte frame, i.e. the caller side of the stack + dsll LDC, LDC, BASE_SHIFT # LDC <<= BASE_SHIFT (presumably element count to bytes - confirm) + + dsra J, N, 2 # j = nc/4 + blez J, .L30 + nop + +.L10: # nr=4 + daddiu J, J, -1 + move CO1, C + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + dsra I, M, 2 # i = mc/4 + move KK, OFFSET # kk is the length of the rectangular data part of panel Ai + move AO, A # reset A + daddu C, CO4, LDC # fixed pointer C, the write back address + blez I, .L20 + nop + + +.L11: # mr=4 + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + 
LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(B) # get 4b + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + MOV t13, t11 # clear result registers + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + dsra L, KK, 2 # L = kk/4 + blez L, .L15 + move BO, B # + + + .align 3 +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD 
t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a5, b5 # 4th compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 + + daddiu L, L, -1 + bgtz L, .L12 + nop + + + .align 3 +.L15: + andi L, KK, 3 # the remainder part: KK-KK/4 + blez L, .L18 + nop + + .align 3 +.L16: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L16 + nop + + +.L18: # deal with the triangular data part of panel Ai + LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B + LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, 
b3, t13 + SUB t14, b4, t14 + + LD b5, 4 * SIZE(BO) # sb store in row major + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t21, b5, t21 + SUB t22, b6, t22 + SUB t23, b7, t23 + SUB t24, b8, t24 + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + SUB t31, b1, t31 + SUB t32, b2, t32 + SUB t33, b3, t33 + SUB t34, b4, t34 + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + SUB t41, b5, t41 + SUB t42, b6, t42 + SUB t43, b7, t43 + SUB t44, b8, t44 + + + LD a1, 0 * SIZE(AO) # sa stores in col major + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MUL t11, a1, t11 + MUL t12, a1, t12 + MUL t13, a1, t13 + MUL t14, a1, t14 + NMSUB t21, t21, a2, t11 + NMSUB t22, t22, a2, t12 + NMSUB t23, t23, a2, t13 + NMSUB t24, t24, a2, t14 + NMSUB t31, t31, a3, t11 + NMSUB t32, t32, a3, t12 + NMSUB t33, t33, a3, t13 + NMSUB t34, t34, a3, t14 + NMSUB t41, t41, a4, t11 + NMSUB t42, t42, a4, t12 + NMSUB t43, t43, a4, t13 + NMSUB t44, t44, a4, t14 + + + LD a5, 5 * SIZE(AO) + LD a6, 6 * SIZE(AO) + LD a7, 7 * SIZE(AO) + MUL t21, a5, t21 + MUL t22, a5, t22 + MUL t23, a5, t23 + MUL t24, a5, t24 + NMSUB t31, t31, a6, t21 + NMSUB t32, t32, a6, t22 + NMSUB t33, t33, a6, t23 + NMSUB t34, t34, a6, t24 + NMSUB t41, t41, a7, t21 + NMSUB t42, t42, a7, t22 + NMSUB t43, t43, a7, t23 + NMSUB t44, t44, a7, t24 + + + LD a8, 10 * SIZE(AO) + LD a1, 11 * SIZE(AO) + MUL t31, a8, t31 + MUL t32, a8, t32 + MUL t33, a8, t33 + MUL t34, a8, t34 + NMSUB t41, t41, a1, t31 + NMSUB t42, t42, a1, t32 + NMSUB t43, t43, a1, t33 + NMSUB t44, t44, a1, t34 + + + LD a2, 15 * SIZE(AO) + MUL t41, a2, t41 + MUL t42, a2, t42 + MUL t43, a2, t43 + MUL t44, a2, t44 + + ST t11, 0 * SIZE(BO) # update packed B + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + ST t21, 4 * SIZE(BO) + ST t22, 5 * SIZE(BO) + ST t23, 6 * SIZE(BO) + ST t24, 7 * SIZE(BO) + ST t31, 8 * SIZE(BO) + ST t32, 9 * 
SIZE(BO) + ST t33, 10 * SIZE(BO) + ST t34, 11 * SIZE(BO) + ST t41, 12 * SIZE(BO) + ST t42, 13 * SIZE(BO) + ST t43, 14 * SIZE(BO) + ST t44, 15 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t33, 2 * SIZE(CO3) + ST t43, 3 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE # fixed pointers + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # mov AO to the end of panel Ai + daddu BO, BO, TEMP # mov BO to the end of panel Bj + + daddiu KK, KK, 4 # the length of rectangular data part increases by 4 + daddiu I, I, -1 + + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + MOV t12, a1 + MOV t22, a1 + MOV t32, a1 + MOV t42, a1 + bgtz I, .L11 + nop + + + .align 3 +.L20: + andi I, M, 2 # mr=2,nr=4 + blez I, .L50 + nop + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + + LD b1, 0 * SIZE(B) # get 4b + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 + blez L, .L25 + move BO, B + + + .align 3 +.L22: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b1, 8 * SIZE(BO) + LD 
b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 # 3rd compute + MADD t21, t21, a4, b1 + MADD t12, t12, a3, b2 + MADD t22, t22, a4, b2 + MADD t13, t13, a3, b3 + MADD t23, t23, a4, b3 + MADD t14, t14, a3, b4 + MADD t24, t24, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 # 4th compute + MADD t21, t21, a8, b5 + MADD t12, t12, a7, b6 + MADD t22, t22, a8, b6 + MADD t13, t13, a7, b7 + MADD t23, t23, a8, b7 + MADD t14, t14, a7, b8 + MADD t24, t24, a8, b8 + + daddiu L, L, -1 + bgtz L, .L22 + nop + + + .align 3 +.L25: + andi L, KK, 3 + blez L, .L28 + nop + + .align 3 +.L26: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L26 + nop + + +.L28: # deal with the triangular part + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + SUB t21, b5, t21 + SUB t22, b6, t22 + SUB t23, b7, t23 + SUB 
t24, b8, t24 + + + LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b2, 1 * SIZE(AO) + MUL t11, b1, t11 + MUL t12, b1, t12 + MUL t13, b1, t13 + MUL t14, b1, t14 + NMSUB t21, t21, b2, t11 + NMSUB t22, t22, b2, t12 + NMSUB t23, t23, b2, t13 + NMSUB t24, t24, b2, t14 + + LD b3, 3 * SIZE(AO) + MUL t21, b3, t21 + MUL t22, b3, t22 + MUL t23, b3, t23 + MUL t24, b3, t24 + + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + ST t21, 4 * SIZE(BO) + ST t22, 5 * SIZE(BO) + ST t23, 6 * SIZE(BO) + ST t24, 7 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + + + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # mov AO to the end of Ai + daddu BO, BO, TEMP # mov BO to the end of Bj + + daddiu KK, KK, 2 # the length of rectangular data part increases by 2 + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + MOV t12, a1 + MOV t22, a1 + MOV t32, a1 + MOV t42, a1 + + + .align 3 +.L50: + andi I, M, 1 # mr=1,nr=4 + blez I, .L29 + nop + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + + LD b1, 0 * SIZE(B) # get 4b + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 + blez L, .L55 + move BO, B + + + .align 3 +.L52: + LD a5, 1 * SIZE(AO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + LD a3, 2 * SIZE(AO) + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 
11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2nd compute + MADD t12, t12, a5, b6 + MADD t13, t13, a5, b7 + MADD t14, t14, a5, b8 + + LD a7, 3 * SIZE(AO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 # 3rd compute + MADD t12, t12, a3, b2 + MADD t13, t13, a3, b3 + MADD t14, t14, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # preload operands for the next pass + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 # 4th compute + MADD t12, t12, a7, b6 + MADD t13, t13, a7, b7 + MADD t14, t14, a7, b8 + + daddiu L, L, -1 + bgtz L, .L52 # repeat while unrolled passes remain + nop + + + .align 3 +.L55: + andi L, KK, 3 # L = KK & 3, the k steps left after the 4x-unrolled loop + blez L, .L58 + nop + + .align 3 +.L56: + MADD t11, t11, a1, b1 # remainder loop: one k step per pass + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L56 + nop + + +.L58: # deal with the triangular part + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + + + LD b1, 0 * SIZE(AO) # triangular part: the single factor read from AO scales all four results + MUL t11, b1, t11 + MUL t12, b1, t12 + MUL t13, b1, t13 + MUL t14, b1, t14 + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + + + dsubu TEMP, K, KK + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of Ai + daddu BO, BO, TEMP # move BO to the end of Bj + + daddiu KK, KK, 1 # the 
length of rectangular data part increases by 1 + + .align 3 +.L29: + move B, BO # fixed panel Bj + bgtz J, .L10 + nop + + + .align 3 +.L30: + andi J, N, 2 # nr=2 + blez J, .L70 + nop + + move CO1, C + daddu CO2, C, LDC + + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + move KK, OFFSET + move AO, A # reset A + daddu C, CO2, LDC # advance C past the two columns addressed by CO1 and CO2 + + dsra I, M, 2 # I = mc/4 + blez I, .L40 + nop + +.L31: + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + LD a1, 0 * SIZE(AO) # this part computes the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(B) # get 2b + LD b2, 1 * SIZE(B) + + dsra L, KK, 2 # L=kk/4 + blez L, .L35 + move BO, B # reset B + + + .align 3 +.L32: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2nd compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b3 # 3rd compute + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + MADD t12, t12, a1, b4 + MADD t22, t22, a2, b4 + MADD t32, t32, a3, b4 + MADD t42, t42, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD 
a4, 3 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a5, b7 # 4th compute + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + MADD t12, t12, a5, b8 + MADD t22, t22, a6, b8 + MADD t32, t32, a7, b8 + MADD t42, t42, a8, b8 + + daddiu L, L, -1 + bgtz L, .L32 + nop + + + .align 3 + +.L35: + andi L, KK, 3 + blez L, .L38 + nop + + .align 3 +.L36: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L36 + nop + + +.L38: # + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t21, b3, t21 + SUB t22, b4, t22 + SUB t31, b5, t31 + SUB t32, b6, t32 + SUB t41, b7, t41 + SUB t42, b8, t42 + + LD a1, 0 * SIZE(AO) # sa stores in col major + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MUL t11, a1, t11 + MUL t12, a1, t12 + NMSUB t21, t21, a2, t11 + NMSUB t22, t22, a2, t12 + NMSUB t31, t31, a3, t11 + NMSUB t32, t32, a3, t12 + NMSUB t41, t41, a4, t11 + NMSUB t42, t42, a4, t12 + + + LD a5, 5 * SIZE(AO) + LD a6, 6 * SIZE(AO) + LD a7, 7 * SIZE(AO) + MUL t21, a5, t21 + MUL t22, a5, t22 + NMSUB t31, t31, a6, t21 + NMSUB t32, t32, a6, t22 + NMSUB t41, t41, a7, t21 + NMSUB t42, t42, a7, t22 + + + LD a8, 10 * SIZE(AO) + LD a1, 11 * SIZE(AO) + MUL t31, a8, t31 + MUL t32, a8, t32 + NMSUB t41, t41, a1, t31 + NMSUB t42, t42, a1, t32 + + + LD a2, 15 * SIZE(AO) + MUL t41, a2, t41 + MUL t42, a2, t42 + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t21, 2 * SIZE(BO) 
+ ST t22, 3 * SIZE(BO) + ST t31, 4 * SIZE(BO) + ST t32, 5 * SIZE(BO) + ST t41, 6 * SIZE(BO) + ST t42, 7 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of Ai + daddu BO, BO, TEMP + + daddiu KK, KK, 4 # + + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + + daddiu I, I, -1 + bgtz I, .L31 + nop + + + .align 3 +.L40: + andi I, M, 2 + blez I, .L60 + nop + + MOV t12, t11 # clear result registers + MOV t22, t21 + MOV t32, t31 + MOV t42, t41 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + + dsra L, KK, 2 + blez L, .L45 + move BO, B # reset B + + + .align 3 +.L42: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t21, t21, a4, b3 + MADD t12, t12, a3, b4 + MADD t22, t22, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t21, t21, a8, b7 + MADD t12, t12, a7, b8 + MADD t22, t22, a8, b8 + + daddiu L, L, -1 + bgtz L, .L42 + nop + + .align 3 + +.L45: + andi L, KK, 3 + blez L, .L48 + nop + + .align 3 +.L46: + MADD t11, 
t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L46 + nop + +.L48: + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t21, b3, t21 + SUB t22, b4, t22 + + LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b2, 1 * SIZE(AO) + MUL t11, b1, t11 + MUL t12, b1, t12 + NMSUB t21, t21, b2, t11 + NMSUB t22, t22, b2, t12 + + LD b3, 3 * SIZE(AO) + MUL t21, b3, t21 + MUL t22, b3, t22 + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t21, 2 * SIZE(BO) + ST t22, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 2 + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + + + .align 3 +.L60: + andi I, M, 1 # mr=1 + blez I, .L49 + nop + + MOV t12, t11 # clear result registers + MOV t22, t21 + MOV t32, t31 + MOV t42, t41 + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + + dsra L, KK, 2 + blez L, .L65 + move BO, B # reset B + + + .align 3 +.L62: + LD a5, 1 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t12, t12, a1, b2 + + LD a3, 2 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t12, t12, a5, b6 + + LD a7, 3 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t12, t12, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 
0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t12, t12, a7, b8 + + daddiu L, L, -1 + bgtz L, .L62 + nop + + .align 3 + +.L65: + andi L, KK, 3 + blez L, .L68 + nop + + .align 3 +.L66: + MADD t11, t11, a1, b1 # 3rd compute + MADD t12, t12, a1, b2 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L66 + nop + +.L68: + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + + LD b1, 0 * SIZE(AO) # computes the triangular_part + MUL t11, b1, t11 + MUL t12, b1, t12 + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 1 + + .align 3 +.L49: + move B, BO + + .align 3 + +.L70: + andi J, N, 1 # nr=1 + blez J, .L999 # END + nop + + move CO1, C + + move KK, OFFSET + move AO, A + + dsra I, M, 2 + blez I, .L80 + nop + +.L71: + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(B) # get 4b + + dsra L, KK, 2 + blez L, .L75 + move BO, B # reset B + + .align 3 +.L72: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + LD a5, 12 * 
SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a1, b3 # 3rd compute + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a5, b7 # 4th compute + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + daddiu L, L, -1 + bgtz L, .L72 + nop + + .align 3 + +.L75: + andi L, KK, 3 + blez L, .L78 + nop + + .align 3 +.L76: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L76 + nop + +.L78: + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD a1, 0 * SIZE(AO) # sa stores in col major + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MUL t11, a1, t11 + NMSUB t21, t21, a2, t11 + NMSUB t31, t31, a3, t11 + NMSUB t41, t41, a4, t11 + + LD a5, 5 * SIZE(AO) + LD a6, 6 * SIZE(AO) + LD a7, 7 * SIZE(AO) + MUL t21, a5, t21 + NMSUB t31, t31, a6, t21 + NMSUB t41, t41, a7, t21 + + LD a8, 10 * SIZE(AO) + LD a1, 11 * SIZE(AO) + MUL t31, a8, t31 + NMSUB t41, t41, a1, t31 + + LD a2, 15 * SIZE(AO) + MUL t41, a2, t41 + + + ST t11, 0 * SIZE(BO) + ST t21, 1 * SIZE(BO) + ST t31, 2 * SIZE(BO) + ST t41, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + daddiu CO1, CO1, 4 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + 
daddu BO, BO, TEMP + + daddiu KK, KK, 4 + daddiu I, I, -1 + bgtz I, .L71 + nop + + + .align 3 + +.L80: + andi I, M, 2 + blez I, .L90 + NOP + + MTC $0, t11 + MOV t21, t11 # clear result registers + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(B) + + dsra L, KK, 2 + blez L, .L85 + move BO, B + + .align 3 +.L82: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t21, t21, a4, b3 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t21, t21, a8, b7 + + daddiu L, L, -1 + bgtz L, .L82 + nop + + .align 3 + +.L85: + andi L, KK, 3 + blez L, .L88 + nop + + .align 3 +.L86: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L86 + nop + + +.L88: + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + + LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b2, 1 * SIZE(AO) + MUL t11, b1, t11 + NMSUB t21, t21, b2, t11 + + LD b3, 3 * SIZE(AO) + MUL t21, b3, t21 + + ST t11, 0 * SIZE(BO) + ST t21, 1 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + + + daddiu CO1, CO1, 2 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 2 + + + .align 3 +.L90: + andi I, M, 1 # mr=1 + blez I, .L89 + NOP + + MTC $0, t11 + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(B) + + 
dsra L, KK, 2 + blez L, .L95 + move BO, B + + .align 3 +.L92: + LD a5, 1 * SIZE(AO) + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + + LD a3, 2 * SIZE(AO) + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2nd compute + + LD a7, 3 * SIZE(AO) + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + + daddiu L, L, -1 + bgtz L, .L92 + nop + + .align 3 +.L95: + andi L, KK, 3 # L = KK & 3, the k steps left after the 4x-unrolled loop + blez L, .L98 + nop + + .align 3 +.L96: + MADD t11, t11, a1, b1 # remainder loop: one k step per pass + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L96 + nop + + +.L98: + LD b1, 0 * SIZE(BO) + + SUB t11, b1, t11 + + LD b1, 0 * SIZE(AO) # triangular part: the single factor read from AO scales the result + MUL t11, b1, t11 + + ST t11, 0 * SIZE(BO) # the result is stored both to the packed buffer at BO and to C + + ST t11, 0 * SIZE(CO1) + + + daddiu CO1, CO1, 1 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 1 + + + .align 3 +.L89: + move B, BO + + + .align 3 + +.L999: + LDARG $16, 0($sp) # restore the registers saved on the stack, then return + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) # NOTE(review): same offset as the $25 reload just above; confirm the slot layout for builds without __64BIT__ + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 # pop 144 bytes off the stack + + EPILOGUE From fa8e4fd879ea09dd093448e664f1c01ce47d5a1a Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 11 May 2011 01:12:32 +0800 Subject: [PATCH 13/42] Fixed #26 the wrong result of rotmg. Used fabs() instead of abs(). 
--- Changelog.txt | 1 + interface/rotmg.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 0a9f182fa..2035dbce1 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -16,6 +16,7 @@ common: issue #19 on github) * Fixed issue #23. Fixed a bug of f_check script about generating link flags. * Added openblas_set_num_threads for Fortran. + * Fixed #25 a wrong result of rotmg. x86/x86_64: * diff --git a/interface/rotmg.c b/interface/rotmg.c index c37c09914..3db891714 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -7,6 +7,12 @@ #define GAMSQ 16777216.e0 #define RGAMSQ 5.9604645e-8 +#ifdef DOUBLE +#define ABS(x) fabs(x) +#else +#define ABS(x) fabsf(x) +#endif + #ifndef CBLAS void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){ @@ -47,7 +53,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dq2 = dp2 * dy1; dq1 = dp1 * *dx1; - if (! (abs(dq1) > abs(dq2))) goto L40; + if (! (ABS(dq1) > ABS(dq2))) goto L40; dh21 = -(dy1) / *dx1; dh12 = dp2 / dp1; @@ -140,7 +146,7 @@ L150: goto L130; L160: - if (! (abs(*dd2) <= RGAMSQ)) { + if (! (ABS(*dd2) <= RGAMSQ)) { goto L190; } if (*dd2 == ZERO) { @@ -157,7 +163,7 @@ L180: goto L160; L190: - if (! (abs(*dd2) >= GAMSQ)) { + if (! (ABS(*dd2) >= GAMSQ)) { goto L220; } igo = 3; From 29dce62b8f5299201dbe539ccff1b1caf99ad715 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 11 May 2011 10:44:23 +0000 Subject: [PATCH 14/42] Finish dtrsm_kernel_Rx.S on Loongson3A. 
--- kernel/mips64/trsm_kernel_RN_loongson3a.S | 1852 +++++++++++++++++++ kernel/mips64/trsm_kernel_RT_loongson3a.S | 1958 +++++++++++++++++++++ 2 files changed, 3810 insertions(+) create mode 100644 kernel/mips64/trsm_kernel_RN_loongson3a.S create mode 100644 kernel/mips64/trsm_kernel_RT_loongson3a.S diff --git a/kernel/mips64/trsm_kernel_RN_loongson3a.S b/kernel/mips64/trsm_kernel_RN_loongson3a.S new file mode 100644 index 000000000..790d7c981 --- /dev/null +++ b/kernel/mips64/trsm_kernel_RN_loongson3a.S @@ -0,0 +1,1852 @@ +#define REALNAME ASMNAME + +#define ASSEMBLER +#include "common.h" + + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f26 +#define a4 $f27 + +#define a5 $f28 +#define a6 $f29 +#define a7 $f30 +#define a8 $f31 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 + +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define t11 $f10 +#define t21 $f11 +#define t31 $f12 +#define t41 $f13 + +#define t12 $f14 +#define t22 $f15 +#define t32 $f16 +#define t42 $f17 + +#define t13 $f18 +#define t23 $f19 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f22 +#define t24 $f23 +#define t34 $f24 +#define t44 $f25 + + PROLOGUE + + daddiu $sp, $sp, -144 # reserve a 144-byte save area on the stack + + SDARG $16, 0($sp) # save the integer and FP registers this kernel overwrites + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) # NOTE(review): a6-a8 map to $f29-$f31, which are loaded in the compute loops but not saved here; confirm against the callee-saved FP set of the target ABI + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) # NOTE(review): same offset as the $25 slot stored just above, so this store overlaps it when __64BIT__ is undefined; confirm the intended layout + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + # RN compute from top to bottom 
left to right + .align 3 + LDARG OFFSET, 144($sp) # get the last parameter + dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte + + neg KK, OFFSET # for RN OFFSET always 0 + + dsra J, N, 2 # J = NC/4 + blez J, .L30 + NOP + +.L10: + daddiu J, J, -1 + + move CO1, C + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + + move AO, A # A is the retangular matrix and B is the trigular matrix + daddu C, CO4, LDC # Fixed pointer C + + dsra I, M, 2 # I=MC/4 + blez I, .L20 + NOP + + .align 3 +.L11: + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L15 + move BO, B # reset B + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 # fisrt + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 
+ MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 # second + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 # third + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 # fouth + + daddiu L, L, -1 + bgtz L, .L12 + NOP + + +.L15: + andi L, KK, 3 # deal with kc remainder part + blez L, .L18 + NOP + + .align 3 +.L16: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD 
t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 4 * SIZE # BP += 4nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L16 + NOP + + + .align 3 +.L18: # .L18 always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b2, 1 * SIZE(AO) # Fixed results + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) # sa stored as col major + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + SUB t32, b7, t32 + SUB t42, b8, t42 + + LD b1, 8 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 10 * SIZE(AO) + LD b4, 11 * SIZE(AO) + + SUB t13, b1, t13 + SUB t23, b2, t23 + SUB t33, b3, t33 + SUB t43, b4, t43 + + LD b5, 12 * SIZE(AO) + LD b6, 13 * SIZE(AO) + LD b7, 14 * SIZE(AO) + LD b8, 15 * SIZE(AO) + + SUB t14, b5, t14 + SUB t24, b6, t24 + SUB t34, b7, t34 + SUB t44, b8, t44 + + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MUL t11, b1, t11 + MUL t21, b1, t21 + MUL t31, b1, t31 + MUL t41, b1, t41 + NMSUB t12, t12, b2, t11 + NMSUB t22, t22, b2, t21 + NMSUB t32, t32, b2, t31 + NMSUB t42, t42, b2, t41 + NMSUB t13, t13, b3, t11 + NMSUB t23, t23, b3, t21 + NMSUB t33, t33, b3, t31 + NMSUB t43, t43, b3, t41 + NMSUB t14, t14, b4, t11 + NMSUB t24, t24, b4, t21 + NMSUB t34, t34, b4, t31 + NMSUB t44, t44, b4, t41 + + + LD b5, 5 * SIZE(BO) + LD b6, 6 * SIZE(BO) + LD b7, 7 * SIZE(BO) + MUL t12, b5, t12 + MUL t22, b5, t22 + MUL t32, b5, t32 + MUL t42, b5, t42 + NMSUB t13, t13, b6, t12 + NMSUB t23, t23, b6, 
t22 + NMSUB t33, t33, b6, t32 + NMSUB t43, t43, b6, t42 + NMSUB t14, t14, b7, t12 + NMSUB t24, t24, b7, t22 + NMSUB t34, t34, b7, t32 + NMSUB t44, t44, b7, t42 + + + + LD b8, 10 * SIZE(BO) + LD b1, 11 * SIZE(BO) + MUL t13, b8, t13 + MUL t23, b8, t23 + MUL t33, b8, t33 + MUL t43, b8, t43 + NMSUB t14, t14, b1, t13 + NMSUB t24, t24, b1, t23 + NMSUB t34, t34, b1, t33 + NMSUB t44, t44, b1, t43 + + + + LD b2, 15 * SIZE(BO) + MUL t14, b2, t14 + MUL t24, b2, t24 + MUL t34, b2, t34 + MUL t44, b2, t44 + + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + ST t31, 2 * SIZE(AO) + ST t41, 3 * SIZE(AO) + + ST t12, 4 * SIZE(AO) + ST t22, 5 * SIZE(AO) + ST t32, 6 * SIZE(AO) + ST t42, 7 * SIZE(AO) + + ST t13, 8 * SIZE(AO) + ST t23, 9 * SIZE(AO) + ST t33, 10 * SIZE(AO) + ST t43, 11 * SIZE(AO) + + ST t14, 12 * SIZE(AO) + ST t24, 13 * SIZE(AO) + ST t34, 14 * SIZE(AO) + ST t44, 15 * SIZE(AO) + + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t33, 2 * SIZE(CO3) + ST t43, 3 * SIZE(CO3) + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE # fixed address + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + daddiu I, I, -1 + bgtz I, .L11 + NOP + + .align 3 +.L20: + andi I, M, 2 # mr=2 + blez I, .L50 + nop + + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L25 + move BO, B # reset B + +.L22: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 + MADD t21, t21, a4, b1 + + MADD t12, t12, a3, b2 + MADD t22, t22, a4, b2 + + MADD t13, t13, a3, b3 + MADD t23, t23, a4, b3 + + MADD t14, t14, a3, b4 + MADD t24, t24, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * 
SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 + MADD t21, t21, a8, b5 + + MADD t12, t12, a7, b6 + MADD t22, t22, a8, b6 + + MADD t13, t13, a7, b7 + MADD t23, t23, a8, b7 + + MADD t14, t14, a7, b8 + MADD t24, t24, a8, b8 + + daddiu L, L, -1 + bgtz L, .L22 + NOP + + +.L25: + andi L, KK, 3 # L = KK mod 4: k-steps left over after the 4x-unrolled loop + blez L, .L28 + NOP + + .align 3 +.L26: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L26 + NOP + + + .align 3 +.L28: # .L28 always deals with the triangular data part (mr=2, nr=4) + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b2, 1 * SIZE(AO) # Fixed results + + SUB t11, b1, t11 + SUB t21, b2, t21 + + LD b5, 2 * SIZE(AO) + LD b6, 3 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + + LD b3, 4 * SIZE(AO) + LD b4, 5 * SIZE(AO) + + SUB t13, b3, t13 + SUB t23, b4, t23 + + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t14, b7, t14 + SUB t24, b8, t24 + + + + LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MUL t11, b1, t11 + MUL t21, b1, t21 + NMSUB t12, t12, b2, t11 + NMSUB t22, t22, b2, t21 + NMSUB t13, t13, b3, t11 + NMSUB t23, t23, b3, t21 + NMSUB t14, t14, b4, t11 + NMSUB t24, t24, b4, t21 + + + LD b5, 5 * SIZE(BO) + LD b6, 6 * SIZE(BO) + LD b7, 7 * SIZE(BO) + MUL t12, b5, t12 + MUL t22, b5, t22 + NMSUB t13, t13, b6, t12 + NMSUB t23, t23, b6, t22 + NMSUB t14, t14, b7, t12 + NMSUB t24, t24, b7, t22 + + + + LD b8, 10 * SIZE(BO) + LD b1, 11 * SIZE(BO) + MUL t13, b8, t13 + MUL t23, b8, t23 + NMSUB t14, t14,
b1, t13 + NMSUB t24, t24, b1, t23 + + + + LD b2, 15 * SIZE(BO) + MUL t14, b2, t14 + MUL t24, b2, t24 + + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + + ST t12, 2 * SIZE(AO) + ST t22, 3 * SIZE(AO) + + ST t13, 4 * SIZE(AO) + ST t23, 5 * SIZE(AO) + + ST t14, 6 * SIZE(AO) + ST t24, 7 * SIZE(AO) + + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE # fixed address + daddiu CO2, CO2, 2 * SIZE # mr=2 + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, 1 + BASE_SHIFT # mr=2 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + .align 3 +.L50: + andi I, M, 1 # mr=1 + blez I, .L29 + nop + + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L55 + move BO, B # reset B + +.L52: + LD a5, 1 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + LD a3, 2 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + 
MADD t11, t11, a5, b5 + MADD t12, t12, a5, b6 + MADD t13, t13, a5, b7 + MADD t14, t14, a5, b8 + + LD a7, 3 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 + MADD t12, t12, a3, b2 + MADD t13, t13, a3, b3 + MADD t14, t14, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 + MADD t12, t12, a7, b6 + MADD t13, t13, a7, b7 + MADD t14, t14, a7, b8 + + daddiu L, L, -1 + bgtz L, .L52 + NOP + + +.L55: + andi L, KK, 3 # L = KK mod 4: k-steps left over after the 4x-unrolled loop + blez L, .L58 + NOP + + .align 3 +.L56: + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L56 + NOP + + + .align 3 +.L58: # .L58 always deals with the triangular data part (mr=1, nr=4) + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b5, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b7, 3 * SIZE(AO) + + SUB t11, b1, t11 + SUB t12, b5, t12 + SUB t13, b3, t13 + SUB t14, b7, t14 + + + + LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MUL t11, b1, t11 + NMSUB t12, t12, b2, t11 + NMSUB t13, t13, b3, t11 + NMSUB t14, t14, b4, t11 + + + LD b5, 5 * SIZE(BO) + LD b6, 6 * SIZE(BO) + LD b7, 7 * SIZE(BO) + MUL t12, b5, t12 + NMSUB t13, t13, b6, t12 + NMSUB t14, t14, b7, t12 + + + LD b8, 10 * SIZE(BO) + LD b1, 11 * SIZE(BO) + MUL t13, b8, t13 + NMSUB t14, t14, b1, t13 + + + LD b2, 15 * SIZE(BO) + MUL t14, b2, t14 + + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t12, 1 * SIZE(AO) + ST t13, 2 *
SIZE(AO) + ST t14, 3 * SIZE(AO) + + + ST t11, 0 * SIZE(CO1) # write back results + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE # fixed address + daddiu CO2, CO2, 1 * SIZE # mr=1 + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + + + dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + + .align 3 +.L29: + move B, BO # change to next panel of Bj + daddiu KK, KK, 4 # rectangular data length increase by 4 + bgtz J, .L10 + NOP + + + .align 3 + +.L30: + andi J, N, 2 + blez J, .L70 + nop + + move CO1, C + daddu CO2, C, LDC + + move AO, A # A is the rectangular matrix and B is the triangular matrix + daddu C, CO2, LDC # Fixed pointer C + + dsra I, M, 2 # I=MC/4 + blez I, .L40 + NOP + + .align 3 +.L31: + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 2 b + + dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj + blez L, .L35 + move BO, B # reset BO to the start of panel Bj + +.L32: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11,
t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b3 + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + MADD t12, t12, a1, b4 + MADD t22, t22, a2, b4 + MADD t32, t32, a3, b4 + MADD t42, t42, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a5, b7 + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + MADD t12, t12, a5, b8 + MADD t22, t22, a6, b8 + MADD t32, t32, a7, b8 + MADD t42, t42, a8, b8 + + daddiu L, L, -1 + bgtz L, .L32 + NOP + + +.L35: + andi L, KK, 3 # deal with kc remainder part + blez L, .L38 + NOP + + .align 3 +.L36: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 2 * SIZE # BP += 2nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L36 + NOP + + + .align 3 +.L38: # .L38 always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b2, 1 * SIZE(AO) # Fixed results + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) # sa stored as col major + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + SUB t32, b7, t32 + SUB t42, 
b8, t42 + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + LD b2, 1 * SIZE(BO) + MUL t11, b1, t11 + MUL t21, b1, t21 + MUL t31, b1, t31 + MUL t41, b1, t41 + NMSUB t12, t12, b2, t11 + NMSUB t22, t22, b2, t21 + NMSUB t32, t32, b2, t31 + NMSUB t42, t42, b2, t41 + + LD b5, 3 * SIZE(BO) + MUL t12, b5, t12 + MUL t22, b5, t22 + MUL t32, b5, t32 + MUL t42, b5, t42 + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + ST t31, 2 * SIZE(AO) + ST t41, 3 * SIZE(AO) + + ST t12, 4 * SIZE(AO) + ST t22, 5 * SIZE(AO) + ST t32, 6 * SIZE(AO) + ST t42, 7 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE # fixed address + daddiu CO2, CO2, 4 * SIZE + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + daddiu I, I, -1 + bgtz I, .L31 + NOP + + .align 3 +.L40: + andi I, M,2 + blez I,.L60 + nop + + MTC $0, t11 # clear results registers + MOV t21, t11 + + MOV t12, t11 + MOV t22, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L45 + move BO, B # reset B + +.L42: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 + MADD t21, t21, a4, b3 + MADD t12, t12, a3, b4 + MADD t22, t22, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 + MADD t21, t21, a8, b7 + MADD t12, t12, a7, b8 + MADD t22, t22, a8, b8 + + daddiu L, L, -1 + bgtz L, .L42 + NOP + + +.L45: + andi L, KK, 3 # deal with kc remainder part + blez L, .L48 + NOP + + .align 3 +.L46: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 2 * SIZE # BP += 2nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L46 + NOP + + + .align 3 +.L48: # .L48 always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the 
result matrix + LD b2, 1 * SIZE(AO) # Fixed results + + SUB t11, b1, t11 + SUB t21, b2, t21 + + LD b5, 2 * SIZE(AO) + LD b6, 3 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + LD b2, 1 * SIZE(BO) + MUL t11, b1, t11 + MUL t21, b1, t21 + NMSUB t12, t12, b2, t11 + NMSUB t22, t22, b2, t21 + + LD b5, 3 * SIZE(BO) + MUL t12, b5, t12 + MUL t22, b5, t22 + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + ST t12, 2 * SIZE(AO) + ST t22, 3 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE # fixed address + daddiu CO2, CO2, 2 * SIZE + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + + .align 3 +.L60: + andi I,M,1 # nr=2 mr=1 + blez I,.L39 + nop + + MTC $0, t11 # clear results registers + MOV t12, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L65 + move BO, B # reset B + +.L62: + LD a5, 1 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + + LD a3, 2 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t12, t12, a5, b6 + + LD a7, 3 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 + MADD t12, t12, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD 
t11, t11, a7, b7 + MADD t12, t12, a7, b8 + + daddiu L, L, -1 + bgtz L, .L62 + NOP + + +.L65: + andi L, KK, 3 # L = KK mod 4: k-steps left over after the 4x-unrolled loop + blez L, .L68 + NOP + + .align 3 +.L66: + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L66 + NOP + + + .align 3 +.L68: # .L68 always deals with the triangular data part (mr=1, nr=2) + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b5, 1 * SIZE(AO) # Fixed results + + SUB t11, b1, t11 + SUB t12, b5, t12 + + + LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj + LD b2, 1 * SIZE(BO) + MUL t11, b1, t11 + NMSUB t12, t12, b2, t11 + + LD b5, 3 * SIZE(BO) + MUL t12, b5, t12 + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t12, 1 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back results + ST t12, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE # fixed address + daddiu CO2, CO2, 1 * SIZE + + dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AO, L # move AO to the end of this panel.
also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + + .align 3 +.L39: + move B, BO # change to next panel of Bj + daddiu KK, KK, 2 # rectangular data length increase by 2 (nr=2) + + + + .align 3 + +.L70: + andi J, N, 1 # nr=1 + blez J, .L999 + NOP + + move CO1, C + move AO, A + + daddu C, CO1, LDC + + dsra I, M, 2 # I=MC/4 + blez I, .L80 + NOP + + .align 3 +.L71: + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj + + dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj + blez L, .L75 + move BO, B # reset BO to the start of panel Bj + +.L72: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a1, b3 + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a5, b7 + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + daddiu L, L, -1 + bgtz L, .L72 + NOP + + +.L75: + andi L, KK, 3 # L = KK mod 4: k-steps left over after the 4x-unrolled loop + blez L, .L78 + NOP + + .align 3 +.L76: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4,
b1 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 1 * SIZE # BP += 1nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L76 + NOP + + + .align 3 +.L78: # .L78 always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b2, 1 * SIZE(AO) # Fixed results + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) # sa stored as col major + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + MUL t11, b1, t11 + MUL t21, b1, t21 + MUL t31, b1, t31 + MUL t41, b1, t41 + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + ST t31, 2 * SIZE(AO) + ST t41, 3 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + + daddiu CO1, CO1, 4 * SIZE # fixed address + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT # nr=1 + daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + daddiu I, I, -1 + bgtz I, .L71 + NOP + + + .align 3 +.L80: + andi I, M, 2 # mr=2 + blez I, .L90 + nop + + MTC $0, t11 # clear results registers + MOV t21, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L85 + move BO, B # reset B + +.L82: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 + MADD t21, t21, a4, b3 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 + MADD t21, t21, a8, b7 + + daddiu L, L, -1 + bgtz L, .L82 + NOP + + +.L85: + andi L, KK, 3 # deal with kc remainder part + blez L, .L88 + NOP + + .align 3 +.L86: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 1 * SIZE # BP += 1nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L86 + NOP + + + .align 3 +.L88: # .L88 always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b2, 1 * SIZE(AO) # Fixed results + + SUB t11, b1, t11 + SUB t21, b2, t21 + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + MUL t11, b1, t11 + MUL t21, b1, t21 + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + + + 
daddiu CO1, CO1, 2 * SIZE # fixed address + + dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT # nr=1 + daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + + .align 3 +.L90: + andi I, M, 1 # mr=1 + blez I, .L79 + nop + + MTC $0, t11 # clear results registers + + LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa + LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj + + dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj + blez L, .L95 + move BO, B # reset BO to the start of panel Bj + +.L92: + LD a5, 1 * SIZE(AO) + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 + + LD a3, 2 * SIZE(AO) + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 + + LD a7, 3 * SIZE(AO) + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 + + daddiu L, L, -1 + bgtz L, .L92 + NOP + + +.L95: + andi L, KK, 3 # L = KK mod 4: k-steps left over after the 4x-unrolled loop + blez L, .L98 + NOP + + .align 3 +.L96: + MADD t11, t11, a1, b1 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L96 + NOP + + + .align 3 +.L98: # .L98 always deals with the triangular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + + SUB t11, b1, t11 + + + LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj + MUL t11, b1, t11 + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + + ST t11, 0 * SIZE(CO1) # write back results + + + daddiu CO1, CO1, 1 * SIZE # fixed address + + dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT # nr=1 + daddu AO, AO, L # move AO to the end
of this panel. also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + + .align 3 +.L79: + move B, BO + daddiu KK, KK, 1 + + + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_RT_loongson3a.S b/kernel/mips64/trsm_kernel_RT_loongson3a.S new file mode 100644 index 000000000..cf20cf9e0 --- /dev/null +++ b/kernel/mips64/trsm_kernel_RT_loongson3a.S @@ -0,0 +1,1958 @@ +#define REALNAME ASMNAME + +#define ASSEMBLER +#include "common.h" + + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f26 +#define a4 $f27 + +#define a5 $f28 +#define a6 $f29 +#define a7 $f30 +#define a8 $f31 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 + +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define t11 $f10 +#define t21 $f11 +#define t31 $f12 +#define t41 $f13 + +#define t12 $f14 +#define t22 $f15 +#define t32 $f16 +#define t42 $f17 + +#define t13 $f18 +#define t23 $f19 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f22 +#define t24 $f23 +#define t34 $f24 +#define t44 $f25 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + 
SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + + .align 3 # RT compute from right to left + LDARG OFFSET, 144($sp) # get the last parameter + dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte + + mult N, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT # B Representative triangle matrix!!! + daddu B, B, TEMP # B point to the end of sb + # Be carefull B has no effeck of mc!! + mult N, LDC + mflo TEMP + daddu C, C, TEMP # C point to the last colum of blockB + + dsubu KK, K, OFFSET # KC-KK is the length of rectangular data part of Bj + + andi J, N, 1 + blez J, .L30 + nop + + dsll TEMP, K, BASE_SHIFT + dsubu B, B, TEMP # move B to the beginning address of Bj + + dsubu C, C, LDC + + move CO1, C + + move AORIG, A + + dsra I, M, 2 + blez I, .L80 + NOP + +.L31: # mr=4,nr=1 + dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll TEMP, KK, BASE_SHIFT # nr=1 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the retangular data part,also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 4 results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L35 + NOP + + .align 3 + +.L32: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + LD a5, 
12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a1, b3 + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a5, b7 + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + daddiu L, L, -1 + bgtz L, .L32 + NOP + + + .align 3 + +.L35: + andi L, TEMP, 3 # L = (K-KK) mod 4: k-steps left over after the 4x-unrolled loop + blez L, .L38 + NOP + .align 3 + +.L36: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L36 + NOP + + + .align 3 +.L38: + daddiu TEMP, KK, -1 # deal with the triangular data part + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT # nr=1 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO points to the triangular data part + + LD b1, 0 * SIZE(AO) # fixed results + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + + LD b2, 0 * SIZE(BO) + MUL t11, b2, t11 + MUL t21, b2, t21 + MUL t31, b2, t31 + MUL t41, b2, t41 + + + ST t11, 0 * SIZE(AO) # update packed A + ST t21, 1 * SIZE(AO) + ST t31, 2 * SIZE(AO) + ST t41, 3 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + + daddiu CO1, CO1, 4 * SIZE # fixed pointer + + dsll TEMP, K, 2 + BASE_SHIFT + daddu AORIG, AORIG, TEMP # move to next panel Ai + + daddiu I, I, -1 + bgtz I, .L31 + NOP + + + .align 3 +.L80: + andi I, M, 2 + blez I, .L90 + nop + + dsll L, KK, 1 + BASE_SHIFT # mr=2 + dsll TEMP, KK, BASE_SHIFT # nr=1 +
daddu AO, AORIG, L + daddu BO, B, TEMP # BO points to the rectangular data part, also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 2 results registers + MOV t21, t11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L85 + NOP + + .align 3 + +.L82: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 + MADD t21, t21, a4, b3 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr (this loop consumes A offsets 0..7 only) + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 + MADD t21, t21, a8, b7 + + daddiu L, L, -1 + bgtz L, .L82 + NOP + + + .align 3 + +.L85: + andi L, TEMP, 3 # L = (K-KK) mod 4: k-steps left over after the 4x-unrolled loop + blez L, .L88 + NOP + .align 3 + +.L86: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L86 + NOP + + + .align 3 +.L88: + daddiu TEMP, KK, -1 # deal with the triangular data part + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT # nr=1 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO points to the triangular data part + + LD b1, 0 * SIZE(AO) # fixed results + LD b2, 1 * SIZE(AO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + + + LD b2, 0 * SIZE(BO) + MUL t11, b2, t11 + MUL t21, b2, t21 + + + ST t11, 0 * SIZE(AO) # update packed A + ST t21, 1 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + + + daddiu CO1, CO1, 2 * SIZE # fixed pointer + + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP # move to next panel Ai + + + .align 3 +.L90: + andi I, M, 1 + blez I, .L39
+ nop + + dsll L, KK, BASE_SHIFT # mr=1 + dsll TEMP, KK, BASE_SHIFT # nr=1 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the retangular data part,also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 4 results registers + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L95 + NOP + + .align 3 + +.L92: + LD a5, 1 * SIZE(AO) + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 + + LD a3, 2 * SIZE(AO) + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 + + LD a7, 3 * SIZE(AO) + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 + + daddiu L, L, -1 + bgtz L, .L92 + NOP + + + .align 3 + +.L95: + andi L, TEMP, 3 + blez L, .L98 + NOP + .align 3 + +.L96: + MADD t11, t11, a1, b1 + + daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L96 + NOP + + + .align +.L98: + daddiu TEMP, KK, -1 # deal with the triangular data part + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT # nr=1 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the trigular data part + + LD b1, 0 * SIZE(AO) # fixed results + + SUB t11, b1, t11 + + + LD b2, 0 * SIZE(BO) + MUL t11, b2, t11 + + + ST t11, 0 * SIZE(AO) # updata packed A + + ST t11, 0 * SIZE(CO1) # write back + + + daddiu CO1, CO1, 1 * SIZE # fixed pointer + + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP # move to next panel Ai + + +.L39: + daddiu KK, KK, -1 # rectangular data length increased by 1 + + + .align 3 +.L30: # nr=2 + andi J, N, 2 + blez J, .L50 + nop + + dsll TEMP, K, 1 + BASE_SHIFT # Kc*2nr move B to the beginning address of Bj + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 # C + dsubu C, C, TEMP + + move CO1, C + daddu CO2, C, LDC + + move AORIG, A + + dsra I, M, 2 + blez I, .L60 + 
NOP + +.L51: # mr=4,nr=2 + dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the retangular data part,also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 8 results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L55 + NOP + + .align 3 + +.L52: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b3 + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + MADD t12, t12, a1, b4 + MADD t22, t22, a2, b4 + MADD t32, t32, a3, b4 + MADD t42, t42, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a5, b7 + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + MADD t12, t12, a5, b8 + MADD t22, t22, a6, b8 + 
MADD t32, t32, a7, b8 + MADD t42, t42, a8, b8 + + daddiu L, L, -1 + bgtz L, .L52 + NOP + + + .align 3 + +.L55: + andi L, TEMP, 3 + blez L, .L58 + NOP + .align 3 + +.L56: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L56 + NOP + + + .align +.L58: + daddiu TEMP, KK, -2 # deal with the triangular data part + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the trigular data part + + LD b1, 0 * SIZE(AO) # fixed results + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + SUB t32, b7, t32 + SUB t42, b8, t42 + + + LD b8, 3 * SIZE(BO) + LD b1, 2 * SIZE(BO) + MUL t12, b8, t12 + MUL t22, b8, t22 + MUL t32, b8, t32 + MUL t42, b8, t42 + NMSUB t11, t11, b1, t12 + NMSUB t21, t21, b1, t22 + NMSUB t31, t31, b1, t32 + NMSUB t41, t41, b1, t42 + + + LD b2, 0 * SIZE(BO) + MUL t11, b2, t11 + MUL t21, b2, t21 + MUL t31, b2, t31 + MUL t41, b2, t41 + + + ST t11, 0 * SIZE(AO) # updata packed A + ST t21, 1 * SIZE(AO) + ST t31, 2 * SIZE(AO) + ST t41, 3 * SIZE(AO) + + ST t12, 4 * SIZE(AO) + ST t22, 5 * SIZE(AO) + ST t32, 6 * SIZE(AO) + ST t42, 7 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE # fixed pointer + 
daddiu CO2, CO2, 4 * SIZE + + dsll TEMP, K, 2 + BASE_SHIFT + daddu AORIG, AORIG, TEMP # move to next panel Ai + + daddiu I, I, -1 + bgtz I, .L51 + NOP + + + + .align 3 +.L60: + andi I, M, 2 # mr=2 + blez I, .L70 + nop + + dsll L, KK, 1 + BASE_SHIFT # mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the retangular data part,also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 8 results registers + MOV t21, t11 + MOV t12, t11 + MOV t22, t11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L65 + NOP + + .align 3 + +.L62: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 + MADD t21, t21, a4, b3 + + MADD t12, t12, a3, b4 + MADD t22, t22, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 + MADD t21, t21, a8, b7 + + MADD t12, t12, a7, b8 + MADD t22, t22, a8, b8 + + daddiu L, L, -1 + bgtz L, .L62 + NOP + + + .align 3 + +.L65: + andi L, TEMP, 3 + blez L, .L68 + NOP + .align 3 + +.L66: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L66 + 
NOP + + + .align +.L68: + daddiu TEMP, KK, -2 # deal with the triangular data part + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the trigular data part + + LD b1, 0 * SIZE(AO) # fixed results + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t12, b3, t12 + SUB t22, b4, t22 + + + LD b8, 3 * SIZE(BO) + LD b7, 2 * SIZE(BO) + MUL t12, b8, t12 + MUL t22, b8, t22 + NMSUB t11, t11, b7, t12 + NMSUB t21, t21, b7, t22 + + + LD b6, 0 * SIZE(BO) + MUL t11, b6, t11 + MUL t21, b6, t21 + + + ST t11, 0 * SIZE(AO) # updata packed A + ST t21, 1 * SIZE(AO) + ST t12, 2 * SIZE(AO) + ST t22, 3 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE # fixed pointer + daddiu CO2, CO2, 2 * SIZE + + dsll TEMP, K, 1 + BASE_SHIFT # mr=2 + daddu AORIG, AORIG, TEMP # move to next panel Ai + + + + .align 3 +.L70: + andi I, M, 1 # mr=1 + blez I, .L59 + nop + + dsll L, KK, BASE_SHIFT # mr=1 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the retangular data part,also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 8 results registers + MOV t12, t11 + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L75 + NOP + + .align 3 + +.L72: + LD a5, 1 * SIZE(AO) + + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + + LD a3, 2 * SIZE(AO) + + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t12, t12, a5, b6 + + LD a7, 3 * SIZE(AO) + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 + MADD t12, t12, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) + + LD b1, 
0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 + MADD t12, t12, a7, b8 + + daddiu L, L, -1 + bgtz L, .L72 + NOP + + + .align 3 + +.L75: + andi L, TEMP, 3 + blez L, .L78 + NOP + .align 3 + +.L76: + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L76 + NOP + + + .align +.L78: + daddiu TEMP, KK, -2 # deal with the triangular data part + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the trigular data part + + LD b1, 0 * SIZE(AO) # fixed results + LD b2, 1 * SIZE(AO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + + + LD b8, 3 * SIZE(BO) + LD b7, 2 * SIZE(BO) + MUL t12, b8, t12 + NMSUB t11, t11, b7, t12 + + + LD b6, 0 * SIZE(BO) + MUL t11, b6, t11 + + + ST t11, 0 * SIZE(AO) # updata packed A + ST t12, 1 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back + ST t12, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE # fixed pointer + daddiu CO2, CO2, 1 * SIZE + + dsll TEMP, K, BASE_SHIFT # mr=2 + daddu AORIG, AORIG, TEMP # move to next panel Ai + + +.L59: + daddiu KK, KK, -2 # rectangular data length increased by 2 + + + + .align 3 +.L50: + dsra J, N, 2 # J = NC/4 + blez J, .L999 + NOP + +.L10: + dsll TEMP, K, 2 + BASE_SHIFT + dsubu B, B, TEMP # move B to the beginning address of Bj + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP # move C to the beginning address of Cj + + daddiu J, J, -1 + + move CO1, C + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + + move AORIG, A # reset A + + dsra I, M, 2 # I=MC/4 + blez I, .L20 + NOP + + .align 3 +.L11: + dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the retangular data part,also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 16 results 
registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L15 + NOP + + .align 3 + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 # fisrt + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 # second + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + 
MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 # third + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 # fouth + + daddiu L, L, -1 + bgtz L, .L12 + NOP + + + .align 3 + +.L15: + andi L, TEMP, 3 + blez L, .L18 + NOP + .align 3 + +.L16: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 # third + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L16 + NOP + + + .align +.L18: + daddiu TEMP, KK, -4 # deal with the triangular data part + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the trigular data part + + LD b1, 0 * SIZE(AO) # fixed 
results + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + SUB t32, b7, t32 + SUB t42, b8, t42 + + LD b1, 8 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 10 * SIZE(AO) + LD b4, 11 * SIZE(AO) + + SUB t13, b1, t13 + SUB t23, b2, t23 + SUB t33, b3, t33 + SUB t43, b4, t43 + + LD b5, 12 * SIZE(AO) + LD b6, 13 * SIZE(AO) + LD b7, 14 * SIZE(AO) + LD b8, 15 * SIZE(AO) + + SUB t14, b5, t14 + SUB t24, b6, t24 + SUB t34, b7, t34 + SUB t44, b8, t44 + + + LD b1, 15 * SIZE(BO) + LD b2, 14 * SIZE(BO) + LD b3, 13 * SIZE(BO) + LD b4, 12 * SIZE(BO) + MUL t14, b1, t14 + MUL t24, b1, t24 + MUL t34, b1, t34 + MUL t44, b1, t44 + NMSUB t13, t13, b2, t14 + NMSUB t23, t23, b2, t24 + NMSUB t33, t33, b2, t34 + NMSUB t43, t43, b2, t44 + NMSUB t12, t12, b3, t14 + NMSUB t22, t22, b3, t24 + NMSUB t32, t32, b3, t34 + NMSUB t42, t42, b3, t44 + NMSUB t11, t11, b4, t14 + NMSUB t21, t21, b4, t24 + NMSUB t31, t31, b4, t34 + NMSUB t41, t41, b4, t44 + + + LD b5, 10 * SIZE(BO) + LD b6, 9 * SIZE(BO) + LD b7, 8 * SIZE(BO) + MUL t13, b5, t13 + MUL t23, b5, t23 + MUL t33, b5, t33 + MUL t43, b5, t43 + NMSUB t12, t12, b6, t13 + NMSUB t22, t22, b6, t23 + NMSUB t32, t32, b6, t33 + NMSUB t42, t42, b6, t43 + NMSUB t11, t11, b7, t13 + NMSUB t21, t21, b7, t23 + NMSUB t31, t31, b7, t33 + NMSUB t41, t41, b7, t43 + + + LD b8, 5 * SIZE(BO) + LD b1, 4 * SIZE(BO) + MUL t12, b8, t12 + MUL t22, b8, t22 + MUL t32, b8, t32 + MUL t42, b8, t42 + NMSUB t11, t11, b1, t12 + NMSUB t21, t21, b1, t22 + NMSUB t31, t31, b1, t32 + NMSUB t41, t41, b1, t42 + + + LD b2, 0 * SIZE(BO) + MUL t11, b2, t11 + MUL t21, b2, t21 + MUL t31, b2, t31 + MUL t41, b2, t41 + + + ST t11, 0 * SIZE(AO) # updata packed A + ST t21, 1 * SIZE(AO) + ST t31, 2 * SIZE(AO) + ST t41, 3 * SIZE(AO) + + ST t12, 4 * SIZE(AO) + ST t22, 5 * SIZE(AO) + ST 
t32, 6 * SIZE(AO) + ST t42, 7 * SIZE(AO) + + ST t13, 8 * SIZE(AO) + ST t23, 9 * SIZE(AO) + ST t33, 10 * SIZE(AO) + ST t43, 11 * SIZE(AO) + + ST t14, 12 * SIZE(AO) + ST t24, 13 * SIZE(AO) + ST t34, 14 * SIZE(AO) + ST t44, 15 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t33, 2 * SIZE(CO3) + ST t43, 3 * SIZE(CO3) + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE # fixed pointer + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + + dsll TEMP, K, 2 + BASE_SHIFT + daddu AORIG, AORIG, TEMP # move to next panel Ai + + daddiu I, I, -1 + bgtz I, .L11 + NOP + + .align 3 +.L20: + andi I, M, 2 # mr=2 + blez I, .L40 + NOP + + dsll L, KK, 1 + BASE_SHIFT # mr=2 + dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the retangular data part,also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 8 results registers + MOV t21, t11 + MOV t12, t11 + MOV t22, t11 + MOV t13, t11 + MOV t23, t11 + MOV t14, t11 + MOV t24, t11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L25 + NOP + + .align 3 + +.L22: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * 
SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 + MADD t21, t21, a4, b1 + + MADD t12, t12, a3, b2 + MADD t22, t22, a4, b2 + + MADD t13, t13, a3, b3 + MADD t23, t23, a4, b3 + + MADD t14, t14, a3, b4 + MADD t24, t24, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 + MADD t21, t21, a8, b5 + + MADD t12, t12, a7, b6 + MADD t22, t22, a8, b6 + + MADD t13, t13, a7, b7 + MADD t23, t23, a8, b7 + + MADD t14, t14, a7, b8 + MADD t24, t24, a8, b8 + + daddiu L, L, -1 + bgtz L, .L22 + NOP + + + .align 3 + +.L25: + andi L, TEMP, 3 + blez L, .L28 + NOP + .align 3 + +.L26: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L26 + NOP + + + .align +.L28: + daddiu TEMP, KK, -4 # deal with the triangular data part + dsll L, TEMP, 1 + BASE_SHIFT # mr=2 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the trigular data part + + LD b1, 0 * SIZE(AO) # fixed results + LD b2, 1 * SIZE(AO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + + LD b5, 2 * SIZE(AO) + LD b6, 3 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + + LD b3, 
4 * SIZE(AO) + LD b4, 5 * SIZE(AO) + + SUB t13, b3, t13 + SUB t23, b4, t23 + + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t14, b7, t14 + SUB t24, b8, t24 + + + LD b1, 15 * SIZE(BO) + LD b2, 14 * SIZE(BO) + LD b3, 13 * SIZE(BO) + LD b4, 12 * SIZE(BO) + MUL t14, b1, t14 + MUL t24, b1, t24 + NMSUB t13, t13, b2, t14 + NMSUB t23, t23, b2, t24 + NMSUB t12, t12, b3, t14 + NMSUB t22, t22, b3, t24 + NMSUB t11, t11, b4, t14 + NMSUB t21, t21, b4, t24 + + + LD b5, 10 * SIZE(BO) + LD b6, 9 * SIZE(BO) + LD b7, 8 * SIZE(BO) + MUL t13, b5, t13 + MUL t23, b5, t23 + NMSUB t12, t12, b6, t13 + NMSUB t22, t22, b6, t23 + NMSUB t11, t11, b7, t13 + NMSUB t21, t21, b7, t23 + + + LD b8, 5 * SIZE(BO) + LD b1, 4 * SIZE(BO) + MUL t12, b8, t12 + MUL t22, b8, t22 + NMSUB t11, t11, b1, t12 + NMSUB t21, t21, b1, t22 + + + LD b2, 0 * SIZE(BO) + MUL t11, b2, t11 + MUL t21, b2, t21 + + + ST t11, 0 * SIZE(AO) # updata packed A + ST t21, 1 * SIZE(AO) + + ST t12, 2 * SIZE(AO) + ST t22, 3 * SIZE(AO) + + ST t13, 4 * SIZE(AO) + ST t23, 5 * SIZE(AO) + + ST t14, 6 * SIZE(AO) + ST t24, 7 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE # fixed pointer + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + + dsll TEMP, K, 1 + BASE_SHIFT # mr=2 + daddu AORIG, AORIG, TEMP # move to next panel Ai + + + .align 3 +.L40: + andi I, M, 1 + blez I, .L29 + NOP + + dsll L, KK, BASE_SHIFT # mr=1 + dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the retangular data part,also reset BO + dsubu TEMP, K, KK # temp = the length of rectangular data part + + MTC $0, t11 # clear 4 results registers + MOV t12, t11 + MOV t13, t11 + MOV t14, t11 + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * 
SIZE(BO) + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L45 + NOP + + .align 3 + +.L42: + LD a5, 1 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + LD a3, 2 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t12, t12, a5, b6 + MADD t13, t13, a5, b7 + MADD t14, t14, a5, b8 + + LD a7, 3 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 + MADD t12, t12, a3, b2 + MADD t13, t13, a3, b3 + MADD t14, t14, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 + MADD t12, t12, a7, b6 + MADD t13, t13, a7, b7 + MADD t14, t14, a7, b8 + + daddiu L, L, -1 + bgtz L, .L42 + NOP + + + .align 3 + +.L45: + andi L, TEMP, 3 + blez L, .L48 + NOP + .align 3 + +.L46: + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L46 + NOP + + + .align +.L48: + daddiu TEMP, KK, -4 # deal with the triangular data part + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP # BO point to the trigular data part + + LD b1, 0 * SIZE(AO) # fixed results + LD b5, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b7, 3 * SIZE(AO) + + SUB t11, b1, t11 + SUB t12, b5, t12 + SUB t13, b3, t13 + SUB t14, b7, t14 + + + LD b1, 15 * SIZE(BO) + LD b2, 14 * SIZE(BO) + LD b3, 13 * SIZE(BO) + LD b4, 12 * SIZE(BO) + MUL t14, 
b1, t14 + NMSUB t13, t13, b2, t14 + NMSUB t12, t12, b3, t14 + NMSUB t11, t11, b4, t14 + + + LD b5, 10 * SIZE(BO) + LD b6, 9 * SIZE(BO) + LD b7, 8 * SIZE(BO) + MUL t13, b5, t13 + NMSUB t12, t12, b6, t13 + NMSUB t11, t11, b7, t13 + + + LD b8, 5 * SIZE(BO) + LD b1, 4 * SIZE(BO) + MUL t12, b8, t12 + NMSUB t11, t11, b1, t12 + + + LD b2, 0 * SIZE(BO) + MUL t11, b2, t11 + + + ST t11, 0 * SIZE(AO) # updata packed A + ST t12, 1 * SIZE(AO) + ST t13, 2 * SIZE(AO) + ST t14, 3 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE # fixed pointer + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + + dsll TEMP, K, BASE_SHIFT # mr=2 + daddu AORIG, AORIG, TEMP # move to next panel Ai + + +.L29: + daddiu KK, KK, -4 # rectangular data part increased by 4 + bgtz J, .L10 + NOP + + + + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE From 03272a606d9e2848ee696f467307d4e8fef5367c Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 13 May 2011 01:21:39 +0800 Subject: [PATCH 15/42] Added the unit test for drotmg. 
--- utest/Makefile | 4 +-- utest/common_utest.h | 2 ++ utest/main.c | 3 ++- utest/test_rotmg.c | 60 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 utest/test_rotmg.c diff --git a/utest/Makefile b/utest/Makefile index 9d512b877..9339d42be 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -5,12 +5,12 @@ include $(TOPDIR)/Makefile.system TARGET=openblas_utest CUNIT_LIB=/usr/local/lib/libcunit.a -OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o +OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o all : run_test $(TARGET): $(OBJS) - $(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB) + $(FC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB) run_test: $(TARGET) ./$(TARGET) diff --git a/utest/common_utest.h b/utest/common_utest.h index 3e9ecb422..f9a14d87d 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -57,4 +57,6 @@ void test_caxpy_inc_0(void); void test_zdotu_n_1(void); void test_zdotu_offset_1(void); +void test_drotmg(void); + #endif diff --git a/utest/main.c b/utest/main.c index f6ecf3cc0..2ec9c7a57 100644 --- a/utest/main.c +++ b/utest/main.c @@ -54,7 +54,8 @@ CU_TestInfo test_level1[]={ {"Testing zdotu with n == 1",test_zdotu_n_1}, {"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1}, - + + {"Testing drotmg",test_drotmg}, CU_TEST_INFO_NULL, }; diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c new file mode 100644 index 000000000..e51e6b299 --- /dev/null +++ b/utest/test_rotmg.c @@ -0,0 +1,60 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+**********************************************************************************/
+
+#include "common_utest.h"
+/* Unit test: drotmg must yield the same d1, d2, x1, y1 and param[] as the reference routine. */
+void test_drotmg(void)
+{
+    double te_d1, tr_d1, te_d2, tr_d2;
+    double te_x1, tr_x1, te_y1, tr_y1;
+    /* drotmg stores the flag plus only the H entries that flag does not imply, */
+    /* so zero both arrays: the loop below must never compare stack garbage.   */
+    double te_param[5] = {0.0}, tr_param[5] = {0.0};
+    int i;
+    te_d1= tr_d1=0.21149573940783739;
+    te_d2= tr_d2=0.046892057172954082;
+    te_x1= tr_x1=-0.42272687517106533;
+    te_y1= tr_y1=0.42211309121921659;
+    /* implementation under test (te_* operands) */
+    BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+    /* reference implementation (tr_* operands, identical inputs) */
+    BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param);
+
+    CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS);
+    CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS);
+    CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS);
+    CU_ASSERT_DOUBLE_EQUAL(te_y1, tr_y1, CHECK_EPS);
+
+    for(i=0; i<5; i++){
+        CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS);
+    }
+}
From 1d605109599db32d15d00e81c9d83b8b2cdf4208 Mon Sep 17 00:00:00 2001
From: Xianyi Zhang
Date: Fri, 13 May 2011 02:19:55 +0800
Subject: [PATCH 16/42] Added the unit testcase for dsdot.
--- common_reference.h | 4 ++++ utest/Makefile | 2 +- utest/common_utest.h | 2 ++ utest/main.c | 2 ++ 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/common_reference.h b/common_reference.h index 04b11f80f..4cc4be4fd 100644 --- a/common_reference.h +++ b/common_reference.h @@ -60,4 +60,8 @@ float _Complex BLASFUNC_REF(cdotc) (blasint *, float *, blasint *, float *, double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *); double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); + +double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); + #endif diff --git a/utest/Makefile b/utest/Makefile index 9339d42be..e7c5f3412 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -5,7 +5,7 @@ include $(TOPDIR)/Makefile.system TARGET=openblas_utest CUNIT_LIB=/usr/local/lib/libcunit.a -OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o +OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o all : run_test diff --git a/utest/common_utest.h b/utest/common_utest.h index f9a14d87d..1332ef6ab 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -59,4 +59,6 @@ void test_zdotu_offset_1(void); void test_drotmg(void); +void test_dsdot_n_1(void); + #endif diff --git a/utest/main.c b/utest/main.c index 2ec9c7a57..135709507 100644 --- a/utest/main.c +++ b/utest/main.c @@ -56,6 +56,8 @@ CU_TestInfo test_level1[]={ {"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1}, {"Testing drotmg",test_drotmg}, + + {"Testing dsdot with n == 1",test_dsdot_n_1}, CU_TEST_INFO_NULL, }; From b206fc7075ad39f5de144a894fe32b8865c243fd Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 13 May 2011 02:34:30 +0800 Subject: [PATCH 17/42] Fixed #28. Convert the result to double precision in the end of dsdot kernel. 
--- Changelog.txt | 3 ++- kernel/Makefile.L1 | 2 +- kernel/x86_64/dot_sse.S | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 2035dbce1..2b5dc3a94 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -19,7 +19,8 @@ common: * Fixed #25 a wrong result of rotmg. x86/x86_64: - * + * Fixed #28 a wrong result of dsdot on x86_64. + MIPS64: * ==================================================================== diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 317f14363..b08664a8e 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -668,7 +668,7 @@ $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S index cc866a9c5..61c481064 100644 --- a/kernel/x86_64/dot_sse.S +++ b/kernel/x86_64/dot_sse.S @@ -1286,6 +1286,10 @@ haddps %xmm0, %xmm0 #endif +#ifdef DSDOT + cvtss2sd %xmm0, %xmm0 +#endif + RESTOREREGISTERS ret From 830a823be18dddc6cf58a824eb12de99e22f76a1 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 13 May 2011 02:41:39 +0800 Subject: [PATCH 18/42] Added missed testing codes for dsdot. 
--- utest/test_dsdot.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 utest/test_dsdot.c diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c new file mode 100644 index 000000000..8df7380be --- /dev/null +++ b/utest/test_dsdot.c @@ -0,0 +1,50 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include "common_utest.h" + +void test_dsdot_n_1() /* single-element dsdot: the OpenBLAS result must match the reference BLAS result */ +{ + float x= 0.172555164; + float y= -0.0138700781; + blasint incx=1; /* blasint, not int: the dsdot prototypes take blasint*, and blasint may be wider than int */ + blasint incy=1; + blasint n=1; + + double res1=0.0f, res2=0.0f; + + res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); + res2=BLASFUNC_REF(dsdot)(&n, &x, &incx, &y, &incy); + + CU_ASSERT_DOUBLE_EQUAL(res1, res2, CHECK_EPS); + +} From a9320f896eeea7e863dab9b300ab90bcaba0fcc3 Mon Sep 17 00:00:00 2001 From: traz Date: Sat, 14 May 2011 22:00:57 +0000 Subject: [PATCH 19/42] Fixed #25 dtrmm and dtrsm computational error on Loongson3A. --- kernel/mips64/gemm_kernel_loongson3a.S | 217 +++++++++++++++---------- 1 file changed, 133 insertions(+), 84 deletions(-) diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S index 9df66c0d7..77b2b51ff 100644 --- a/kernel/mips64/gemm_kernel_loongson3a.S +++ b/kernel/mips64/gemm_kernel_loongson3a.S @@ -7,6 +7,8 @@ #define ASSEMBLER #include "common.h" + + #define M $4 #define N $5 #define K $6 @@ -429,7 +431,7 @@ .L15: # N=4 M=4 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else andi K,TEMP, 2 #endif @@ -693,7 +695,7 @@ .L14_M2: - and M,MCO,2 # Remainder M = 2 + andi M,MCO,2 # Remainder M = 2 beqz M,.L14_M1 nop @@ -824,9 +826,9 @@ .L25: # N=4 M=2 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else - and K,TEMP,2 + andi K,TEMP,2 #endif beqz K,.L28 nop @@ -867,9 +869,9 @@ .L28: # N=4, M=2, K=1 #ifndef TRMMKERNEL - and K,KCO,1 + andi K,KCO,1 #else - and K,TEMP,1 + andi K,TEMP,1 #endif beqz K,.L29 # LD ALPHA,152($sp) # Get ALPHA @@ -917,7 +919,6 @@ MADD t24,c24,t24,ALPHA ST t13,0(CO3) - move B,BO # Reset B ST t23,1*SIZE(CO3) daddu CO1,CO1,2*SIZE # COx += 2*8Byte @@ -985,7 +986,7 @@ .L14_M1: - and M,MCO,1 # Remainder M = 1 + andi M,MCO,1 # Remainder M = 1 beqz M,.L0_N4_Loop # M = 0, finishing one panel B nop @@ -1001,7 +1002,8 @@ daddu B,BO,TEMP 
#endif - gsLQC1(R8,F1,F0,0) + LD a0, 0 * SIZE(A) +# gsLQC1(R8,F1,F0,0) gsLQC1(R9,F9,F8,0) #b0,b1 MTC $0,t11 gsLQC1(R9,F11,F10,1) #b2,b3 @@ -1019,9 +1021,11 @@ beqz K,.L35 MOV t14,t11 -#else +#else + # gemm move B,BO - gsLQC1(R8,F1,F0,0) + LD a0, 0 * SIZE(A) +# gsLQC1(R8,F1,F0,0) dsra K,KCO,2 # K=KCO/2 gsLQC1(R9,F9,F8,0) #b0,b1 MTC $0,t11 @@ -1034,7 +1038,8 @@ #endif .L31: # N=4 m=1,=K=4 - gsLQC1(R8,F3,F2,1) +# gsLQC1(R8,F3,F2,1) + LD a1, 1*SIZE(A) gsLQC1(R9,F13,F12,2) # R9=B MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 @@ -1042,7 +1047,8 @@ gsLQC1(R9,F15,F14,3) MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 - + + LD a2, 2*SIZE(A) gsLQC1(R9,F9,F8,4) MADD t11,t11,a1,b4 MADD t12,t12,a1,b5 @@ -1051,18 +1057,21 @@ MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 daddiu K,K,-1 - + + LD a3, 3*SIZE(A) gsLQC1(R9,F13,F12,6) MADD t11,t11,a2,b0 MADD t12,t12,a2,b1 - daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=8*SIZE gsLQC1(R9,F15,F14,7) MADD t13,t13,a2,b2 MADD t14,t14,a2,b3 + + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=8*SIZE daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE - gsLQC1(R8,F1,F0,0) +# gsLQC1(R8,F1,F0,0) + LD a0, 0*SIZE(A) gsLQC1(R9,F9,F8,0) MADD t11,t11,a3,b4 MADD t12,t12,a3,b5 @@ -1074,14 +1083,15 @@ .L35: # N=4 M=1 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else - and K,TEMP,2 + andi K,TEMP,2 #endif beqz K,.L38 nop .L36: + LD a1,1*SIZE(A) gsLQC1(R9,F13,F12,2) # R9=B MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 @@ -1095,7 +1105,6 @@ .L37: LD a0,0(A) - gsLQC1(R9,F9,F8,0) MADD t11,t11,a1,b4 MADD t12,t12,a1,b5 @@ -1106,7 +1115,7 @@ .L38: # N=4, M=1, K=1 #ifndef TRMMKERNEL - and K,KCO,1 + andi K,KCO,1 #else andi K,TEMP,1 #endif @@ -1182,7 +1191,7 @@ .align 5 .L0_N2: - and N,NCO,2 # Remainder N = 2 + andi N,NCO,2 # Remainder N = 2 beqz N,.L0_N1 # N=0,NCO<2 nop @@ -1336,7 +1345,7 @@ .L45: # N=2 M=4 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else andi K,TEMP,2 #endif @@ -1383,7 +1392,7 @@ .L48: # N=2, M=4, K=1 #ifndef TRMMKERNEL - and K,KCO,1 
+ andi K,KCO,1 #else andi K,TEMP,1 #endif @@ -1497,7 +1506,7 @@ #endif .L12_M2: - and M,MCO,2 # Remainder M = 2 + andi M,MCO,2 # Remainder M = 2 beqz M,.L12_M1 nop @@ -1585,7 +1594,7 @@ .L55: # N=2 M=2 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else andi K,TEMP,2 #endif @@ -1616,9 +1625,9 @@ .L58: # N=2, M=2, K=1 #ifndef TRMMKERNEL - and K,KCO,1 + andi K,KCO,1 #else - and K, TEMP, 1 + andi K, TEMP, 1 #endif beqz K,.L59 # LD ALPHA,152($sp) # Get ALPHA @@ -1695,7 +1704,7 @@ .L12_M1: - and M,MCO,1 # Remainder M = 1 + andi M,MCO,1 # Remainder M = 1 beqz M,.L0_N2_Loop # M = 0, finishing one panel B nop @@ -1711,8 +1720,8 @@ daddu B, BO, TEMP #endif MTC $0,t11 - gsLQC1(R8,F4,F0,0) - +#gsLQC1(R8,F4,F0,0) + LD a0, 0*SIZE(A) MOV t21,t11 MOV t12,t11 gsLQC1(R9,F9,F8,0) #b0,b1 @@ -1733,8 +1742,8 @@ dsra K,KCO,2 # K=KCO/2 MTC $0,t11 move B,BO # Reset B - gsLQC1(R8,F4,F0,0) - +# gsLQC1(R8,F4,F0,0) + LD a0,0*SIZE(A) MOV t21,t11 MOV t12,t11 gsLQC1(R9,F9,F8,0) #b0,b1 @@ -1745,23 +1754,27 @@ #endif .L61: # N=2 m=1,=K=4 + LD a4, 1*SIZE(A) gsLQC1(R9,F13,F12,1) # R9=B MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 + LD a2, 2*SIZE(A) gsLQC1(R9,F11,F10,2) MADD t11,t11,a4,b4 MADD t12,t12,a4,b5 - daddiu K,K,-1 - gsLQC1(R8,F6,F2,1) +# gsLQC1(R8,F6,F2,1) + LD a6, 3*SIZE(A) MADD t11,t11,a2,b2 + MADD t12,t12,a2,b3 + daddiu K,K,-1 gsLQC1(R9,F15,F14,3) - MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 +# gsLQC1(R8,F4,F0,0) - gsLQC1(R8,F4,F0,0) + LD a0, 0*SIZE(A) daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE gsLQC1(R9,F9,F8,0) @@ -1771,16 +1784,18 @@ .L65: # N=2 M=1 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else - and K,TEMP,2 + andi K,TEMP,2 #endif beqz K,.L68 nop .L66: - gsLQC1(R9,F13,F12,1) # R9=B + LD a4, 1*SIZE(A) MADD t11,t11,a0,b0 + + gsLQC1(R9,F13,F12,1) # R9=B MADD t12,t12,a0,b1 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 daddu B,B,4*SIZE @@ -1794,9 +1809,9 @@ .L68: # N=2, M=1, K=1 #ifndef TRMMKERNEL - and K,KCO,1 
+ andi K,KCO,1 #else - and K,TEMP,1 + andi K,TEMP,1 #endif beqz K,.L69 # LD ALPHA,152($sp) # Get ALPHA @@ -1862,7 +1877,7 @@ .align 5 .L0_N1: - and N,NCO,1 # Remainder N = 1 + andi N,NCO,1 # Remainder N = 1 beqz N,.L999 # N=0,NCO<1 nop @@ -1889,7 +1904,8 @@ daddu A, A, K daddu B, BO, TEMP #endif - gsLQC1(R9,F12,F8,0) +# gsLQC1(R9,F12,F8,0) + LD b0, 0*SIZE(B) MTC $0,t11 gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 @@ -1908,7 +1924,8 @@ #else move B, BO dsra K,KCO,2 # K=KCO/2 - gsLQC1(R9,F12,F8,0) +# gsLQC1(R9,F12,F8,0) + LD b0, 0*SIZE(B) MTC $0,t11 gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 @@ -1925,17 +1942,19 @@ MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 + LD b4, 1*SIZE(B) FETCH $0,(PREA) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 .L72: - gsLQC1(R9,F14,F10,1) +# gsLQC1(R9,F14,F10,1) gsLQC1(R8,F1,F0,4) gsLQC1(R8,F3,F2,5) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 + LD b2, 2*SIZE(B) FETCH $0,4*SIZE(PREA) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 @@ -1944,24 +1963,28 @@ gsLQC1(R8,F5,F4,6) gsLQC1(R8,F7,F6,7) MADD t11,t11,a0,b2 + + LD b6, 3*SIZE(B) MADD t21,t21,a1,b2 - daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE FETCH $0,8*SIZE(PREA) MADD t31,t31,a2,b2 MADD t41,t41,a3,b2 - daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 .L74: - gsLQC1(R9,F12,F8,0) +# gsLQC1(R9,F12,F8,0) gsLQC1(R8,F1,F0,0) daddu PREA,PREA,16*SIZE gsLQC1(R8,F3,F2,1) MADD t11,t11,a4,b6 MADD t21,t21,a5,b6 + + LD b0, 0*SIZE(B) daddiu K,K,-1 - FETCH $0,-32(PREA) + MADD t31,t31,a6,b6 bnez K,.L71 MADD t41,t41,a7,b6 @@ -1969,9 +1992,9 @@ .L75: # N=2 M=4 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else - and K,TEMP,2 + andi K,TEMP,2 #endif beqz K,.L78 nop @@ -1981,20 +2004,21 @@ gsLQC1(R8,F7,F6,3) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + LD b4, 1*SIZE(B) FETCH $0,0(PREA) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 - daddu 
A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 .L77: - LD b0,0(B) gsLQC1(R8,F1,F0,0) gsLQC1(R8,F3,F2,1) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 + LD b0,0(B) FETCH $0,4*SIZE(PREA) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 @@ -2004,9 +2028,9 @@ .L78: # N=2, M=4, K=1 #ifndef TRMMKERNEL - and K,KCO,1 + andi K,KCO,1 #else - and K,TEMP,1 + andi K,TEMP,1 #endif beqz K,.L79 # LD ALPHA,152($sp) # Get ALPHA @@ -2084,7 +2108,7 @@ .L11_M2: - and M,MCO,2 # Remainder M = 2 + andi M,MCO,2 # Remainder M = 2 beqz M,.L11_M1 nop @@ -2100,7 +2124,8 @@ daddu B, BO, TEMP #endif - gsLQC1(R9,F12,F8,0) +# gsLQC1(R9,F12,F8,0) + LD b0, 0*SIZE(B) MTC $0,t11 gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 @@ -2117,7 +2142,8 @@ #else move B, BO dsra K,KCO,2 # K=KCO/2 - gsLQC1(R9,F12,F8,0) +# gsLQC1(R9,F12,F8,0) + LD b0, 0*SIZE(B) MTC $0,t11 gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 @@ -2126,34 +2152,39 @@ #endif .L81: # N=1,M=2,K=4 + LD b4, 1*SIZE(B) gsLQC1(R8,F5,F4,1) # R8=A MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 + LD b2, 2*SIZE(B) gsLQC1(R8,F3,F2,2) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 - - gsLQC1(R9,F14,F10,1) - daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 +# gsLQC1(R9,F14,F10,1) + + LD b6, 3*SIZE(B) gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b2 + MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 - gsLQC1(R9,F12,F8,0) - daddiu K,K,-1 - +# gsLQC1(R9,F12,F8,0) gsLQC1(R8,F1,F0,0) + daddiu K,K,-1 MADD t11,t11,a6,b6 + + LD b0, 0*SIZE(B) bnez K,.L81 MADD t21,t21,a7,b6 .L85: # N=2 M=4 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else andi K,TEMP,2 #endif @@ -2163,21 +2194,22 @@ .L86: gsLQC1(R8,F5,F4,1) # R8=A + LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 - daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 - - LD b0,0(B) daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 gsLQC1(R8,F1,F0,0) + LD b0,0(B) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 .L88: # N=2, M=4, 
K=1 #ifndef TRMMKERNEL - and K,KCO,1 + andi K,KCO,1 #else andi K,TEMP,1 #endif @@ -2236,7 +2268,7 @@ .L11_M1: - and M,MCO,1 # Remainder M = 1 + andi M,MCO,1 # Remainder M = 1 beqz M,.L999 # M = 0, End nop @@ -2251,9 +2283,11 @@ daddu A, A, K daddu B, BO, TEMP #endif - gsLQC1(R8,F4,F0,0) +# gsLQC1(R8,F4,F0,0) MTC $0,t11 - gsLQC1(R9,F12,F8,0) +# gsLQC1(R9,F12,F8,0) + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -2268,33 +2302,45 @@ #else move B, BO dsra K,KCO,2 # K=KCO/2 - gsLQC1(R8,F4,F0,0) - gsLQC1(R9,F12,F8,0) +# gsLQC1(R8,F4,F0,0) +# gsLQC1(R9,F12,F8,0) + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) beqz K,.L95 MTC $0,t11 #endif .L91: # N=1,M=1,K=4 - gsLQC1(R8,F6,F2,1) +# gsLQC1(R8,F6,F2,1) + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - gsLQC1(R9,F14,F10,1) +# gsLQC1(R9,F14,F10,1) + LD a2, 2*SIZE(A) + LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 - daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 - gsLQC1(R8,F4,F0,0) +# gsLQC1(R8,F4,F0,0) + LD a6, 3*SIZE(A) + LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 - daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 - gsLQC1(R9,F12,F8,0) + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) +# gsLQC1(R9,F12,F8,0) MADD t11,t11,a6,b6 + daddiu K,K,-1 bnez K,.L91 nop .L95: # N=2 M=4 K=2 #ifndef TRMMKERNEL - and K,KCO,2 # k = KCO&2 + andi K,KCO,2 # k = KCO&2 #else andi K,TEMP,2 #endif @@ -2302,18 +2348,21 @@ nop .L96: + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - MADD t11,t11,a4,b4 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 LD b0,0(B) LD a0,0(A) + MADD t11,t11,a4,b4 + .L98: # N=2, M=4, K=1 #ifndef TRMMKERNEL - and K,KCO,1 + andi K,KCO,1 #else andi K,TEMP,1 #endif From fcb5ce011b7fd1aea67d47d6e313dbe19547263f Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 17 May 2011 21:24:00 +0000 Subject: [PATCH 20/42] Fixed 
#28. Convert the result to double precision in MIPS64 dsdot_k kernel. --- Changelog.txt | 5 +++-- interface/dsdot.c | 11 +++++++---- kernel/mips64/dot.S | 8 ++++++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 2b5dc3a94..cd1b4c3ef 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,7 +1,7 @@ OpenBLAS ChangeLog ==================================================================== Version 0.1 alpha2(in development) - +0;136;0c common: * Fixed blasint undefined bug in file. Other software could include this header successfully(Refs issue #13 on github) @@ -22,7 +22,8 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. MIPS64: - * + * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. + ==================================================================== Version 0.1 alpha1 20-Mar-2011 diff --git a/interface/dsdot.c b/interface/dsdot.c index 66f7917d5..94237e0c4 100644 --- a/interface/dsdot.c +++ b/interface/dsdot.c @@ -49,6 +49,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; + double ret = 0.0; PRINT_DEBUG_NAME; @@ -61,19 +62,21 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; - return DSDOT_K(n, x, incx, y, incy); + ret=DSDOT_K(n, x, incx, y, incy); FUNCTION_PROFILE_END(1, n, n); IDEBUG_END; - return 0; + return ret; } #else double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ + + double ret = 0.0; PRINT_DEBUG_CNAME; @@ -86,13 +89,13 @@ double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; - return DSDOT_K(n, x, incx, y, incy); + ret=DSDOT_K(n, x, incx, y, incy); FUNCTION_PROFILE_END(1, n, n); IDEBUG_END; - return 0; + return ret; } diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S index 
b1f599172..6220b6ac9 100644 --- a/kernel/mips64/dot.S +++ b/kernel/mips64/dot.S @@ -300,7 +300,11 @@ .align 3 .L999: - j $31 ADD s1, s1, s2 - +#ifdef DSDOT + cvt.d.s s1, s1 +#endif + j $31 + NOP + EPILOGUE From 5ca4e51df04e01dec47afe1a5c02c28f2f1547b7 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 18 May 2011 10:54:51 +0000 Subject: [PATCH 21/42] Remove the useless code, modify code comments and format. --- kernel/mips64/gemm_kernel_loongson3a.S | 1126 ++++++++++++------------ 1 file changed, 553 insertions(+), 573 deletions(-) diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S index 77b2b51ff..3e95a3ed4 100644 --- a/kernel/mips64/gemm_kernel_loongson3a.S +++ b/kernel/mips64/gemm_kernel_loongson3a.S @@ -1,13 +1,9 @@ -#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) -#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) -#define FETCH ld - #define REALNAME ASMNAME - #define ASSEMBLER #include "common.h" - - +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define M $4 #define N $5 @@ -163,81 +159,78 @@ ST $f23,144($sp) - .align 5 # BACKUP -.L0_N4: # Loop N - ST ALPHA,152($sp) # Backup ALPHA - - move MCO,M # Backup M + .align 5 +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M - move NCO,N # Backup N - move KCO,K # Backup K + move NCO,N # Backup N + move KCO,K # Backup K - move AO,A # Backup A_addr - dsra N,NCO,2 # N=NCO/2 + move AO,A # Backup A_addr + dsra N,NCO,2 # N=NCO/2 dsll LDC,LDC,BASE_SHIFT # LDC*8Byte - dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*NR(4)*8Byte=KC*2^5 + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 - move BO,B # Backup B_addr - #if defined(TRMMKERNEL) - LDARG OFFSET,160($sp) # + LDARG OFFSET,160($sp) # OFFSET is relate to the data 
part #endif #if defined(TRMMKERNEL) && !defined(LEFT) - neg KK,OFFSET # right + neg KK,OFFSET #endif - beq N,$0,.L0_N2 # N=0,NCO<4 - dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*4mr*8Byte + move BO,B # Backup B_addr + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte -.L0_N4_Lb: - move CO1,C # Set C - dsra M,MCO,2 # M=MCO/2 +.L0_N4_Lb: # mr=4,nr=4 + move CO1,C + dsra M,MCO,2 # M=MCO/2 - move A,AO # Reset A - daddu CO2,CO1,LDC + move A,AO # Reset A + daddu CO2,C,LDC + daddu PREB,BO,SPANB # PreB point next panelB daddu CO3,CO2,LDC - daddu PREB,BO,SPANB # PreB point next panelB - daddu CO4,CO3,LDC daddu PREA,AO,SPANA + daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) - move KK,OFFSET # left + move KK,OFFSET #endif - beqz M,.L14_M2 - daddu C,CO4,LDC + daddu C,CO4,LDC # move C to next panel Cj .L10: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B,BO + move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) #else - dsll K,KK,2 + BASE_SHIFT # KK no data part + dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part dsll TEMP,KK,2 + BASE_SHIFT - daddu A,A,K # move A B to data part + daddu A,A,K # move A B to data part daddu B,BO,TEMP #endif MTC $0,t11 MOV t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 + gsLQC1(R8,F1,F0,0) # a0,a1 MOV t31,t11 MOV t41,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 + gsLQC1(R9,F9,F8,0) # b0,b1 MOV t12,t11 MOV t22,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 + gsLQC1(R8,F3,F2,1) # a2,a3 MOV t32,t11 MOV t42,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 + gsLQC1(R9,F11,F10,1) # b2,b3 MOV t13,t11 MOV t23,t11 @@ -248,63 +241,60 @@ MOV t14,t11 MOV t24,t11 - MOV t34,t11 - MOV t44,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - dsubu TEMP,KCO,KK # temp = kco - kk + dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) - daddiu TEMP, KK, 4 + daddiu TEMP, KK, 4 # S=L,U=L #else - daddiu TEMP, KK, 4 + daddiu TEMP, KK, 4 # 
S=R,U=U,for this two situation KK is the length of the data part #endif - - dsra K,TEMP,2 # K=KCO/2 + dsra K,TEMP,2 # K=KCO/2 + MOV t34,t11 beqz K,.L15 - nop + MOV t44,t11 #else - MTC $0,t11 # gemm part - move B,BO + move B,BO # Reset B + MTC $0,t11 # GEMM part NR=4,MR=4 + gsLQC1(R8,F1,F0,0) # a0,a1 + MOV t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - MOV t31,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + MOV t41,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - MOV t12,t11 + gsLQC1(R8,F3,F2,1) # a2,a3 + MOV t22,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 - MOV t32,t11 + gsLQC1(R9,F11,F10,1) # b2,b3 + MOV t42,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 - - dsra K,KCO,2 # K=KCO/2 + dsra K,KCO,2 # K=KCO/2 + MOV t13,t11 - MOV t23,t11 + MOV t33,t11 - MOV t43,t11 + MOV t14,t11 - MOV t24,t11 - MOV t34,t11 - MOV t44,t11 + MOV t34,t11 beqz K,.L15 - nop + MOV t44,t11 # clear 16 results registers #endif .align 5 -.L11: # N=M=K=4 - gsLQC1(R8,F5,F4,2) # R8=A +.L11: # kr=4 + gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,2) # R9=B + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -329,7 +319,7 @@ MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 - #load2 comp1 + .L12: gsLQC1(R8,F1,F0,4) MADD t11,t11,a4,b4 @@ -377,12 +367,12 @@ gsLQC1(R9,F15,F14,7) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 - daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + daddu A,A,16*SIZE # 4mr*4kr FETCH $0,8*SIZE(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 - daddu B,B,16*SIZE + daddu B,B,16*SIZE # 4nr*4kr MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 @@ -395,7 +385,7 @@ MADD t44,t44,a3,b3 .L14: - gsLQC1(R8,F1,F0,0) + gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 @@ -416,36 +406,34 @@ MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 + FETCH $0,12*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 - FETCH $0,12*SIZE(PREA) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREB,PREB,16*SIZE MADD t34,t34,a6,b7 - daddu PREA,PREA,16*SIZE - bnez K,.L11 MADD t44,t44,a7,b7 + bnez K,.L11 + daddu PREA,PREA,16*SIZE -.L15: # N=4 M=4 K=2 +.L15: # kr=2 #ifndef 
TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 #else andi K,TEMP, 2 #endif - nop - beqz K,.L18 nop .L16: - gsLQC1(R8,F5,F4,2) # R8=A + gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,2) # R9=B + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -456,17 +444,17 @@ gsLQC1(R9,F15,F14,3) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 - daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + daddu A,A,8*SIZE # 4mr*2kr FETCH $0,0(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 - daddu B,B,8*SIZE + daddu B,B,8*SIZE # 4nr*2kr + FETCH $0,0(PREA) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 - FETCH $0,0(PREA) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 @@ -494,37 +482,35 @@ MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 + FETCH $0,4*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 + daddu PREB,PREB,8*SIZE - FETCH $0,4*SIZE(PREA) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 - daddu PREB,PREB,8*SIZE + daddu PREA,PREA,8*SIZE MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 - daddu PREA,PREA,8*SIZE -.L18: # N=4, M=4, K=1 +.L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else - andi K,TEMP, 1 + andi K,TEMP,1 #endif - NOP - - beqz K,.L19 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L19 + LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + daddu A,A,4*SIZE # 4mr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 - daddu B,B,4*SIZE + daddu B,B,4*SIZE # 4nr*kr FETCH $0,0(PREA) MADD t31,t31,a2,b0 @@ -547,10 +533,10 @@ MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 -.L19: # Write Back +.L19: # Write Back to C #ifndef TRMMKERNEL - LD c11,0(CO1) # gemm write part Fetch 16 C - LD c21,1*SIZE(CO1) + LD c11,0(CO1) # GEMM write part + LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -589,7 +575,7 @@ MADD t34,c34,t34,ALPHA ST t41,3*SIZE(CO1) MADD t44,c44,t44,ALPHA - daddiu M,M,-1 # M-- + daddiu M,M,-1 # M-- ST t12,0(CO2) ST t22,1*SIZE(CO2) @@ -612,159 +598,160 @@ FETCH $0,8*SIZE(CO4) ST t14,0(CO4) - daddu CO1,CO1,4*SIZE # COx += 
4*8Byte + daddu CO1,CO1,4*SIZE # COi += 4 ST t24,1*SIZE(CO4) daddu CO2,CO2,4*SIZE ST t34,2*SIZE(CO4) daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB - bnez M,.L10 # M!=0 + + bnez M,.L10 daddu CO4,CO4,4*SIZE #else - MUL t11, ALPHA, t11 + MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 ST t11, 0 * SIZE(CO1) - ST t21, 1 * SIZE(CO1) - ST t31, 2 * SIZE(CO1) - ST t41, 3 * SIZE(CO1) - MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 + ST t31, 2 * SIZE(CO1) MUL t32, ALPHA, t32 + ST t41, 3 * SIZE(CO1) MUL t42, ALPHA, t42 ST t12, 0 * SIZE(CO2) - ST t22, 1 * SIZE(CO2) - ST t32, 2 * SIZE(CO2) - ST t42, 3 * SIZE(CO2) - MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 + ST t32, 2 * SIZE(CO2) MUL t33, ALPHA, t33 + ST t42, 3 * SIZE(CO2) MUL t43, ALPHA, t43 ST t13, 0 * SIZE(CO3) - ST t23, 1 * SIZE(CO3) - ST t33, 2 * SIZE(CO3) - ST t43, 3 * SIZE(CO3) - MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 + ST t33, 2 * SIZE(CO3) MUL t34, ALPHA, t34 + ST t43, 3 * SIZE(CO3) MUL t44, ALPHA, t44 ST t14, 0 * SIZE(CO4) + daddiu M,M,-1 # M-- ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) - - daddiu M,M,-1 # M-- - - daddiu CO4,CO4, 4 * SIZE # trmm part write back - daddiu CO3,CO3, 4 * SIZE - daddiu CO2,CO2, 4 * SIZE daddiu CO1,CO1, 4 * SIZE - + daddiu CO2,CO2, 4 * SIZE + daddiu CO3,CO3, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - dsubu TEMP,KCO,KK + dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else daddiu TEMP,TEMP, -4 #endif - dsll K,TEMP,2 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT - - daddu A,A,K # mov A to the end of panel Ai - daddu B,B,TEMP # mov B to the end of panel Bj + daddu A,A,K # mov A to the end of panel Ai 
+ daddu B,B,TEMP # mov B to the end of panel Bj #endif -#ifdef LEFT # right control by N loop +#ifdef LEFT daddiu KK, KK,4 #endif - bnez M,.L10 # M!=0 + bnez M,.L10 nop #endif - + .align 3 .L14_M2: - andi M,MCO,2 # Remainder M = 2 + andi M, MCO, 2 # nr=4,mr=2 beqz M,.L14_M1 nop .L20: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B,BO + move B,BO # Reset B #else - dsll K,KK,1 + BASE_SHIFT #mr=2 so KK*2 - dsll TEMP,KK,2 + BASE_SHIFT - + dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP #endif MTC $0,t11 MOV t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 + gsLQC1(R8,F1,F0,0) # a0,a1 MOV t12,t11 MOV t22,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 + gsLQC1(R9,F9,F8,0) # b0,b1 MOV t13,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 - MOV t23,t11 - MOV t14,t11 - MOV t24,t11 + gsLQC1(R9,F11,F10,1) # b2,b3 + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) - daddiu TEMP,KK,2 + daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 #else - daddiu TEMP,KK,4 # not sure + daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 #endif dsra K,TEMP,2 + MOV t14,t11 beqz K,.L25 - nop + MOV t24,t11 # clear 2*4=8 results registers #else - move B,BO # gemm part + move B,BO # Reset B MTC $0,t11 + gsLQC1(R8,F1,F0,0) + MOV t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - MOV t12,t11 + gsLQC1(R9,F9,F8,0) + MOV t22,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 + dsra K,KCO,2 + gsLQC1(R9,F11,F10,1) - dsra K,KCO,2 # K=KCO/2 MOV t13,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 - MOV t23,t11 - MOV t14,t11 - MOV t24,t11 + MOV t14,t11 beqz K,.L25 - nop + MOV t24,t11 #endif -.L21: # N=4 m=2,=K=4 - gsLQC1(R8,F5,F4,1) # R8=A +.L21: # nr=4,mr=2,kr=4 + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,2) # R9=B + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -772,51 +759,51 @@ MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 - gsLQC1(R8,F3,F2,2) MADD t14,t14,a0,b3 MADD 
t24,t24,a1,b3 - gsLQC1(R9,F9,F8,4) + gsLQC1(R8,F3,F2,2) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 - gsLQC1(R9,F11,F10,5) + gsLQC1(R9,F9,F8,4) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 - gsLQC1(R8,F7,F6,3) + gsLQC1(R9,F11,F10,5) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 - - gsLQC1(R9,F13,F12,6) - MADD t11,t11,a2,b0 - MADD t21,t21,a3,b0 - daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE - - gsLQC1(R9,F15,F14,7) - MADD t12,t12,a2,b1 - MADD t22,t22,a3,b1 daddiu K,K,-1 - gsLQC1(R8,F1,F0,0) + gsLQC1(R8,F7,F6,3) + MADD t11,t11,a2,b0 + MADD t21,t21,a3,b0 + + gsLQC1(R9,F13,F12,6) + MADD t12,t12,a2,b1 + MADD t22,t22,a3,b1 + + gsLQC1(R9,F15,F14,7) MADD t13,t13,a2,b2 MADD t23,t23,a3,b2 - daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE + daddu A,A,8*SIZE # 2mr*4kr MADD t14,t14,a2,b3 MADD t24,t24,a3,b3 + daddu B,B,16*SIZE # 4nr*4kr - gsLQC1(R9,F9,F8,0) + gsLQC1(R8,F1,F0,0) MADD t11,t11,a6,b4 MADD t21,t21,a7,b4 - gsLQC1(R9,F11,F10,1) + gsLQC1(R9,F9,F8,0) MADD t12,t12,a6,b5 MADD t22,t22,a7,b5 + gsLQC1(R9,F11,F10,1) MADD t13,t13,a6,b6 MADD t23,t23,a7,b6 @@ -824,32 +811,32 @@ bnez K,.L21 MADD t24,t24,a7,b7 -.L25: # N=4 M=2 K=2 +.L25: #ifndef TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 # kr=2 #else - andi K,TEMP,2 + andi K,TEMP,2 #endif beqz K,.L28 nop .L26: - gsLQC1(R8,F5,F4,1) # R8=A + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,2) # R9=B + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 - daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 gsLQC1(R9,F15,F14,3) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 - daddu B,B,8*SIZE + daddu A,A,4*SIZE # 2mr*2kr MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 + daddu B,B,8*SIZE # 4nr*2kr .L27: gsLQC1(R8,F1,F0,0) @@ -867,19 +854,19 @@ MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 -.L28: # N=4, M=2, K=1 +.L28: # kr=1 #ifndef TRMMKERNEL - andi K,KCO,1 + andi K,KCO,1 #else - andi K,TEMP,1 + andi K,TEMP,1 #endif - beqz K,.L29 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L29 + LD ALPHA,152($sp) # Get 
ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 - daddu B,B,4*SIZE + daddu A,A,2*SIZE # 2mr*kr + daddu B,B,4*SIZE # 4nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -890,9 +877,9 @@ MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 -.L29: # Write Back +.L29: # Write Back to C #ifndef TRMMKERNEL - LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c11,0(CO1) # GEMM write back part LD c21,1*SIZE(CO1) LD c12,0(CO2) @@ -919,64 +906,64 @@ MADD t24,c24,t24,ALPHA ST t13,0(CO3) + daddu CO1,CO1,2*SIZE # COi += 2 ST t23,1*SIZE(CO3) - daddu CO1,CO1,2*SIZE # COx += 2*8Byte - - FETCH $0,0(CO1) - FETCH $0,2*SIZE(CO2) - FETCH $0,2*SIZE(CO3) - FETCH $0,2*SIZE(CO4) + daddu CO2,CO2,2*SIZE ST t14,0(CO4) - daddu CO2,CO2,2*SIZE - ST t24,1*SIZE(CO4) daddu CO3,CO3,2*SIZE + ST t24,1*SIZE(CO4) daddu CO4,CO4,2*SIZE + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + #else - MUL t11, ALPHA, t11 + MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 ST t11, 0 * SIZE(CO1) - ST t21, 1 * SIZE(CO1) - MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t12, 0 * SIZE(CO2) - ST t22, 1 * SIZE(CO2) - MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t13, 0 * SIZE(CO3) - ST t23, 1 * SIZE(CO3) - MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) - + daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE daddiu CO4,CO4, 2 * SIZE -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP,-2 #else daddiu TEMP,TEMP,-4 #endif - dsll K,TEMP,1 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT - daddu A,A,K - daddu B,B,TEMP + daddu A,A,K # move A to next panel Ai + daddu B,B,TEMP # move B to next panel Bj #endif #ifdef LEFT @@ 
-985,15 +972,16 @@ #endif + .align 3 .L14_M1: - andi M,MCO,1 # Remainder M = 1 - beqz M,.L0_N4_Loop # M = 0, finishing one panel B + andi M,MCO,1 # mr=1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop .L30: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B,BO + move B,BO # Reset B #else dsll K,KK, 0 + BASE_SHIFT dsll TEMP,KK,2 + BASE_SHIFT @@ -1001,14 +989,15 @@ daddu A,A,K daddu B,BO,TEMP #endif - - LD a0, 0 * SIZE(A) -# gsLQC1(R8,F1,F0,0) - gsLQC1(R9,F9,F8,0) #b0,b1 MTC $0,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 MOV t12,t11 + LD a0, 0 * SIZE(A) # a0 + MOV t13,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t14,t11 # clear result registers + gsLQC1(R9,F11,F10,1) # b2,b3 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK @@ -1018,39 +1007,42 @@ daddiu TEMP, KK, 4 #endif dsra K,TEMP, 2 - + nop beqz K,.L35 - MOV t14,t11 + nop + #else - # gemm - move B,BO - LD a0, 0 * SIZE(A) -# gsLQC1(R8,F1,F0,0) - dsra K,KCO,2 # K=KCO/2 - gsLQC1(R9,F9,F8,0) #b0,b1 + move B,BO # Reset B, GEMM part + dsra K,KCO,2 # K=KCO/2 + LD a0, 0 * SIZE(A) # a0 + MTC $0,t11 - gsLQC1(R9,F11,F10,1) #b2,b3 MOV t12,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + MOV t13,t11 - dsra K,KCO,2 - beqz K,.L35 MOV t14,t11 + gsLQC1(R9,F11,F10,1) # b2,b3 + + beqz K,.L35 + nop #endif -.L31: # N=4 m=1,=K=4 -# gsLQC1(R8,F3,F2,1) - LD a1, 1*SIZE(A) - gsLQC1(R9,F13,F12,2) # R9=B +.L31: # nr=4,mr=1,kr=4 + LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 + + gsLQC1(R9,F13,F12,2) # b4,b5 MADD t12,t12,a0,b1 - gsLQC1(R9,F15,F14,3) + gsLQC1(R9,F15,F14,3) # b6,b7 MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 - LD a2, 2*SIZE(A) - gsLQC1(R9,F9,F8,4) + LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 + + gsLQC1(R9,F9,F8,4) MADD t12,t12,a1,b5 gsLQC1(R9,F11,F10,5) @@ -1058,22 +1050,22 @@ MADD t14,t14,a1,b7 daddiu K,K,-1 - LD a3, 3*SIZE(A) - gsLQC1(R9,F13,F12,6) + LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 + + gsLQC1(R9,F13,F12,6) MADD t12,t12,a2,b1 + daddu 
A,A,4*SIZE # 1mr*4kr gsLQC1(R9,F15,F14,7) MADD t13,t13,a2,b2 MADD t14,t14,a2,b3 - - daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=8*SIZE - daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE + daddu B,B,16*SIZE # 4nr*4kr -# gsLQC1(R8,F1,F0,0) - LD a0, 0*SIZE(A) - gsLQC1(R9,F9,F8,0) + LD a0, 0*SIZE(A) # a0 MADD t11,t11,a3,b4 + + gsLQC1(R9,F9,F8,0) MADD t12,t12,a3,b5 gsLQC1(R9,F11,F10,1) @@ -1081,58 +1073,60 @@ bnez K,.L31 MADD t14,t14,a3,b7 -.L35: # N=4 M=1 K=2 +.L35: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 #else - andi K,TEMP,2 + andi K,TEMP,2 #endif beqz K,.L38 nop .L36: - LD a1,1*SIZE(A) - gsLQC1(R9,F13,F12,2) # R9=B + LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 + + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 - daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + daddu A,A,2*SIZE # mr*2kr gsLQC1(R9,F15,F14,3) MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 - daddu B,B,8*SIZE + daddu B,B,8*SIZE # 4nr*2kr .L37: LD a0,0(A) - gsLQC1(R9,F9,F8,0) MADD t11,t11,a1,b4 + + gsLQC1(R9,F9,F8,0) MADD t12,t12,a1,b5 gsLQC1(R9,F11,F10,1) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 -.L38: # N=4, M=1, K=1 +.L38: # kr=1 #ifndef TRMMKERNEL - andi K,KCO,1 + andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L39 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L39 + LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 - daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu A,A,1*SIZE daddu B,B,4*SIZE MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 -.L39: # Write Back +.L39: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) # Fetch 16 C + LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) @@ -1157,8 +1151,7 @@ ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 @@ -1179,64 +1172,60 @@ #endif -.L0_N4_Loop: - daddiu N,N,-1 # N-- + .align 3 +.L0_N4_Loop: # mc finished + daddiu N,N,-1 # N-- 
#if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK,4 #endif - bnez N,.L0_N4_Lb # N!=0 - move BO,B # Set B - - + bnez N,.L0_N4_Lb + move BO,B # Set BO point to next panel Bj .align 5 .L0_N2: - andi N,NCO,2 # Remainder N = 2 - beqz N,.L0_N1 # N=0,NCO<2 + andi N,NCO,2 # nr = 2 + beqz N,.L0_N1 nop .L0_N2_Lb: - move CO1,C # Set C - dsra M,MCO,2 # M=MCO/2 + move CO1,C + daddu CO2,C,LDC + + dsra M,MCO,2 + move A,AO # Reset A + + daddu PREA,AO,SPANA + daddu C,CO2,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif - - dsll SPANB,KCO,1+BASE_SHIFT # SPANB=KC*NR(2)*8Byte=KC*16=KC*2^4 - move A,AO # Reset A - - daddu CO2,CO1,LDC - daddu PREA,AO,SPANA beqz M,.L12_M2 - daddu C,CO2,LDC + nop .L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B,BO + move B,BO # Reset B #else - dsll K,KK, 2 + BASE_SHIFT # mr=4 - dsll TEMP, KK,1 + BASE_SHIFT # nr=2 + dsll K,KK, 2 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif MTC $0,t11 MOV t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 + gsLQC1(R8,F1,F0,0) # a0,a1 MOV t31,t11 MOV t41,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 + gsLQC1(R9,F9,F8,0) # b0,b1 MOV t12,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 - MOV t22,t11 - MOV t32,t11 + gsLQC1(R8,F3,F2,1) # a2,a3 - MOV t42,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) @@ -1244,37 +1233,38 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 # K=KCO/2 - beqz K,.L45 - nop -#else - move B,BO - MTC $0,t11 # gemm part - MOV t21,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 - - MOV t31,t11 - MOV t41,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - - dsra K,KCO,2 # K=KCO/2 - MOV t12,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 - - MOV t22,t11 + dsra K,TEMP,2 MOV t32,t11 - - MOV t42,t11 beqz K,.L45 - nop + MOV t42,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # gemm part + gsLQC1(R8,F1,F0,0) # a0,a1 + + MOV t21,t11 + MOV t31,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t41,t11 + dsra K,KCO,2 # 
K=KCO/2 + gsLQC1(R8,F3,F2,1) # a2,a3 + + MOV t12,t11 + MOV t22,t11 + + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 #endif -.L41: # N=2,M=K=4 - gsLQC1(R8,F5,F4,2) # R8=A +.L41: # nr=2,mr=kr=4 + gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,1) # R9=B + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -1315,12 +1305,12 @@ gsLQC1(R8,F7,F6,7) MADD t31,t31,a2,b2 MADD t41,t41,a3,b2 - daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + daddu B,B,8*SIZE # 2nr*4kr FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 MADD t42,t42,a3,b3 - daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + daddu A,A,16*SIZE # 4mr*4kr .L44: gsLQC1(R8,F1,F0,0) @@ -1343,9 +1333,9 @@ MADD t42,t42,a7,b7 -.L45: # N=2 M=4 K=2 +.L45: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 #else andi K,TEMP,2 #endif @@ -1353,23 +1343,23 @@ nop .L46: - gsLQC1(R8,F5,F4,2) # R8=A + gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,1) # R9=B + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 - daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 FETCH $0,0(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 - daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L47: gsLQC1(R8,F1,F0,0) @@ -1390,19 +1380,19 @@ daddu PREA,PREA,8*SIZE -.L48: # N=2, M=4, K=1 +.L48: # kr=1 #ifndef TRMMKERNEL - andi K,KCO,1 + andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L49 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L49 + LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -1415,9 +1405,9 @@ MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 -.L49: # Write Back +.L49: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c11,0(CO1) # gemm write 
back part Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -1439,7 +1429,7 @@ MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA - daddiu M,M,-1 # M-- + daddiu M,M,-1 ST t12,0(CO2) ST t22,1*SIZE(CO2) @@ -1448,48 +1438,49 @@ FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) - FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) - daddu CO1,CO1,4*SIZE # COx += 4*8Byte - bnez M,.L40 # M!=0 + daddu CO1,CO1,4*SIZE + bnez M,.L40 daddu CO2,CO2,4*SIZE + #else - daddiu M,M,-1 - - daddiu CO1,CO1, 4*SIZE - daddiu CO2,CO2, 4*SIZE - MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 + ST t21, 1 * SIZE(CO1) MUL t32, ALPHA, t32 + ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 - - ST t11, -4 * SIZE(CO1) - ST t21, -3 * SIZE(CO1) - ST t31, -2 * SIZE(CO1) - ST t41, -1 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) - ST t12, -4 * SIZE(CO2) - ST t22, -3 * SIZE(CO2) - ST t32, -2 * SIZE(CO2) - ST t42, -1 * SIZE(CO2) + ST t12, 0 * SIZE(CO2) + daddiu M,M,-1 + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,4(CO1) + FETCH $0,4(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -2 #endif - dsll K,TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT @@ -1500,13 +1491,14 @@ #ifdef LEFT daddiu KK, KK, 4 #endif - bnez M,.L40 nop #endif + + .align 3 .L12_M2: - andi M,MCO,2 # Remainder M = 2 + andi M,MCO,2 # mr = 2 beqz M,.L12_M1 nop @@ -1522,13 +1514,10 @@ daddu B, BO, TEMP #endif MTC $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 + gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 - MOV t12,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - - MOV t22,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 #if (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -1536,31 +1525,31 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 # K=KCO/2 + dsra K,TEMP,2 + MOV t12,t11 beqz K,.L55 - nop + MOV t22,t11 #else move B,BO - dsra K,KCO,2 # K=KCO/2 - MTC $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 + dsra K,KCO,2 # K=KCO/2 + gsLQC1(R8,F1,F0,0) #a0,a1 + MTC $0,t11 MOV t21,t11 - MOV t12,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 + gsLQC1(R9,F9,F8,0) #b0,b1 - MOV t22,t11 + MOV t12,t11 beqz K,.L55 - nop + MOV t22,t11 #endif -.L51: # N=2 m=2,=K=4 - gsLQC1(R8,F5,F4,1) # R8=A +.L51: # nr=2 mr=2,kr=4 + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,1) # R9=B + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -1576,12 +1565,12 @@ gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b2 MADD t21,t21,a3,b2 - daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE gsLQC1(R9,F15,F14,3) MADD t12,t12,a2,b3 MADD t22,t22,a3,b3 - daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE gsLQC1(R8,F1,F0,0) MADD t11,t11,a6,b6 @@ -1592,26 +1581,25 @@ bnez K,.L51 MADD t22,t22,a7,b7 -.L55: # N=2 M=2 K=2 +.L55: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 #else andi K,TEMP,2 #endif - NOP beqz K,.L58 nop .L56: - gsLQC1(R8,F5,F4,1) # R8=A + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 - gsLQC1(R9,F13,F12,1) # R9=B + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 - daddu B,B,4*SIZE + daddu B,B,4*SIZE # 2nr*2kr .L57: gsLQC1(R8,F1,F0,0) @@ -1623,27 +1611,27 @@ MADD t22,t22,a5,b5 -.L58: # N=2, M=2, K=1 +.L58: # kr=1 #ifndef TRMMKERNEL - andi K,KCO,1 + andi K,KCO,1 #else - andi K, TEMP, 1 + andi K,TEMP, 1 #endif - beqz K,.L59 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L59 + LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - daddu A,A,2*SIZE # 
A+=2(mr)*1(kr)*8Byte=16 - daddu B,B,2*SIZE + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE # 2nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 -.L59: # Write Back +.L59: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) # write gemm part back Fetch 16 C + LD c11,0(CO1) # write gemm part back Fetch 16 C LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) @@ -1658,17 +1646,15 @@ ST t12,0(CO2) ST t22,1*SIZE(CO2) - daddu CO1,CO1,2*SIZE # COx += 2*8Byte + daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) #else daddiu M, M, -1 - daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE - MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t12, ALPHA, t12 @@ -1679,6 +1665,9 @@ ST t12, -2 * SIZE(CO2) ST t22, -1 * SIZE(CO2) + FETCH $0,0(CO1) + FETCH $0,0(CO2) + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT @@ -1697,21 +1686,19 @@ #ifdef LEFT daddiu KK, KK, 2 #endif - FETCH $0,0(CO1) - FETCH $0,0(CO2) - #endif + .align 3 .L12_M1: - andi M,MCO,1 # Remainder M = 1 - beqz M,.L0_N2_Loop # M = 0, finishing one panel B + andi M,MCO,1 # mr = 1 + beqz M,.L0_N2_Loop nop .L60: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B,BO + move B,BO # Reset B #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT @@ -1720,13 +1707,11 @@ daddu B, BO, TEMP #endif MTC $0,t11 -#gsLQC1(R8,F4,F0,0) - LD a0, 0*SIZE(A) - MOV t21,t11 - MOV t12,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 + LD a0, 0*SIZE(A) # a0 + + MOV t21,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 - MOV t22,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -1734,87 +1719,88 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 # K=KCO/2 + dsra K,TEMP,2 + MOV t12,t11 beqz K,.L65 - nop + MOV t22,t11 #else - dsra K,KCO,2 # K=KCO/2 - MTC $0,t11 - move B,BO # Reset B -# gsLQC1(R8,F4,F0,0) + dsra K,KCO,2 + move B,BO # Reset B LD a0,0*SIZE(A) 
- MOV t21,t11 - MOV t12,t11 - gsLQC1(R9,F9,F8,0) #b0,b1 - MOV t22,t11 + MTC $0,t11 + MOV t21,t11 + gsLQC1(R9,F9,F8,0) + + MOV t12,t11 beqz K,.L65 - nop + MOV t22,t11 #endif -.L61: # N=2 m=1,=K=4 - LD a4, 1*SIZE(A) - gsLQC1(R9,F13,F12,1) # R9=B +.L61: # nr=2,mr=1,kr=4 + LD a4, 1*SIZE(A) # a2 MADD t11,t11,a0,b0 + + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 - LD a2, 2*SIZE(A) - gsLQC1(R9,F11,F10,2) + LD a2, 2*SIZE(A) # a3 MADD t11,t11,a4,b4 + + gsLQC1(R9,F11,F10,2) MADD t12,t12,a4,b5 -# gsLQC1(R8,F6,F2,1) - LD a6, 3*SIZE(A) + LD a6, 3*SIZE(A) # a4 MADD t11,t11,a2,b2 - MADD t12,t12,a2,b3 daddiu K,K,-1 gsLQC1(R9,F15,F14,3) + MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 -# gsLQC1(R8,F4,F0,0) LD a0, 0*SIZE(A) + MADD t11,t11,a6,b6 daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE - gsLQC1(R9,F9,F8,0) - MADD t11,t11,a6,b6 + gsLQC1(R9,F9,F8,0) # a0 bnez K,.L61 MADD t12,t12,a6,b7 -.L65: # N=2 M=1 K=2 +.L65: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 #else - andi K,TEMP,2 + andi K,TEMP,2 #endif beqz K,.L68 nop .L66: - LD a4, 1*SIZE(A) + LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 - - gsLQC1(R9,F13,F12,1) # R9=B - MADD t12,t12,a0,b1 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + + gsLQC1(R9,F13,F12,1) + MADD t12,t12,a0,b1 daddu B,B,4*SIZE .L67: - LD a0,0(A) - gsLQC1(R9,F9,F8,0) + LD a0,0(A) # a0 MADD t11,t11,a4,b4 + + gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b5 -.L68: # N=2, M=1, K=1 +.L68: # kr=1 #ifndef TRMMKERNEL - andi K,KCO,1 + andi K,KCO,1 #else - andi K,TEMP,1 + andi K,TEMP,1 #endif - beqz K,.L69 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L69 + LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 @@ -1822,9 +1808,9 @@ daddu B,B,2*SIZE -.L69: # Write Back +.L69: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) # Fetch 16 C + LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) MADD t11,c11,t11,ALPHA @@ -1833,11 +1819,9 @@ ST t11,0(CO1) ST t12,0(CO2) - daddu CO1,CO1,1*SIZE # COx += 2*8Byte + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE - FETCH 
$0,0(CO1) - FETCH $0,0(CO2) #else MUL t11, ALPHA, t11 MUL t12, ALPHA, t12 @@ -1845,7 +1829,7 @@ ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) - daddu CO1,CO1,1*SIZE # COx += 2*8Byte + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1877,26 +1861,26 @@ .align 5 .L0_N1: - andi N,NCO,1 # Remainder N = 1 - beqz N,.L999 # N=0,NCO<1 + andi N,NCO,1 # nr = 1 + beqz N,.L999 nop - move CO1,C # Set C - dsra M,MCO,2 # M=MCO/2 - + move CO1,C + dsra M,MCO,2 + + move A,AO # Reset A + daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif - move A,AO # Reset A beqz M,.L11_M2 - daddu PREA,AO,SPANA - + daddu C,CO1,LDC .L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B, BO + move B, BO # Reset B #else dsll K, KK, 2 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT @@ -1904,13 +1888,15 @@ daddu A, A, K daddu B, BO, TEMP #endif -# gsLQC1(R9,F12,F8,0) - LD b0, 0*SIZE(B) + MTC $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 + LD b0, 0*SIZE(B) + MOV t21,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 + gsLQC1(R8,F1,F0,0) #a0,a1 + MOV t31,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -1918,122 +1904,125 @@ #else daddiu TEMP, KK, 1 #endif - dsra K,TEMP,2 # K=KCO/2 - beqz K,.L75 + dsra K,TEMP,2 MOV t41,t11 + beqz K,.L75 + nop #else - move B, BO - dsra K,KCO,2 # K=KCO/2 -# gsLQC1(R9,F12,F8,0) + move B, BO # Reset B + dsra K,KCO,2 LD b0, 0*SIZE(B) + MTC $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 - gsLQC1(R8,F3,F2,1) #a2,a3 + gsLQC1(R8,F1,F0,0) #a0,a1 + MOV t31,t11 - beqz K,.L75 MOV t41,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + + beqz K,.L75 + nop #endif - -.L71: # N=1,M=K=4 - gsLQC1(R8,F5,F4,2) # R8=A - gsLQC1(R8,F7,F6,3) +.L71: # nr=1,mr=kr=4 + LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 + + gsLQC1(R8,F5,F4,2) MADD t21,t21,a1,b0 - LD b4, 1*SIZE(B) + 
gsLQC1(R8,F7,F6,3) FETCH $0,(PREA) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 .L72: -# gsLQC1(R9,F14,F10,1) - gsLQC1(R8,F1,F0,4) - gsLQC1(R8,F3,F2,5) + LD b2, 2*SIZE(B) # b2 MADD t11,t11,a4,b4 + gsLQC1(R8,F1,F0,4) MADD t21,t21,a5,b4 - LD b2, 2*SIZE(B) + gsLQC1(R8,F3,F2,5) FETCH $0,4*SIZE(PREA) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 .L73: - gsLQC1(R8,F5,F4,6) - gsLQC1(R8,F7,F6,7) - MADD t11,t11,a0,b2 - LD b6, 3*SIZE(B) + MADD t11,t11,a0,b2 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + gsLQC1(R8,F5,F4,6) MADD t21,t21,a1,b2 - daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE - FETCH $0,8*SIZE(PREA) + + gsLQC1(R8,F7,F6,7) MADD t31,t31,a2,b2 MADD t41,t41,a3,b2 - daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE .L74: -# gsLQC1(R9,F12,F8,0) - gsLQC1(R8,F1,F0,0) - daddu PREA,PREA,16*SIZE - gsLQC1(R8,F3,F2,1) - MADD t11,t11,a4,b6 - MADD t21,t21,a5,b6 - LD b0, 0*SIZE(B) + MADD t11,t11,a4,b6 + daddu PREA,PREA,16*SIZE + + gsLQC1(R8,F1,F0,0) + MADD t21,t21,a5,b6 daddiu K,K,-1 FETCH $0,-32(PREA) + gsLQC1(R8,F3,F2,1) MADD t31,t31,a6,b6 bnez K,.L71 MADD t41,t41,a7,b6 -.L75: # N=2 M=4 K=2 +.L75: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 #else - andi K,TEMP,2 + andi K,TEMP,2 #endif beqz K,.L78 nop .L76: - gsLQC1(R8,F5,F4,2) # R8=A - gsLQC1(R8,F7,F6,3) - MADD t11,t11,a0,b0 - MADD t21,t21,a1,b0 - daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE - LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + gsLQC1(R8,F5,F4,2) + MADD t21,t21,a1,b0 FETCH $0,0(PREA) + + gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 - daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L77: - gsLQC1(R8,F1,F0,0) - gsLQC1(R8,F3,F2,1) - MADD t11,t11,a4,b4 - MADD t21,t21,a5,b4 - LD b0,0(B) + MADD t11,t11,a4,b4 + + gsLQC1(R8,F1,F0,0) + MADD t21,t21,a5,b4 FETCH $0,4*SIZE(PREA) + + gsLQC1(R8,F3,F2,1) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 daddu PREA,PREA,8*SIZE - -.L78: # 
N=2, M=4, K=1 +.L78: # kr=1 #ifndef TRMMKERNEL - andi K,KCO,1 + andi K,KCO,1 #else - andi K,TEMP,1 + andi K,TEMP,1 #endif - beqz K,.L79 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L79 + LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 @@ -2046,9 +2035,9 @@ daddu PREA,PREA,4*SIZE -.L79: # Write Back +.L79: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) # Fetch 16 C + LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -2062,15 +2051,15 @@ ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) - daddiu M,M,-1 # M-- + daddiu M,M,-1 # M-- FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + bnez M,.L70 # M!=0 daddu CO1,CO1,4*SIZE # COx += 4*8Byte - bnez M,.L70 # M!=0 - nop #else - daddiu M,M,-1 # M-- + daddiu M,M,-1 # M-- MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 @@ -2081,9 +2070,11 @@ ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) - daddu CO1,CO1,4*SIZE # COx += 4*8Byte -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + daddu CO1,CO1,4*SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 @@ -2101,14 +2092,14 @@ #ifdef LEFT daddiu KK, KK, 4 #endif - bnez M,.L70 # M!=0 + bnez M,.L70 nop #endif - + .align 3 .L11_M2: - andi M,MCO,2 # Remainder M = 2 + andi M,MCO,2 # mr = 2 beqz M,.L11_M1 nop @@ -2124,10 +2115,10 @@ daddu B, BO, TEMP #endif -# gsLQC1(R9,F12,F8,0) LD b0, 0*SIZE(B) MTC $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 + + gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK @@ -2141,19 +2132,20 @@ nop #else move B, BO - dsra K,KCO,2 # K=KCO/2 -# gsLQC1(R9,F12,F8,0) + dsra K,KCO,2 LD b0, 0*SIZE(B) + MTC $0,t11 - gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + beqz K,.L85 nop #endif -.L81: # N=1,M=2,K=4 +.L81: # nr=1,mr=2,kr=4 LD b4, 1*SIZE(B) - 
gsLQC1(R8,F5,F4,1) # R8=A + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -2162,42 +2154,38 @@ MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 -# gsLQC1(R9,F14,F10,1) - LD b6, 3*SIZE(B) gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b2 - MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 -# gsLQC1(R9,F12,F8,0) - gsLQC1(R8,F1,F0,0) - daddiu K,K,-1 - MADD t11,t11,a6,b6 - LD b0, 0*SIZE(B) - bnez K,.L81 + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a6,b6 MADD t21,t21,a7,b6 + + daddiu K,K,-1 + bnez K,.L81 + nop - -.L85: # N=2 M=4 K=2 +.L85: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 #else andi K,TEMP,2 #endif - beqz K,.L88 nop .L86: - gsLQC1(R8,F5,F4,1) # R8=A + gsLQC1(R8,F5,F4,1) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 @@ -2207,15 +2195,14 @@ MADD t21,t21,a5,b4 -.L88: # N=2, M=4, K=1 +.L88: # kr=1 #ifndef TRMMKERNEL - andi K,KCO,1 + andi K,KCO,1 #else andi K,TEMP,1 #endif - - beqz K,.L89 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L89 + LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -2223,9 +2210,9 @@ daddu B,B,1*SIZE -.L89: # Write Back +.L89: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) # Fetch 16 C + LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA @@ -2237,15 +2224,16 @@ FETCH $0,2*SIZE(CO1) daddu CO1,CO1,2*SIZE # COx += 2*8Byte + #else daddu CO1,CO1,2*SIZE # COx += 2*8Byte MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 + FETCH $0,0(CO1) ST t11, -2 * SIZE(CO1) ST t21, -1 * SIZE(CO1) -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 @@ -2266,10 +2254,10 @@ #endif - + .align 3 .L11_M1: - andi M,MCO,1 # Remainder M = 1 - beqz M,.L999 # M = 0, End + andi M,MCO,1 # mr = 1 + beqz M,.L999 nop 
.L90: @@ -2283,11 +2271,9 @@ daddu A, A, K daddu B, BO, TEMP #endif -# gsLQC1(R8,F4,F0,0) - MTC $0,t11 -# gsLQC1(R9,F12,F8,0) LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) + MTC $0,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -2301,27 +2287,22 @@ #else move B, BO - dsra K,KCO,2 # K=KCO/2 -# gsLQC1(R8,F4,F0,0) -# gsLQC1(R9,F12,F8,0) LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) + dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif -.L91: # N=1,M=1,K=4 -# gsLQC1(R8,F6,F2,1) +.L91: # nr=mr=1,kr=4 LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 -# gsLQC1(R9,F14,F10,1) + LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 - -# gsLQC1(R8,F4,F0,0) LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 @@ -2331,16 +2312,15 @@ LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) -# gsLQC1(R9,F12,F8,0) MADD t11,t11,a6,b6 daddiu K,K,-1 bnez K,.L91 nop -.L95: # N=2 M=4 K=2 +.L95: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 # k = KCO&2 + andi K,KCO,2 #else andi K,TEMP,2 #endif @@ -2357,25 +2337,25 @@ LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 - - -.L98: # N=2, M=4, K=1 +.L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L99 # - LD ALPHA,152($sp) # Get ALPHA + beqz K,.L99 + LD ALPHA,152($sp) # Get ALPHA + MADD t11,t11,a0,b0 -.L99: # Write Back +.L99: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) # Fetch 16 C + LD c11,0(CO1) # Fetch 16 C MADD t11,c11,t11,ALPHA ST t11,0(CO1) + #else MUL t11, ALPHA, t11 From fc8490911562499a993b33c77455c381fe4274c3 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 27 May 2011 09:47:17 +0000 Subject: [PATCH 22/42] Modify single precision compiler conditions, increasing single precision kernel code on Loongson3a. 
--- kernel/mips64/KERNEL | 11 + kernel/mips64/KERNEL.LOONGSON3A | 18 +- kernel/mips64/sgemm_kernel_loongson3a.S | 2559 +++++++++++++++++++++++ 3 files changed, 2584 insertions(+), 4 deletions(-) create mode 100644 kernel/mips64/sgemm_kernel_loongson3a.S diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index f6615bf01..ebb447b11 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -91,10 +91,21 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef STRSMKERNEL_LN STRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef STRSMKERNEL_LT STRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RN STRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RT STRSMKERNEL_RT = trsm_kernel_RT.S +endif ifndef DTRSMKERNEL_LN DTRSMKERNEL_LN = trsm_kernel_LN.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 0e387c032..e72ac142e 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -1,14 +1,24 @@ SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S +SGEMMKERNEL = sgemm_kernel_loongson3a.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + DGEMMKERNEL = gemm_kernel_loongson3a.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -DTRSMKERNEL_LN = trsm_kernel_LN_loongson3a.S -DTRSMKERNEL_LT = trsm_kernel_LT_loongson3a.S -DTRSMKERNEL_RN = trsm_kernel_RN_loongson3a.S -DTRSMKERNEL_RT = trsm_kernel_RT_loongson3a.S +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c 
diff --git a/kernel/mips64/sgemm_kernel_loongson3a.S b/kernel/mips64/sgemm_kernel_loongson3a.S new file mode 100644 index 000000000..36c3b3878 --- /dev/null +++ b/kernel/mips64/sgemm_kernel_loongson3a.S @@ -0,0 +1,2559 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#if defined(TRMMKERNEL) +#define OFFSET $2 +#define KK $3 +#define TEMP $7 +#endif + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 +#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 +#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 $f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 
+#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) + + + .align 5 +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + dsra N,NCO,2 # N=NCO/2 + + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 + +#if defined(TRMMKERNEL) + LDARG OFFSET,160($sp) # OFFSET is relate to the data part +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET +#endif + + move BO,B # Backup B_addr + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte + +.L0_N4_Lb: # mr=4,nr=4 + move CO1,C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + daddu CO2,C,LDC + + daddu PREB,BO,SPANB # PreB point next panelB + daddu CO3,CO2,LDC + + daddu PREA,AO,SPANA + daddu CO4,CO3,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK,OFFSET +#endif + beqz M,.L14_M2 + daddu C,CO4,LDC # move C to next panel Cj + +.L10: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) +#else + dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K # 
move A B to data part + daddu B,BO,TEMP +#endif + MTC $0,t11 + MOV t21,t11 + LD a0,0(A) + + MOV t31,t11 + MOV t41,t11 + LD a1,1*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + LD b0,0(B) + + MOV t32,t11 + MOV t42,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD a2,2*SIZE(A) + + MOV t33,t11 + MOV t43,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + MOV t24,t11 + LD a3,3*SIZE(A) + + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK # temp is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 4 # S=L,U=L +#else + daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part +#endif + dsra K,TEMP,2 # K=KCO/2 + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + dsra K,KCO,2 # K=KCO/2 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 # clear 16 results registers +#endif + + .align 5 +.L11: # kr=4 + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + FETCH $0,(PREB) + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + LD b6,6*SIZE(B) + FETCH $0,(PREA) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + LD a7,7*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,7*SIZE(B) + +.L12: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,8*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,9*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,8*SIZE(B) + + MADD 
t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,9*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,10*SIZE(A) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,10*SIZE(B) + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + LD a3,11*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + LD b3,11*SIZE(B) + +.L13: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,12*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,13*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,12*SIZE(B) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,13*SIZE(B) + + FETCH $0,8*SIZE(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,14*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,16*SIZE # 4mr*4kr + LD b6,14*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,16*SIZE # 4nr*4kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L14: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddiu K,K,-1 + LD b0,0(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,16*SIZE + LD b1,1*SIZE(B) + + FETCH $0,12*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREB,PREB,16*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + bnez K,.L11 + LD b3,3*SIZE(B) + + +.L15: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP, 2 +#endif + beqz K,.L18 + nop + +.L16: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + + FETCH $0,0(PREB) + 
MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,8*SIZE # 4mr*2kr + LD b6,6*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,8*SIZE # 4nr*2kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L17: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,0*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,1*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREA,PREA,8*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + daddu PREB,PREB,8*SIZE + LD b3,3*SIZE(B) + + +.L18: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L19 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREB) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # 4mr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE # 4nr*kr + + FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu PREB,PREB,4*SIZE + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu PREA,PREA,4*SIZE + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L19: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part + LD c21,1*SIZE(CO1) # get 16 C + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + LD c13,0(CO3) + MADD t12,c12,t12,ALPHA + LD c23,1*SIZE(CO3) + MADD t22,c22,t22,ALPHA + LD c33,2*SIZE(CO3) + MADD 
t32,c32,t32,ALPHA + LD c43,3*SIZE(CO3) + MADD t42,c42,t42,ALPHA + + LD c14,0(CO4) + MADD t13,c13,t13,ALPHA + LD c24,1*SIZE(CO4) + MADD t23,c23,t23,ALPHA + LD c34,2*SIZE(CO4) + MADD t33,c33,t33,ALPHA + LD c44,3*SIZE(CO4) + MADD t43,c43,t43,ALPHA + + ST t11,0(CO1) + MADD t14,c14,t14,ALPHA + ST t21,1*SIZE(CO1) + MADD t24,c24,t24,ALPHA + ST t31,2*SIZE(CO1) + MADD t34,c34,t34,ALPHA + ST t41,3*SIZE(CO1) + MADD t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + ST t13,0(CO3) + ST t23,1*SIZE(CO3) + ST t33,2*SIZE(CO3) + ST t43,3*SIZE(CO3) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + FETCH $0,8*SIZE(CO3) + FETCH $0,8*SIZE(CO4) + + ST t14,0(CO4) + daddu CO1,CO1,4*SIZE # COi += 4 + ST t24,1*SIZE(CO4) + daddu CO2,CO2,4*SIZE + ST t34,2*SIZE(CO4) + daddu CO3,CO3,4*SIZE + ST t44,3*SIZE(CO4) + daddu PREB,BO,SPANB + + bnez M,.L10 + daddu CO4,CO4,4*SIZE + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t31, 2 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t41, 3 * SIZE(CO1) + MUL t42, ALPHA, t42 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + ST t32, 2 * SIZE(CO2) + MUL t33, ALPHA, t33 + ST t42, 3 * SIZE(CO2) + MUL t43, ALPHA, t43 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + ST t33, 2 * SIZE(CO3) + MUL t34, ALPHA, t34 + ST t43, 3 * SIZE(CO3) + MUL t44, ALPHA, t44 + + ST t14, 0 * SIZE(CO4) + daddiu M,M,-1 # M-- + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + daddiu CO1,CO1, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE + daddiu CO3,CO3, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + 
FETCH $0,4*SIZE(CO4) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP, -4 +#else + daddiu TEMP,TEMP, -4 +#endif + dsll K,TEMP,2 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + daddu A,A,K # mov A to the end of panel Ai + daddu B,B,TEMP # mov B to the end of panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK,4 +#endif + bnez M,.L10 + nop +#endif + + + .align 3 +.L14_M2: + andi M, MCO, 2 # nr=4,mr=2 + beqz M,.L14_M1 + nop + +.L20: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll TEMP,KK,2 + BASE_SHIFT # nr=4 + daddu A,A,K + daddu B,BO,TEMP +#endif + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t12,t11 + LD b0,0*SIZE(B) + MOV t22,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + MOV t23,t11 + LD b3,3*SIZE(B) + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 +#else + daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 +#endif + dsra K,TEMP,2 + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 # clear 2*4=8 results registers + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + dsra K,KCO,2 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 + +#endif + +.L21: # nr=4,mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t11,t11,a4,b4 + 
LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD a3,5*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,8*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,9*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,10*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,11*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + daddiu K,K,-1 + + MADD t11,t11,a2,b0 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b0 + LD a7,7*SIZE(A) + + MADD t12,t12,a2,b1 + LD b4,12*SIZE(B) + MADD t22,t22,a3,b1 + LD b5,13*SIZE(B) + + MADD t13,t13,a2,b2 + LD b6,14*SIZE(B) + MADD t23,t23,a3,b2 + LD b7,15*SIZE(B) + + MADD t14,t14,a2,b3 + MADD t24,t24,a3,b3 + daddu A,A,8*SIZE # 2mr*4kr + daddu B,B,16*SIZE # 4nr*4kr + + MADD t11,t11,a6,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a6,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a7,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a6,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a7,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a6,b7 + bnez K,.L21 + MADD t24,t24,a7,b7 + + +.L25: +#ifndef TRMMKERNEL + andi K,KCO,2 # kr=2 +#else + andi K,TEMP,2 +#endif + beqz K,.L28 + nop + +.L26: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,4*SIZE # 2mr*2kr + daddu B,B,8*SIZE # 4nr*2kr + +.L27: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + +.L28: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L29 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # 2mr*kr + daddu B,B,4*SIZE # 4nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD 
t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L29: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write back part + LD c21,1*SIZE(CO1) + + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + LD c13,0(CO3) + MADD t11,c11,t11,ALPHA + LD c23,1*SIZE(CO3) + MADD t21,c21,t21,ALPHA + + LD c14,0(CO4) + MADD t12,c12,t12,ALPHA + LD c24,1*SIZE(CO4) + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + MADD t13,c13,t13,ALPHA + ST t21,1*SIZE(CO1) + MADD t23,c23,t23,ALPHA + + ST t12,0(CO2) + MADD t14,c14,t14,ALPHA + ST t22,1*SIZE(CO2) + MADD t24,c24,t24,ALPHA + + ST t13,0(CO3) + daddu CO1,CO1,2*SIZE # COi += 2 + ST t23,1*SIZE(CO3) + daddu CO2,CO2,2*SIZE + + ST t14,0(CO4) + daddu CO3,CO3,2*SIZE + ST t24,1*SIZE(CO4) + daddu CO4,CO4,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP,-2 +#else + daddiu TEMP,TEMP,-4 +#endif + dsll K,TEMP,1 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K # move A to next panel Ai + daddu B,B,TEMP # move B to next panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L14_M1: + andi M,MCO,1 # mr=1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj + nop + +.L30: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # 
Reset B +#else + dsll K,KK, 0 + BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + MTC $0,t11 + MOV t12,t11 + LD a0, 0 * SIZE(A) # a0 + + MOV t13,t11 + LD b0,0*SIZE(B) + MOV t14,t11 # clear result registers + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra K,TEMP, 2 + LD b2,2*SIZE(B) # TRMM path: preload b2 and b3 like the GEMM path, .L31/.L36 and the kr=1 tail read them + beqz K,.L35 + LD b3,3*SIZE(B) # branch delay slot, runs whether or not the branch is taken + +#else + move B,BO # Reset B, GEMM part + dsra K,KCO,2 # K=KCO/2 + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + beqz K,.L35 + LD b3,3*SIZE(B) + +#endif + +.L31: # nr=4,mr=1,kr=4 + LD a1, 1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + + LD b6,6*SIZE(B) + LD b7,7*SIZE(B) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + + LD a2, 2*SIZE(A) # a2 + MADD t11,t11,a1,b4 + + LD b0,8*SIZE(B) + LD b1,9*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,10*SIZE(B) + LD b3,11*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + LD a3, 3*SIZE(A) # a3 + MADD t11,t11,a2,b0 + daddiu K,K,-1 + + LD b4,12*SIZE(B) + LD b5,13*SIZE(B) + MADD t12,t12,a2,b1 + daddu A,A,4*SIZE # 1mr*4kr + + LD b6,14*SIZE(B) + LD b7,15*SIZE(B) + MADD t13,t13,a2,b2 + MADD t14,t14,a2,b3 + + LD a0, 0*SIZE(A) # a0 + daddu B,B,16*SIZE # 4nr*4kr + MADD t11,t11,a3,b4 + + LD b0,0*SIZE(B) + MADD t12,t12,a3,b5 + LD b1,1*SIZE(B) + MADD t13,t13,a3,b6 + + LD b2,2*SIZE(B) + MADD t14,t14,a3,b7 + bnez K,.L31 + LD b3,3*SIZE(B) + + +.L35: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L38 + nop + +.L36: + LD a1,1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + daddu A,A,2*SIZE # mr*2kr + + LD b6,6*SIZE(B) + MADD t13,t13,a0,b2 + + LD b7,7*SIZE(B) + MADD t14,t14,a0,b3 + daddu B,B,8*SIZE # 4nr*2kr + + +.L37: + LD a0,0(A) + MADD 
t11,t11,a1,b4 + + LD b0,0*SIZE(B) + LD b1,1*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + +.L38: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L39 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE + daddu B,B,4*SIZE + + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + +.L39: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) + LD c12,0(CO2) + LD c13,0(CO3) + LD c14,0(CO4) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + MADD t13,c13,t13,ALPHA + MADD t14,c14,t14,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + ST t13,0(CO3) + ST t14,0(CO4) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + MUL t13, ALPHA, t13 + MUL t14, ALPHA, t14 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll K,TEMP, 0 + BASE_SHIFT + dsll TEMP,TEMP, 2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + + .align 3 +.L0_N4_Loop: # mc finished + daddiu N,N,-1 # N-- +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK,4 +#endif + bnez N,.L0_N4_Lb + move BO,B # Set BO point to next panel Bj + + .align 5 +.L0_N2: + andi N,NCO,2 # nr = 2 + beqz N,.L0_N1 + nop + +.L0_N2_Lb: + move CO1,C + daddu CO2,C,LDC + + dsra M,MCO,2 + move A,AO # Reset A + + daddu PREA,AO,SPANA + daddu C,CO2,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + beqz M,.L12_M2 + nop + +.L40: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, 2 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD 
a1,1*SIZE(A) + + MOV t31,t11 + LD b0,0*SIZE(B) + MOV t41,t11 + LD b1,1*SIZE(B) + + MOV t12,t11 + LD a2,2*SIZE(A) + MOV t22,t11 + LD a3,3*SIZE(A) + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#endif + +.L41: # nr=2,mr=kr=4 + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L42: + MADD t11,t11,a4,b4 + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,9*SIZE(A) + + MADD t12,t12,a4,b5 + LD b2,4*SIZE(B) + MADD t22,t22,a5,b5 + LD b3,5*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,10*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,11*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + +.L43: + MADD t11,t11,a0,b2 + LD a4,12*SIZE(A) + MADD t21,t21,a1,b2 + LD a5,13*SIZE(A) + + MADD t12,t12,a0,b3 + LD b6,6*SIZE(B) + MADD t22,t22,a1,b3 + LD b7,7*SIZE(B) + + MADD t31,t31,a2,b2 + LD a6,14*SIZE(A) + MADD t41,t41,a3,b2 + LD a7,15*SIZE(A) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b3 + MADD t42,t42,a3,b3 + + daddu A,A,16*SIZE # 4mr*4kr + daddu B,B,8*SIZE # 2nr*4kr + +.L44: + MADD t11,t11,a4,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b6 + LD a1,1*SIZE(A) + + + MADD t12,t12,a4,b7 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b7 + LD b1,1*SIZE(B) + + daddiu K,K,-1 + daddu PREA,PREA,16*SIZE + + MADD t31,t31,a6,b6 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b6 
+ LD a3,3*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t32,t32,a6,b7 + bnez K,.L41 + MADD t42,t42,a7,b7 + + +.L45: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L48 + nop + +.L46: + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L47: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,3*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,8*SIZE + + + +.L48: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L49 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,2*SIZE + daddu PREA,PREA,4*SIZE + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L49: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + MADD t12,c12,t12,ALPHA + ST t21,1*SIZE(CO1) + MADD t22,c22,t22,ALPHA + ST t31,2*SIZE(CO1) + MADD t32,c32,t32,ALPHA + ST t41,3*SIZE(CO1) + MADD t42,c42,t42,ALPHA + daddiu M,M,-1 + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + FETCH 
$0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + + daddu CO1,CO1,4*SIZE + bnez M,.L40 + daddu CO2,CO2,4*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t21, 1 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t31, 2 * SIZE(CO1) + MUL t42, ALPHA, t42 + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + daddiu M,M,-1 + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,4*SIZE(CO1) # offset in elements like the GEMM path (was a raw 4-byte offset, mid-element for the ld behind FETCH) + FETCH $0,4*SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll K,TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L40 + nop +#endif + + + .align 3 +.L12_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L12_M1 + nop + +.L50: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K, KK, 1 + BASE_SHIFT #mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 + + daddu A, A, K + daddu B, BO, TEMP +#endif + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + LD b0,0*SIZE(B) # TRMM path: preload b0 and b1 like the GEMM path, .L51/.L56 and the kr=1 tail read them + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#else + move B,BO + LD a0,0*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#endif + +.L51: # nr=2 mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + LD a5,3*SIZE(A) + 
MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD b2,4*SIZE(B) + + MADD t12,t12,a4,b5 + LD a3,5*SIZE(A) + MADD t22,t22,a5,b5 + daddiu K,K,-1 + LD b3,5*SIZE(B) + + MADD t11,t11,a2,b2 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + LD b6,6*SIZE(B) + + MADD t12,t12,a2,b3 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + LD a7,-1*SIZE(A) + MADD t22,t22,a3,b3 + LD b7,-1*SIZE(B) + + MADD t11,t11,a6,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b6 + LD b0,0*SIZE(B) + + MADD t12,t12,a6,b7 + LD a1,1*SIZE(A) + + MADD t22,t22,a7,b7 + bnez K,.L51 + LD b1,1*SIZE(B) + + +.L55: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L58 + nop + +.L56: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE # 2nr*2kr + LD a5,-1*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,-1*SIZE(B) + +.L57: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD b0,0*SIZE(B) + + MADD t12,t12,a4,b5 + LD a1,1*SIZE(A) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + +.L58: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP, 1 +#endif + beqz K,.L59 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE # 2nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + +.L59: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # write gemm part back Fetch 16 C + LD c21,1*SIZE(CO1) + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t12,c12,t12,ALPHA + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + + daddu CO1,CO1,2*SIZE + daddu CO2,CO2,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) +#else + daddiu M, M, -1 + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL 
t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) + ST t12, -2 * SIZE(CO2) + ST t22, -1 * SIZE(CO2) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L12_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L0_N2_Loop + nop + +.L60: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + MTC $0,t11 + LD a0, 0*SIZE(A) # a0 + + MOV t21,t11 + LD b0,0*SIZE(B) + LD b1,1*SIZE(B) # TRMM path: preload b1 like the GEMM path, .L61/.L66 and the kr=1 tail read it + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L65 + MOV t22,t11 + +#else + dsra K,KCO,2 + move B,BO # Reset B + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + beqz K,.L65 + MOV t22,t11 + +#endif + +.L61: # nr=2,mr=1,kr=4 + LD a4, 1*SIZE(A) # a2 + LD b4, 2*SIZE(B) + MADD t11,t11,a0,b0 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + + LD a2, 2*SIZE(A) # a3 + LD b2,4*SIZE(B) + MADD t11,t11,a4,b4 + + LD b3,5*SIZE(B) + MADD t12,t12,a4,b5 + + LD a6, 3*SIZE(A) # a4 + daddiu K,K,-1 + LD b6,6*SIZE(B) + MADD t11,t11,a2,b2 + + LD b7,7*SIZE(B) + MADD t12,t12,a2,b3 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + LD b0,0*SIZE(B) + MADD t11,t11,a6,b6 + + LD b1,1*SIZE(B) + bnez K,.L61 + MADD t12,t12,a6,b7 + + + +.L65: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + 
beqz K,.L68 + nop + +.L66: + LD a4, 1*SIZE(A) # a1 + MADD t11,t11,a0,b0 + LD b4,2*SIZE(B) + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE + +.L67: + LD a0,0(A) # a0 + LD b0,0*SIZE(B) + MADD t11,t11,a4,b4 + + LD b1,1*SIZE(B) + MADD t12,t12,a4,b5 + + +.L68: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L69 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + +.L69: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + +.L0_N2_Loop: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move BO, B + + + .align 5 +.L0_N1: + andi N,NCO,1 # nr = 1 + beqz N,.L999 + nop + + move CO1,C + dsra M,MCO,2 + + move A,AO # Reset A + daddu PREA,AO,SPANA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + beqz M,.L11_M2 + daddu C,CO1,LDC + +.L70: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO # Reset B +#else + dsll K, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + MTC $0,t11 + LD b0, 0*SIZE(B) + + MOV t21,t11 + LD a0,0*SIZE(A) + MOV t31,t11 + LD a1,1*SIZE(A) + LD a2,2*SIZE(A) # TRMM path: preload a2 and a3 like the GEMM path, .L71/.L76 and the kr=1 tail read them + LD a3,3*SIZE(A) + +#if (defined(LEFT) && !defined(TRANSA)) 
|| (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 + MOV t41,t11 + beqz K,.L75 + nop +#else + move B, BO # Reset B + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + beqz K,.L75 + LD a3,3*SIZE(A) + +#endif + +.L71: # nr=1,mr=kr=4 + LD b4, 1*SIZE(B) # b1 + MADD t11,t11,a0,b0 + + LD a4, 4*SIZE(A) + MADD t21,t21,a1,b0 + + LD a5, 5*SIZE(A) + FETCH $0,(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + +.L72: + LD b2, 2*SIZE(B) # b2 + MADD t11,t11,a4,b4 + + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + + LD a1,9*SIZE(A) + FETCH $0,4*SIZE(PREA) + + LD a2,10*SIZE(A) + MADD t31,t31,a6,b4 + + LD a3,11*SIZE(A) + MADD t41,t41,a7,b4 + +.L73: + LD b6, 3*SIZE(B) + MADD t11,t11,a0,b2 + + LD a4,12*SIZE(A) + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a5,13*SIZE(A) + MADD t21,t21,a1,b2 + + LD a6,14*SIZE(A) + FETCH $0,8*SIZE(PREA) + MADD t31,t31,a2,b2 + + LD a7,15*SIZE(A) + MADD t41,t41,a3,b2 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L74: + LD b0, 0*SIZE(B) + MADD t11,t11,a4,b6 + + LD a0,0*SIZE(A) + daddu PREA,PREA,16*SIZE + + LD a1,1*SIZE(A) + MADD t21,t21,a5,b6 + + LD a2,2*SIZE(A) + daddiu K,K,-1 + MADD t31,t31,a6,b6 + + LD a3,3*SIZE(A) + MADD t41,t41,a7,b6 + bnez K,.L71 + FETCH $0,-32(PREA) + + +.L75: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L78 + nop + +.L76: + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a4,4*SIZE(A) + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + LD a5,5*SIZE(A) + MADD t21,t21,a1,b0 + FETCH $0,0(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L77: + LD b0,0(B) + MADD t11,t11,a4,b4 + + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + FETCH $0,4*SIZE(PREA) + + LD a1,1*SIZE(A) + MADD 
t31,t31,a6,b4 + + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + + LD a3,3*SIZE(A) + daddu PREA,PREA,8*SIZE + + + +.L78: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L79 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,1*SIZE + daddu PREA,PREA,4*SIZE + + +.L79: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t31,c31,t31,ALPHA + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + bnez M,.L70 # M!=0 + daddu CO1,CO1,4*SIZE # COx += 4*8Byte +#else + daddiu M,M,-1 # M-- + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + daddu CO1,CO1,4*SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A,K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L70 + nop +#endif + + + .align 3 +.L11_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L11_M1 + nop + +.L80: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + MTC $0,t11 + + LD a0,0*SIZE(A) + MOV t21,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK 
+#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L85 + LD a1,1*SIZE(A) # delay slot: TRMM path preloads a1 like the GEMM path, .L81/.L86 and the kr=1 tail read it +#else + move B, BO + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + + beqz K,.L85 + LD a1,1*SIZE(A) + +#endif + +.L81: # nr=1,mr=2,kr=4 + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + LD b2, 2*SIZE(B) + LD a2,4*SIZE(A) + MADD t11,t11,a4,b4 + LD a3,5*SIZE(A) + MADD t21,t21,a5,b4 + + LD b6, 3*SIZE(B) + LD a6,6*SIZE(A) + MADD t11,t11,a2,b2 + LD a7,7*SIZE(A) + MADD t21,t21,a3,b2 + + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD b0, 0*SIZE(B) + daddiu K,K,-1 + + LD a0,0*SIZE(A) + MADD t11,t11,a6,b6 + + LD a1,1*SIZE(A) + bnez K,.L81 + MADD t21,t21,a7,b6 + +.L85: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L88 + nop + +.L86: + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + + LD b0,0(B) + LD a0,0*SIZE(A) + MADD t11,t11,a4,b4 + LD a1,1*SIZE(A) + MADD t21,t21,a5,b4 + + + +.L88: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L89 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,1*SIZE + + +.L89: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + + FETCH $0,2*SIZE(CO1) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + +#else + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + FETCH $0,0(CO1) + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, 
TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L11_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L999 + nop + +.L90: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MTC $0,t11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra K, TEMP, 2 + beqz K,.L95 + nop + +#else + move B, BO + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + dsra K,KCO,2 + beqz K,.L95 + MTC $0,t11 +#endif + +.L91: # nr=mr=1,kr=4 + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a2, 2*SIZE(A) + LD b2, 2*SIZE(B) + MADD t11,t11,a4,b4 + + LD a6, 3*SIZE(A) + LD b6, 3*SIZE(B) + MADD t11,t11,a2,b2 + + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MADD t11,t11,a6,b6 + + daddiu K,K,-1 + bnez K,.L91 + nop + +.L95: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L98 + nop + +.L96: + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + LD b0,0(B) + LD a0,0(A) + MADD t11,t11,a4,b4 + +.L98: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L99 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + + +.L99: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + MADD t11,c11,t11,ALPHA + ST t11,0(CO1) + +#else + MUL t11, ALPHA, t11 + + ST t11, 0 * SIZE(CO1) +#endif + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 
16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE From cc09e6ef3a6b18640722c83a1c948261b60a5d57 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 27 May 2011 18:12:45 +0800 Subject: [PATCH 23/42] Ignore *.obj files in git. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 44af57166..fa15203f1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +*.obj +*.lib +*.dll +*.def *.o lapack-3.1.1 lapack-3.1.1.tgz From c30c22a76cc072c8791667c46b5cad9a5e431583 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 27 May 2011 18:16:19 +0800 Subject: [PATCH 24/42] Fixed a bug about detecting underscore prefix in c_check. --- Changelog.txt | 1 + c_check | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index cd1b4c3ef..461058279 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -17,6 +17,7 @@ common: * Fixed issue #23. Fixed a bug of f_check script about generating link flags. * Added openblas_set_num_threads for Fortran. * Fixed #25 a wrong result of rotmg. + * Fixed a bug about detecting underscore prefix in c_check. x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64.
diff --git a/c_check b/c_check index d8025f9f3..263efeb3d 100644 --- a/c_check +++ b/c_check @@ -149,7 +149,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; -$data =~ /globl\ ([_\.]*)(.*)/; +$data =~ /globl\s([_\.]*)(.*)/; $need_fu = $1; From af40551c9f47ce752abefbf8afddba348875b0f6 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 27 May 2011 21:15:30 +0800 Subject: [PATCH 25/42] Fixed the makefile bug about openblas_set_num_threads. --- driver/others/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/Makefile b/driver/others/Makefile index ab0e2fea0..75b552b65 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -101,6 +101,7 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. $(CC) $(CFLAGS) -c $< -o $(@F) openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c + $(CC) $(CFLAGS) -c $< -o $(@F) blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) From 88d94d0ec826f54e409d0690eea1783089d0926b Mon Sep 17 00:00:00 2001 From: traz Date: Sat, 28 May 2011 09:48:34 +0000 Subject: [PATCH 26/42] Fixed #30 strmm computational error on Loongson3A. 
--- kernel/mips64/sgemm_kernel_loongson3a.S | 110 ++++++++++++++---------- param.h | 10 +-- 2 files changed, 70 insertions(+), 50 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_loongson3a.S b/kernel/mips64/sgemm_kernel_loongson3a.S index 36c3b3878..4a8c9b0e4 100644 --- a/kernel/mips64/sgemm_kernel_loongson3a.S +++ b/kernel/mips64/sgemm_kernel_loongson3a.S @@ -1,6 +1,7 @@ #define REALNAME ASMNAME #define ASSEMBLER #include "common.h" + #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) @@ -215,35 +216,36 @@ daddu A,A,K # move A B to data part daddu B,BO,TEMP #endif - MTC $0,t11 - MOV t21,t11 + + MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) - + + MOV t21,t11 MOV t31,t11 - MOV t41,t11 LD a1,1*SIZE(A) + MOV t41,t11 MOV t12,t11 - MOV t22,t11 LD b0,0(B) + MOV t22,t11 MOV t32,t11 - MOV t42,t11 LD b1,1*SIZE(B) + MOV t42,t11 + LD a2,2*SIZE(A) + MOV t13,t11 MOV t23,t11 - LD a2,2*SIZE(A) - + LD b2,2*SIZE(B) + MOV t33,t11 MOV t43,t11 - LD b2,2*SIZE(B) + LD a3,3*SIZE(A) MOV t14,t11 MOV t24,t11 - LD a3,3*SIZE(A) - - + LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part @@ -733,22 +735,22 @@ daddu B,BO,TEMP #endif - MTC $0,t11 LD a0,0*SIZE(A) - MOV t21,t11 + MTC $0,t11 LD a1,1*SIZE(A) - - MOV t12,t11 + + MOV t21,t11 LD b0,0*SIZE(B) - MOV t22,t11 + MOV t12,t11 LD b1,1*SIZE(B) - MOV t13,t11 + MOV t22,t11 LD b2,2*SIZE(B) + + MOV t13,t11 MOV t23,t11 LD b3,3*SIZE(B) - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) @@ -1043,20 +1045,26 @@ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else - dsll K,KK, 0 + BASE_SHIFT + dsll K,KK, BASE_SHIFT dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif + + LD 
a0, 0 * SIZE(A) # a0 + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t12,t11 - LD a0, 0 * SIZE(A) # a0 + LD b1,1*SIZE(B) MOV t13,t11 - LD b0,0*SIZE(B) - MOV t14,t11 # clear result registers - LD b1,1*SIZE(B) + LD b2,2*SIZE(B) + + MOV t14,t11 + LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK @@ -1236,7 +1244,7 @@ daddiu TEMP, TEMP, -4 #endif - dsll K,TEMP, 0 + BASE_SHIFT + dsll K,TEMP, BASE_SHIFT dsll TEMP,TEMP, 2 + BASE_SHIFT daddu A,A,K @@ -1291,21 +1299,21 @@ daddu A,A,K daddu B,BO,TEMP #endif - MTC $0,t11 LD a0,0*SIZE(A) - MOV t21,t11 + MTC $0,t11 # gemm part LD a1,1*SIZE(A) - MOV t31,t11 + MOV t21,t11 LD b0,0*SIZE(B) - MOV t41,t11 + MOV t31,t11 LD b1,1*SIZE(B) - MOV t12,t11 + MOV t41,t11 LD a2,2*SIZE(A) - MOV t22,t11 LD a3,3*SIZE(A) - + + MOV t12,t11 + MOV t22,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK @@ -1621,11 +1629,14 @@ daddu A, A, K daddu B, BO, TEMP #endif - MTC $0,t11 LD a0,0*SIZE(A) - MOV t21,t11 LD a1,1*SIZE(A) + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -1830,11 +1841,14 @@ daddu A, A, K daddu B, BO, TEMP #endif - MTC $0,t11 - LD a0, 0*SIZE(A) # a0 + LD a0,0*SIZE(A) + MTC $0,t11 MOV t21,t11 - LD b0,0*SIZE(B) + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK @@ -1844,9 +1858,9 @@ daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 - MOV t12,t11 - beqz K,.L65 MOV t22,t11 + beqz K,.L65 + nop #else dsra K,KCO,2 @@ -2023,13 +2037,18 @@ daddu A, A, K daddu B, BO, TEMP #endif - MTC $0,t11 LD b0, 0*SIZE(B) - MOV t21,t11 + MTC $0,t11 LD a0,0*SIZE(A) - MOV t31,t11 + MOV t21,t11 LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + LD a3,3*SIZE(A) + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) 
&& defined(TRANSA)) dsubu TEMP, KCO, KK @@ -2039,7 +2058,6 @@ daddiu TEMP, KK, 1 #endif dsra K,TEMP,2 - MOV t41,t11 beqz K,.L75 nop #else @@ -2276,10 +2294,11 @@ daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) + MTC $0,t11 - - LD a0,0*SIZE(A) MOV t21,t11 + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK @@ -2443,6 +2462,7 @@ LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MTC $0,t11 + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) diff --git a/param.h b/param.h index 417165652..603caab46 100644 --- a/param.h +++ b/param.h @@ -1480,8 +1480,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -1491,17 +1491,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 108 +#define SGEMM_DEFAULT_P 32 #define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 108 #define ZGEMM_DEFAULT_P 112 -#define SGEMM_DEFAULT_Q 288 +#define SGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_Q 116 #define CGEMM_DEFAULT_Q 144 #define ZGEMM_DEFAULT_Q 72 -#define SGEMM_DEFAULT_R 2000 +#define SGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_R 1000 #define CGEMM_DEFAULT_R 2000 #define ZGEMM_DEFAULT_R 2000 From 3d7e62eb8b17c52922d43c71354754e8cb283c47 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 30 May 2011 12:42:17 +0800 Subject: [PATCH 27/42] Fixed #31 Shared library placement on Mac. Thank Mr.Viral B. Shah for this patch. 
--- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index 24cdc41c8..6e067acbf 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -84,7 +84,7 @@ libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) $(LIBDYNNAME) : ../$(LIBNAME) osx.def - $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) symbol.$(SUFFIX) : symbol.S $(CC) $(CFLAGS) -c -o $(@F) $^ From 31040e4d80caab57dac1fd7f28b7a59815dca43e Mon Sep 17 00:00:00 2001 From: Xianyi Date: Fri, 3 Jun 2011 13:19:54 +0800 Subject: [PATCH 28/42] Fixed #32 a SEGFAULT bug with gcc-4.6. According to the i386 calling convention, the called function should remove the hidden return value address from the stack. --- Changelog.txt | 1 + kernel/x86/zdot_sse2.S | 3 +++ 2 files changed, 4 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 461058279..60798d2a9 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -21,6 +21,7 @@ common: x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. + * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S index 5aeefde31..2a174fb5d 100644 --- a/kernel/x86/zdot_sse2.S +++ b/kernel/x86/zdot_sse2.S @@ -1541,5 +1541,8 @@ popl %ebx popl %esi popl %edi +/*remove the hidden return value address from the stack.*/ + popl %ecx + xchgl %ecx, 0(%esp) ret EPILOGUE From 4335bca2f7265193186771c5b1cc4f6b177c85b8 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Tue, 7 Jun 2011 12:53:25 +0800 Subject: [PATCH 29/42] Fixed #33 ztrmm bug on Nehalem.
--- Changelog.txt | 1 + kernel/x86_64/zgemm_kernel_1x4_nehalem.S | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 60798d2a9..c4e6a8fe2 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -22,6 +22,7 @@ common: x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. + * Fixed #33 ztrmm bug on Nehalem. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. diff --git a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S index e72a19c96..4ddfc488b 100644 --- a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S +++ b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S @@ -544,7 +544,7 @@ jg .L11 #if defined(TRMMKERNEL) && !defined(LEFT) - addq $1, KK + addq $4, KK #endif leaq (C, LDC, 4), C @@ -594,7 +594,7 @@ jg .L11 #if defined(TRMMKERNEL) && !defined(LEFT) - addq $1, KK + addq $4, KK #endif leaq (C, LDC, 4), C From 149638322434c90ba7fafb48312a08d1634144dd Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 9 Jun 2011 10:40:15 +0800 Subject: [PATCH 30/42] Print the wall time (cycles) with enabling FUNCTION_PROFILE. --- Changelog.txt | 1 + driver/others/profile.c | 9 +++++---- interface/create | 0 3 files changed, 6 insertions(+), 4 deletions(-) mode change 100644 => 100755 interface/create diff --git a/Changelog.txt b/Changelog.txt index c4e6a8fe2..cc90ee198 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -18,6 +18,7 @@ common: * Added openblas_set_num_threads for Fortran. * Fixed #25 a wrong result of rotmg. * Fixed a bug about detecting underscore prefix in c_check. + * Print the wall time (cycles) with enabling FUNCTION_PROFILE x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. 
diff --git a/driver/others/profile.c b/driver/others/profile.c index f65550c9f..f464c0b6a 100644 --- a/driver/others/profile.c +++ b/driver/others/profile.c @@ -74,20 +74,21 @@ void gotoblas_profile_quit(void) { if (cycles > 0) { fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n"); - fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle\n"); + fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n"); for (i = 0; i < MAX_PROF_TABLE; i ++) { if (function_profile_table[i].calls) { #ifndef OS_WINDOWS - fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f\n", + fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n", #else - fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f\n", + fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n", #endif func_table[i], function_profile_table[i].calls, (double)function_profile_table[i].cycles / (double)cycles * 100., (double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100., - (double)function_profile_table[i].area / (double)function_profile_table[i].cycles + (double)function_profile_table[i].area / (double)function_profile_table[i].cycles, + function_profile_table[i].cycles ); } } diff --git a/interface/create b/interface/create old mode 100644 new mode 100755 From 8d50a9fd1ae8c04005ef3f40b0e7c6ed764e8ded Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 9 Jun 2011 11:38:59 +0800 Subject: [PATCH 31/42] Fixed #35 a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1. --- Changelog.txt | 1 + Makefile.system | 4 ++++ kernel/Makefile | 8 +++++++- kernel/setparam-ref.c | 16 ++++++++++++++++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index cc90ee198..9089096e5 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -19,6 +19,7 @@ common: * Fixed #25 a wrong result of rotmg. * Fixed a bug about detecting underscore prefix in c_check. 
* Print the wall time (cycles) with enabling FUNCTION_PROFILE + * Fixed #35 a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1 x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. diff --git a/Makefile.system b/Makefile.system index 6fb0ec86f..7686c938b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -515,6 +515,10 @@ ifeq ($(DYNAMIC_ARCH), 1) CCOMMON_OPT += -DDYNAMIC_ARCH endif +ifeq ($(NO_LAPACK), 1) +CCOMMON_OPT += -DNO_LAPACK +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/kernel/Makefile b/kernel/Makefile index 6084cbc3f..a3a32de81 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -53,6 +53,11 @@ SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) CCOMMON_OPT += -DTS=$(TSUFFIX) endif +KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h +ifneq ($(NO_LAPACK), 1) +KERNEL_INTERFACE += ../common_lapack.h +endif + ifeq ($(ARCH), x86) COMMONOBJS += cpuid.$(SUFFIX) endif @@ -88,9 +93,10 @@ setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h setparam$(TSUFFIX).c : setparam-ref.c sed 's/TS/$(TSUFFIX)/g' $< > $(@F) -kernel$(TSUFFIX).h : ../common_level1.h ../common_level2.h ../common_level3.h ../common_lapack.h +kernel$(TSUFFIX).h : $(KERNEL_INTERFACE) sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F) + cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S $(CC) -c $(CFLAGS) $< -o $(@F) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 0ab57f3b3..73df7625a 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -101,7 +101,11 @@ gotoblas_t TABLE_NAME = { #endif ssymm_outcopyTS, ssymm_oltcopyTS, +#ifndef NO_LAPACK sneg_tcopyTS, slaswp_ncopyTS, +#else + NULL,NULL, +#endif 0, 0, 0, DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), @@ -147,7 +151,11 @@ gotoblas_t TABLE_NAME = { #endif dsymm_outcopyTS, dsymm_oltcopyTS, +#ifndef NO_LAPACK dneg_tcopyTS, dlaswp_ncopyTS, +#else + NULL, NULL, +#endif #ifdef EXPRECISION @@ -286,7 +294,11 @@ gotoblas_t TABLE_NAME = { 
chemm3m_oucopyrTS, chemm3m_olcopyrTS, chemm3m_oucopyiTS, chemm3m_olcopyiTS, +#ifndef NO_LAPACK cneg_tcopyTS, claswp_ncopyTS, +#else + NULL, NULL, +#endif 0, 0, 0, ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N), @@ -375,7 +387,11 @@ gotoblas_t TABLE_NAME = { zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, +#ifndef NO_LAPACK zneg_tcopyTS, zlaswp_ncopyTS, +#else + NULL, NULL, +#endif #ifdef EXPRECISION From b3d188774525483cedf6ce1282ac9b9cb806eb67 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 9 Jun 2011 22:59:49 +0800 Subject: [PATCH 32/42] Fixed #35 a build bug with NO_LAPACK=1 DYNAMIC_ARCH=1 FC=gfortran. I forgot to test it with gfortran in last bug fixed commit. --- kernel/setparam-ref.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 73df7625a..d3734bbd9 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -203,7 +203,11 @@ gotoblas_t TABLE_NAME = { #endif qsymm_outcopyTS, qsymm_oltcopyTS, +#ifndef NO_LAPACK qneg_tcopyTS, qlaswp_ncopyTS, +#else + NULL, NULL, +#endif #endif @@ -482,7 +486,11 @@ gotoblas_t TABLE_NAME = { xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, +#ifndef NO_LAPACK xneg_tcopyTS, xlaswp_ncopyTS, +#else + NULL, NULL, +#endif #endif From aeed8d6225501a3ec0eaf82bdbd614ee5d4e336b Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sun, 19 Jun 2011 11:55:29 +0800 Subject: [PATCH 33/42] Fixed #27. Temporarily walk around axpy's low performance issue with small imput size & multithreads. --- Changelog.txt | 1 + interface/axpy.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 9089096e5..2c1bfdf53 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -25,6 +25,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. 
+ * Walk round #27 the low performance axpy issue with small imput size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. diff --git a/interface/axpy.c b/interface/axpy.c index dd75b758c..82b0ee234 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -85,7 +85,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; - + + //Temporarily walk around the low performance issue with small imput size & multithreads. + if (n <= 10000) + nthreads = 1; + if (nthreads == 1) { #endif From 7945919f22ef9eddfcd475621259af3f01f6a09c Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sun, 19 Jun 2011 12:07:31 +0800 Subject: [PATCH 34/42] Updated gitignore file. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index fa15203f1..6cfc5b3c1 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ lapack-3.1.1 lapack-3.1.1.tgz *.so *.a +.svn *~ config.h Makefile.conf From fab36f1adb7aeef5e8e7655d781695764c4f4e6e Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 20 Jun 2011 18:35:35 +0800 Subject: [PATCH 35/42] Fixed #20. Added install target in makefile. You can use "make install PREFIX=your_installation_directory". 
--- Makefile | 63 ++++++++++++++++++++++++++++++++++++++ Makefile.rule | 3 ++ openblas_config_template.h | 21 +++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 openblas_config_template.h diff --git a/Makefile b/Makefile index 77dd3c2e7..3bca1337f 100644 --- a/Makefile +++ b/Makefile @@ -15,6 +15,10 @@ ifdef SANITY_CHECK BLASDIRS += reference endif +ifndef PREFIX +PREFIX = /opt/OpenBLAS +endif + SUBDIRS = $(BLASDIRS) ifneq ($(NO_LAPACK), 1) SUBDIRS += lapack @@ -111,6 +115,7 @@ ifdef DYNAMIC_ARCH do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ done endif + touch lib.grd prof : prof_blas prof_lapack @@ -230,6 +235,63 @@ lapack-test : dummy : +lib.grd : + $(error OpenBLAS: Please run "make" firstly) + +install : lib.grd + @-mkdir -p $(PREFIX) + @echo Generating openblas_config.h in $(PREFIX) +#for inc + @echo \#ifndef OPENBLAS_CONFIG_H > $(PREFIX)/openblas_config.h + @echo \#define OPENBLAS_CONFIG_H >> $(PREFIX)/openblas_config.h + @cat config.h >> $(PREFIX)/openblas_config.h + @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(PREFIX)/openblas_config.h + @cat openblas_config_template.h >> $(PREFIX)/openblas_config.h + @echo \#endif >> $(PREFIX)/openblas_config.h + + @echo Generating f77blas.h in $(PREFIX) + @echo \#ifndef OPENBLAS_F77BLAS_H > $(PREFIX)/f77blas.h + @echo \#define OPENBLAS_F77BLAS_H >> $(PREFIX)/f77blas.h + @echo \#include \"openblas_config.h\" >> $(PREFIX)/f77blas.h + @cat common_interface.h >> $(PREFIX)/f77blas.h + @echo \#endif >> $(PREFIX)/f77blas.h + + @echo Generating cblas.h in $(PREFIX) + @sed 's/common/openblas_config/g' cblas.h > $(PREFIX)/cblas.h + +#for install static library + @echo Copy the static library to $(PREFIX) + @cp $(LIBNAME) $(PREFIX) + @-ln -fs $(PREFIX)/$(LIBNAME) $(PREFIX)/libopenblas.$(LIBSUFFIX) +#for install shared library + @echo Copy the shared library to $(PREFIX) +ifeq ($(OSNAME), Linux) + -cp $(LIBSONAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBSONAME) 
$(PREFIX)/libopenblas.so +endif +ifeq ($(OSNAME), FreeBSD) + -cp $(LIBSONAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so +endif +ifeq ($(OSNAME), NetBSD) + -cp $(LIBSONAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so +endif +ifeq ($(OSNAME), Darwin) + -cp $(LIBDYNNAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBDYNNAME) $(PREFIX)/libopenblas.dylib +endif +ifeq ($(OSNAME), WINNT) + -cp $(LIBDLLNAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll +endif +ifeq ($(OSNAME), CYGWIN_NT) + -cp $(LIBDLLNAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll +endif + + @echo Install OK! + clean :: @for d in $(SUBDIRS_ALL) ; \ do if test -d $$d; then \ @@ -245,4 +307,5 @@ endif echo deleting lapack-3.1.1; \ rm -rf lapack-3.1.1 ;\ fi + @rm -f *.grd @echo Done. \ No newline at end of file diff --git a/Makefile.rule b/Makefile.rule index 61f9eb91d..88d552495 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -91,6 +91,9 @@ VERSION = 0.1alpha2 # SANITY_CHECK to compare the result with reference BLAS. # UTEST_CHECK = 1 +# The installation directory. +# PREFIX = /opt/OpenBLAS + # Common Optimization Flag; -O2 is enough. # DEBUG = 1 diff --git a/openblas_config_template.h b/openblas_config_template.h new file mode 100644 index 000000000..9fb80aa4f --- /dev/null +++ b/openblas_config_template.h @@ -0,0 +1,21 @@ +/*This is only for "make install" target.*/ + +#ifdef NEEDBUNDERSCORE +#define BLASFUNC(FUNC) FUNC##_ +#else +#define BLASFUNC(FUNC) FUNC +#endif + +#if defined(OS_WINDOWS) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef USE64BITINT +typedef BLASLONG blasint; +#else +typedef int blasint; +#endif From d978436c4bcc45404188d8b58618f44efd52728d Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 20 Jun 2011 18:36:29 +0800 Subject: [PATCH 36/42] Refs #20. Updated the docs. 
--- Changelog.txt | 1 + README | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index c4e6a8fe2..aadf00d37 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -18,6 +18,7 @@ common: * Added openblas_set_num_threads for Fortran. * Fixed #25 a wrong result of rotmg. * Fixed a bug about detecting underscore prefix in c_check. + * Added install target. You can use "make install". (Refs #20) x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. diff --git a/README b/README index c18b6c502..21e740689 100644 --- a/README +++ b/README @@ -22,6 +22,11 @@ make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-g 3)Debug version make DEBUG=1 +4)Intall to the directory (Optional) +e.g. +make install PREFIX=your_installation_directory +The default directory is /opt/OpenBLAS + 3.Support CPU & OS Please read GotoBLAS_01Readme.txt From 32353a9d3085c9de7b95342e4be5e4c816ee8593 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 21 Jun 2011 17:39:08 +0800 Subject: [PATCH 37/42] Refs #20. Fixed the installation bug with DYNAMIC_ARCH=1. 
--- Makefile | 72 ++++++++---------------------------------------- Makefile.install | 65 +++++++++++++++++++++++++++++++++++++++++++ kernel/Makefile | 4 +-- 3 files changed, 78 insertions(+), 63 deletions(-) create mode 100644 Makefile.install diff --git a/Makefile b/Makefile index 3bca1337f..c480fc47d 100644 --- a/Makefile +++ b/Makefile @@ -26,8 +26,8 @@ endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench -.PHONY : all libs netlib test ctest shared -.NOTPARALLEL : all libs prof lapack-test +.PHONY : all libs netlib test ctest shared install +.NOTPARALLEL : all libs prof lapack-test install all :: libs netlib tests shared @echo @@ -109,11 +109,15 @@ endif $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done +#Save the config files for installation + cp Makefile.conf Makefile.conf_last + cp config.h config_last.h ifdef DYNAMIC_ARCH $(MAKE) -C kernel commonlibs || exit 1 for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ done + echo DYNAMIC_ARCH=1 >> Makefile.conf_last endif touch lib.grd @@ -235,62 +239,8 @@ lapack-test : dummy : -lib.grd : - $(error OpenBLAS: Please run "make" firstly) - -install : lib.grd - @-mkdir -p $(PREFIX) - @echo Generating openblas_config.h in $(PREFIX) -#for inc - @echo \#ifndef OPENBLAS_CONFIG_H > $(PREFIX)/openblas_config.h - @echo \#define OPENBLAS_CONFIG_H >> $(PREFIX)/openblas_config.h - @cat config.h >> $(PREFIX)/openblas_config.h - @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(PREFIX)/openblas_config.h - @cat openblas_config_template.h >> $(PREFIX)/openblas_config.h - @echo \#endif >> $(PREFIX)/openblas_config.h - - @echo Generating f77blas.h in $(PREFIX) - @echo \#ifndef OPENBLAS_F77BLAS_H > $(PREFIX)/f77blas.h - @echo \#define OPENBLAS_F77BLAS_H >> $(PREFIX)/f77blas.h - @echo \#include \"openblas_config.h\" >> $(PREFIX)/f77blas.h - @cat common_interface.h >> $(PREFIX)/f77blas.h - @echo \#endif >> $(PREFIX)/f77blas.h - - @echo Generating cblas.h 
in $(PREFIX) - @sed 's/common/openblas_config/g' cblas.h > $(PREFIX)/cblas.h - -#for install static library - @echo Copy the static library to $(PREFIX) - @cp $(LIBNAME) $(PREFIX) - @-ln -fs $(PREFIX)/$(LIBNAME) $(PREFIX)/libopenblas.$(LIBSUFFIX) -#for install shared library - @echo Copy the shared library to $(PREFIX) -ifeq ($(OSNAME), Linux) - -cp $(LIBSONAME) $(PREFIX) - -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so -endif -ifeq ($(OSNAME), FreeBSD) - -cp $(LIBSONAME) $(PREFIX) - -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so -endif -ifeq ($(OSNAME), NetBSD) - -cp $(LIBSONAME) $(PREFIX) - -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so -endif -ifeq ($(OSNAME), Darwin) - -cp $(LIBDYNNAME) $(PREFIX) - -ln -fs $(PREFIX)/$(LIBDYNNAME) $(PREFIX)/libopenblas.dylib -endif -ifeq ($(OSNAME), WINNT) - -cp $(LIBDLLNAME) $(PREFIX) - -ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll -endif -ifeq ($(OSNAME), CYGWIN_NT) - -cp $(LIBDLLNAME) $(PREFIX) - -ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll -endif - - @echo Install OK! +install : + $(MAKE) -f Makefile.install install clean :: @for d in $(SUBDIRS_ALL) ; \ @@ -298,14 +248,14 @@ clean :: $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done -ifdef DYNAMIC_ARCH +#ifdef DYNAMIC_ARCH @$(MAKE) -C kernel clean -endif +#endif @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @if test -d lapack-3.1.1; then \ echo deleting lapack-3.1.1; \ rm -rf lapack-3.1.1 ;\ fi - @rm -f *.grd + @rm -f *.grd Makefile.conf_last config_last.h @echo Done. \ No newline at end of file diff --git a/Makefile.install b/Makefile.install new file mode 100644 index 000000000..80dafc9c6 --- /dev/null +++ b/Makefile.install @@ -0,0 +1,65 @@ +TOPDIR = . 
+export GOTOBLAS_MAKEFILE = 1 +-include $(TOPDIR)/Makefile.conf_last +include ./Makefile.system + +.PHONY : install +.NOTPARALLEL : install + +lib.grd : + $(error OpenBLAS: Please run "make" firstly) + +install : lib.grd + @-mkdir -p $(PREFIX) + @echo Generating openblas_config.h in $(PREFIX) +#for inc + @echo \#ifndef OPENBLAS_CONFIG_H > $(PREFIX)/openblas_config.h + @echo \#define OPENBLAS_CONFIG_H >> $(PREFIX)/openblas_config.h + @cat config_last.h >> $(PREFIX)/openblas_config.h + @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(PREFIX)/openblas_config.h + @cat openblas_config_template.h >> $(PREFIX)/openblas_config.h + @echo \#endif >> $(PREFIX)/openblas_config.h + + @echo Generating f77blas.h in $(PREFIX) + @echo \#ifndef OPENBLAS_F77BLAS_H > $(PREFIX)/f77blas.h + @echo \#define OPENBLAS_F77BLAS_H >> $(PREFIX)/f77blas.h + @echo \#include \"openblas_config.h\" >> $(PREFIX)/f77blas.h + @cat common_interface.h >> $(PREFIX)/f77blas.h + @echo \#endif >> $(PREFIX)/f77blas.h + + @echo Generating cblas.h in $(PREFIX) + @sed 's/common/openblas_config/g' cblas.h > $(PREFIX)/cblas.h + +#for install static library + @echo Copy the static library to $(PREFIX) + @cp $(LIBNAME) $(PREFIX) + @-ln -fs $(PREFIX)/$(LIBNAME) $(PREFIX)/libopenblas.$(LIBSUFFIX) +#for install shared library + @echo Copy the shared library to $(PREFIX) +ifeq ($(OSNAME), Linux) + -cp $(LIBSONAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so +endif +ifeq ($(OSNAME), FreeBSD) + -cp $(LIBSONAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so +endif +ifeq ($(OSNAME), NetBSD) + -cp $(LIBSONAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so +endif +ifeq ($(OSNAME), Darwin) + -cp $(LIBDYNNAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBDYNNAME) $(PREFIX)/libopenblas.dylib +endif +ifeq ($(OSNAME), WINNT) + -cp $(LIBDLLNAME) $(PREFIX) + -ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll +endif +ifeq ($(OSNAME), CYGWIN_NT) + -cp $(LIBDLLNAME) 
$(PREFIX) + -ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll +endif + + @echo Install OK! + diff --git a/kernel/Makefile b/kernel/Makefile index a3a32de81..aed145b60 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -118,10 +118,10 @@ lsame.$(PSUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S $(CC) -c $(PFLAGS) $< -o $(@F) -ifdef DYNAMIC_ARCH +#ifdef DYNAMIC_ARCH clean :: @rm -f setparam_*.c kernel_*.h setparam.h kernel.h -endif +#endif include $(TOPDIR)/Makefile.tail From e568df0daee58498cd8f37cde1a1534e1a0698cf Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 21 Jun 2011 18:06:13 +0800 Subject: [PATCH 38/42] Refs #38. Prepare the docs with v0.1alpha2. --- Changelog.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index fb149ca7a..f94c164c7 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,7 +1,7 @@ OpenBLAS ChangeLog ==================================================================== -Version 0.1 alpha2(in development) -0;136;0c +Version 0.1 alpha2(Preparing to release) + common: * Fixed blasint undefined bug in file. Other software could include this header successfully(Refs issue #13 on github) @@ -31,6 +31,8 @@ x86/x86_64: MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. + * Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2) + * Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3) ==================================================================== Version 0.1 alpha1 From 82f5274828a1c12137651375cff248cbc6ed4160 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 22 Jun 2011 01:52:20 +0800 Subject: [PATCH 39/42] Refs #39. It's unnecessary to include sys/mman.h file in blas_server_omp.c. 
--- driver/others/blas_server_omp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 17d886e52..4fd4cd440 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -38,7 +38,7 @@ #include #include -#include <sys/mman.h> +//#include <sys/mman.h> #include "common.h" #ifndef USE_OPENMP From 078bfd0b4fa33e1b651366a475050fb1b8f5cb4c Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 22 Jun 2011 13:19:39 +0800 Subject: [PATCH 40/42] Refs #39. Moved the shared lib (dll) to top directory in MingW64 compiler environment. --- Makefile | 2 +- exports/Makefile | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index c480fc47d..798c56192 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ ifeq ($(OSNAME), Darwin) endif ifeq ($(OSNAME), WINNT) $(MAKE) -C exports dll -# -ln -fs $(LIBDLLNAME) libopenblas.dll + -ln -fs $(LIBDLLNAME) libopenblas.dll endif ifeq ($(OSNAME), CYGWIN_NT) $(MAKE) -C exports dll diff --git a/exports/Makefile b/exports/Makefile index 6e067acbf..f4c9314f9 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -53,18 +53,19 @@ dyn : $(LIBDYNNAME) zip : dll zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME) -dll : libgoto2.dll +dll : ../$(LIBDLLNAME) +#libgoto2.dll dll2 : libgoto2_shared.dll -libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) +../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) - $(DLLWRAP) -o $(@F) --def libgoto2.def \ + $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) -lib /machine:i386 /def:libgoto2.def else - $(DLLWRAP) -o $(@F) --def libgoto2.def \ + $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ --entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) -lib /machine:X64 /def:libgoto2.def endif From
859b71645a75e7d02a17a6f09a342ff495068435 Mon Sep 17 00:00:00 2001 From: traits Date: Thu, 23 Jun 2011 15:09:34 +0800 Subject: [PATCH 41/42] Refs #37. Updated README about the compatibility issue with EKOPath compiler. --- README | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README b/README index 21e740689..9a7b16326 100644 --- a/README +++ b/README @@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve 9.Known Issues * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. +* This library is not compatible with EKOPath Compiler Suite 4.0.10 (http://www.pathscale.com/ekopath-compiler-suite). However, Path64 (https://github.com/path64/compiler) could compile the codes successfully. 10. Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). @@ -79,4 +80,4 @@ Now, there are 4 branches in github.com. * The master branch. This a main branch to reflect a production-ready state. * The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release. * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. - * The gh-pages branch. This is for web pages \ No newline at end of file + * The gh-pages branch. This is for web pages From 6a0762949d703d19266331ecd5d0d1968526af70 Mon Sep 17 00:00:00 2001 From: traits Date: Thu, 23 Jun 2011 15:16:24 +0800 Subject: [PATCH 42/42] Fixed #38. Released v0.1 alpha2.
--- Changelog.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index f94c164c7..b54949ec5 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,6 +1,7 @@ OpenBLAS ChangeLog ==================================================================== -Version 0.1 alpha2(Preparing to release) +Version 0.1 alpha2 +23-Jun-2011 common: * Fixed blasint undefined bug in file. Other software