Change the prefetch length of A and B; the performance is now 2.1G.

This commit is contained in:
traz 2011-06-23 10:46:58 +00:00
parent 1c96d345e2
commit 14f81da375
1 changed file with 207 additions and 166 deletions

View File

@ -6,6 +6,7 @@
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define STACKSIZE 160 #define STACKSIZE 160
#define M $4 #define M $4
#define N $5 #define N $5
@ -109,12 +110,18 @@
#define ALPHA_R $f15 #define ALPHA_R $f15
#define ALPHA_I $f16 #define ALPHA_I $f16
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #################################
## MADD1 a*c
## MADD2 b*c
## MADD3 a*d
## MADD4 d*b
##################################
####if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1 MADD #define MADD1 MADD
#define MADD2 MADD #define MADD2 MADD
#define MADD3 MADD #define MADD3 MADD
#define MADD4 NMSUB #define MADD4 NMSUB
#endif ###endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1 MADD #define MADD1 MADD
@ -166,25 +173,28 @@
sdc1 $f23,112($sp) sdc1 $f23,112($sp)
#endif #endif
dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE dsra J, N, 1 # J=N/2
ST ALPHA_R, 128($sp) # store alpha_r & alpha_i ST ALPHA_R, 128($sp) # store alpha_r & alpha_i
dsra J, N, 1 # J=N/2 dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE
blez J, .L20
ST ALPHA_I, 136($sp) ST ALPHA_I, 136($sp)
dsll PREB, K, 1+ZBASE_SHIFT # PREB=K*2*2^4
blez J, .L20
dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4
.align 5 .align 5
.L10: .L10:
daddiu J, J, -1 daddiu J, J, -1
move CO1, C # Fix pointer Cx
daddu CO2, C, LDC
move AO, A # Reset AO
dsra I, M, 1 # I=M/2 dsra I, M, 1 # I=M/2
dsll PREB, K, 1 + ZBASE_SHIFT # PREB=K*2*2^4
dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
move CO1, C # Fix pointer Cx
daddu CO2, C, LDC
move AO, A # Reset AO
daddu PREB, PREB, B # PREB=B+panel size
blez I, .L30 blez I, .L30
daddu PREA, PREA, A # PREA=A+panel size daddu PREA, PREA, A # PREA=A+panel size
@ -192,41 +202,32 @@
dsra L, K, 2 # Unroll K 4 times dsra L, K, 2 # Unroll K 4 times
move BO, B move BO, B
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs MTC $0, c11 # Clear results regs
MOV c12, c11 MOV c12, c11
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11 MOV c13, c11
MOV c14, c11 MOV c14, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
MOV c21, c11 MOV c21, c11
MOV c22, c11 MOV c22, c11
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MOV c23, c11 MOV c23, c11
MOV c24, c11 MOV c24, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
FETCH $0, 0 * SIZE(PREA) # LOAD 32 Byte 4 double
daddu PREB, PREB, B # PREB=B+panel size
FETCH $0, 0 * SIZE(CO1)
MOV c31, c11 MOV c31, c11
MOV c32, c11 MOV c32, c11
FETCH $0, 0 * SIZE(CO2)
MOV c33, c11 MOV c33, c11
MOV c34, c11 MOV c34, c11
FETCH $0, 0 * SIZE(PREB)
MOV c41, c11 MOV c41, c11
FETCH $0, 4 * SIZE(CO1)
MOV c42, c11 MOV c42, c11
MOV c43, c11
FETCH $0, 4 * SIZE(CO2) MOV c43, c11
blez L, .L15 blez L, .L15
MOV c44, c11 MOV c44, c11
@ -234,26 +235,26 @@
.L12: .L12:
gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R12, F9, F8, 2) # Unroll K=1
gsLQC1(R13, F13, F12, 2)
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
gsLQC1(R13, F13, F12, 2) gsLQC1(R12, F11, F10, 3)
gsLQC1(R13, F16, F15, 3)
MADD2 c12, c12, a2, b1 # bxc MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd MADD4 c14, c14, a2, b2 # bxd
gsLQC1(R12, F11, F10, 3)
MADD1 c21, c21, a3, b1 # A2xB1 MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2 MADD3 c23, c23, a3, b2
gsLQC1(R13, F16, F15, 3)
MADD2 c22, c22, a4, b1 MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2 MADD4 c24, c24, a4, b2
FETCH $0, 4 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA)
FETCH $0, 4 * SIZE(PREB)
MADD1 c31, c31, a1, b3 # A1xB2 MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4 MADD3 c33, c33, a1, b4
FETCH $0, 4 * SIZE(PREB)
MADD2 c32, c32, a2, b3 MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4 MADD4 c34, c34, a2, b4
@ -262,27 +263,27 @@
MADD2 c42, c42, a4, b3 MADD2 c42, c42, a4, b3
MADD4 c44, c44, a4, b4 MADD4 c44, c44, a4, b4
gsLQC1(R12, F1, F0, 4) # Unroll K=2 gsLQC1(R12, F1, F0, 4) # unroll k=2
gsLQC1(R13, F5, F4, 4)
MADD1 c11, c11, a5, b5 # axc A1xB1 MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd MADD3 c13, c13, a5, b6 # axd
gsLQC1(R13, F5, F4, 4)
MADD2 c12, c12, a6, b5 # bxc MADD2 c12, c12, a6, b5 # bxc
MADD4 c14, c14, a6, b6 # bxd MADD4 c14, c14, a6, b6 # bxd
gsLQC1(R12, F3, F2, 5) gsLQC1(R12, F3, F2, 5)
gsLQC1(R13, F7, F6, 5)
MADD1 c21, c21, a7, b5 # A2xB1 MADD1 c21, c21, a7, b5 # A2xB1
MADD3 c23, c23, a7, b6 MADD3 c23, c23, a7, b6
gsLQC1(R13, F7, F6, 5)
MADD2 c22, c22, a8, b5 MADD2 c22, c22, a8, b5
MADD4 c24, c24, a8, b6 MADD4 c24, c24, a8, b6
FETCH $0, 8 * SIZE(PREA) FETCH $0, 8 * SIZE(PREA)
FETCH $0, 8 * SIZE(PREB)
MADD1 c31, c31, a5, b7 # A1xB2 MADD1 c31, c31, a5, b7 # A1xB2
MADD3 c33, c33, a5, b8 MADD3 c33, c33, a5, b8
FETCH $0, 8 * SIZE(PREB)
MADD2 c32, c32, a6, b7 MADD2 c32, c32, a6, b7
MADD4 c34, c34, a6, b8 MADD4 c34, c34, a6, b8
@ -292,61 +293,61 @@
MADD4 c44, c44, a8, b8 MADD4 c44, c44, a8, b8
gsLQC1(R12, F9, F8, 6) # Unroll K=3 gsLQC1(R12, F9, F8, 6) # Unroll K=3
gsLQC1(R13, F13, F12, 6)
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
daddiu L, L, -1
gsLQC1(R13, F13, F12, 6) gsLQC1(R13, F16, F15, 7)
gsLQC1(R12, F11, F10, 7)
MADD2 c12, c12, a2, b1 # bxc MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd MADD4 c14, c14, a2, b2 # bxd
gsLQC1(R12, F11, F10, 7)
MADD1 c21, c21, a3, b1 # A2xB1 MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2 MADD3 c23, c23, a3, b2
daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx
daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx
gsLQC1(R13, F16, F15, 7)
MADD2 c22, c22, a4, b1 MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2 MADD4 c24, c24, a4, b2
daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx
FETCH $0, 12 * SIZE(PREA) FETCH $0, 12 * SIZE(PREA)
MADD1 c31, c31, a1, b3 # A1xB2 MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4 MADD3 c33, c33, a1, b4
daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx daddiu L, L, -1
FETCH $0, 12 * SIZE(PREB) FETCH $0, 12 * SIZE(PREB)
MADD2 c32, c32, a2, b3 MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4 MADD4 c34, c34, a2, b4
daddu PREA, PREA, 16 * SIZE
MADD1 c41, c41, a3, b3 # A2xB2 MADD1 c41, c41, a3, b3 # A2xB2
MADD3 c43, c43, a3, b4 MADD3 c43, c43, a3, b4
daddu PREA, PREA, 16 * SIZE
daddu PREB, PREB, 16 * SIZE daddu PREB, PREB, 16 * SIZE
MADD2 c42, c42, a4, b3 MADD2 c42, c42, a4, b3
MADD4 c44, c44, a4, b4 MADD4 c44, c44, a4, b4
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MADD1 c11, c11, a5, b5 # axc A1xB1 MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd MADD3 c13, c13, a5, b6 # axd
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD2 c12, c12, a6, b5 # bxc MADD2 c12, c12, a6, b5 # bxc
MADD4 c14, c14, a6, b6 # bxd MADD4 c14, c14, a6, b6 # bxd
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
MADD1 c21, c21, a7, b5 # A2xB1 MADD1 c21, c21, a7, b5 # A2xB1
MADD3 c23, c23, a7, b6 MADD3 c23, c23, a7, b6
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD2 c22, c22, a8, b5 MADD2 c22, c22, a8, b5
MADD4 c24, c24, a8, b6 MADD4 c24, c24, a8, b6
FETCH $0, 0 * SIZE(PREA) FETCH $0, 0 * SIZE(PREA)
FETCH $0, 0 * SIZE(PREB)
MADD1 c31, c31, a5, b7 # A1xB2 MADD1 c31, c31, a5, b7 # A1xB2
MADD3 c33, c33, a5, b8 MADD3 c33, c33, a5, b8
FETCH $0, 0 * SIZE(PREB)
MADD2 c32, c32, a6, b7 MADD2 c32, c32, a6, b7
MADD4 c34, c34, a6, b8 MADD4 c34, c34, a6, b8
@ -362,46 +363,52 @@
.L15: .L15:
andi L, K, 3 andi L, K, 3
LD ALPHA_R, 128($sp) LD ALPHA_R, 128($sp)
NOP
blez L, .L18 blez L, .L18
LD ALPHA_I, 136($sp) LD ALPHA_I, 136($sp)
.align 5 .align 5
.L16: .L16:
daddiu L, L, -1 daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
daddiu PREA, PREA, 4 * SIZE
daddiu PREB, PREB, 4 * SIZE
MADD2 c12, c12, a2, b1 # bxc MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd MADD4 c14, c14, a2, b2 # bxd
MADD1 c21, c21, a3, b1 # A2xB1 MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2 MADD3 c23, c23, a3, b2
MADD2 c22, c22, a4, b1 MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2 MADD4 c24, c24, a4, b2
FETCH $0, 0 * SIZE(PREA)
MADD1 c31, c31, a1, b3 # A1xB2 MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4 MADD3 c33, c33, a1, b4
daddiu L, L, -1
MADD2 c32, c32, a2, b3 MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4 MADD4 c34, c34, a2, b4
FETCH $0, 0 * SIZE(PREB)
MADD1 c41, c41, a3, b3 # A2xB2 MADD1 c41, c41, a3, b3 # A2xB2
MADD3 c43, c43, a3, b4 MADD3 c43, c43, a3, b4
MADD2 c42, c42, a4, b3 MADD2 c42, c42, a4, b3
MADD4 c44, c44, a4, b4 MADD4 c44, c44, a4, b4
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
bgtz L, .L16 bgtz L, .L16
NOP NOP
.L18: .L18:
ADD c11, c14, c11 ADD c11, c14, c11
LD a1, 0 * SIZE(CO1) LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12 ADD c12, c13, c12
@ -426,170 +433,196 @@
MADD b1, b1, ALPHA_R, c21 MADD b1, b1, ALPHA_R, c21
MADD b2, b2, ALPHA_R, c22 MADD b2, b2, ALPHA_R, c22
MADD a3, a3, ALPHA_R, c31
MADD a4, a4, ALPHA_R, c32
MADD b3, b3, ALPHA_R, c41
MADD b4, b4, ALPHA_R, c42
NMSUB a1, a1, ALPHA_I, c12 NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11 MADD a2, a2, ALPHA_I, c11
NMSUB b1, b1, ALPHA_I, c22 NMSUB b1, b1, ALPHA_I, c22
MADD b2, b2, ALPHA_I, c21 MADD b2, b2, ALPHA_I, c21
MADD a3, a3, ALPHA_R, c31
MADD a4, a4, ALPHA_R, c32
ST a1, 0 * SIZE(CO1) ST a1, 0 * SIZE(CO1)
MADD b3, b3, ALPHA_R, c41
MADD b4, b4, ALPHA_R, c42
ST a2, 1 * SIZE(CO1)
NMSUB a3, a3, ALPHA_I, c32 NMSUB a3, a3, ALPHA_I, c32
MADD a4, a4, ALPHA_I, c31 MADD a4, a4, ALPHA_I, c31
ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1)
NMSUB b3, b3, ALPHA_I, c42 NMSUB b3, b3, ALPHA_I, c42
MADD b4, b4, ALPHA_I, c41 MADD b4, b4, ALPHA_I, c41
ST b1, 2 * SIZE(CO1)
ST b2, 3 * SIZE(CO1) ST b2, 3 * SIZE(CO1)
ST a3, 0 * SIZE(CO2) ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2) ST a4, 1 * SIZE(CO2)
ST b3, 2 * SIZE(CO2) ST b3, 2 * SIZE(CO2)
ST b4, 3 * SIZE(CO2) ST b4, 3 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO2)
FETCH $0, 8 * SIZE(CO1)
FETCH $0, 12 * SIZE(CO2)
FETCH $0, 12 * SIZE(CO1)
FETCH $0, 16 * SIZE(CO2)
FETCH $0, 16 * SIZE(CO1)
daddiu CO1,CO1, 4 * SIZE daddiu CO1,CO1, 4 * SIZE
bgtz I, .L11 bgtz I, .L11
daddiu CO2,CO2, 4 * SIZE daddiu CO2,CO2, 4 * SIZE
.L30: .L30:
andi I, M, 1 andi I, M, 1
daddu C, C, LDC # Change C to next panel daddu C, C, LDC # Change C to next panel
daddu PREB, PREB, B # PREB=B+panel size
blez I, .L19 blez I, .L19
daddu C, C, LDC # Change C to next panel daddu C, C, LDC # Change C to next panel
dsra L, K, 2 # Unroll K 4 times dsra L, K, 2 # Unroll K 4 times
move BO, B move BO, B
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs MTC $0, c11 # Clear results regs
MOV c12, c11 MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11 MOV c13, c11
MOV c14, c11 MOV c14, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MOV c31, c11 MOV c31, c11
MOV c32, c11 MOV c32, c11
FETCH $0, 0 * SIZE(PREB)
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO1)
FETCH $0, 4 * SIZE(CO2)
MOV c33, c11 MOV c33, c11
blez L, .L35
MOV c34, c11 MOV c34, c11
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 .align 5
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
blez L, .L35
NOP
.align 3
.L32: .L32:
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F13, F12, 2) gsLQC1(R13, F13, F12, 2)
gsLQC1(R13, F16, F15, 3)
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
gsLQC1(R13, F16, F15, 3)
MADD2 c12, c12, a2, b1 # bxc MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd MADD4 c14, c14, a2, b2 # bxd
NOP
MADD1 c31, c31, a1, b3 # A1xB2 MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4 MADD3 c33, c33, a1, b4
FETCH $0, 4 * SIZE(PREB)
MADD2 c32, c32, a2, b3 MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4 MADD4 c34, c34, a2, b4
NOP
gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R12, F9, F8, 2) # Unroll K=1
gsLQC1(R13, F5, F4, 4) gsLQC1(R13, F5, F4, 4)
gsLQC1(R13, F7, F6, 5)
MADD1 c11, c11, a3, b5 # axc A1xB1 MADD1 c11, c11, a3, b5 # axc A1xB1
MADD3 c13, c13, a3, b6 # axd MADD3 c13, c13, a3, b6 # axd
gsLQC1(R13, F7, F6, 5)
MADD2 c12, c12, a4, b5 # bxc MADD2 c12, c12, a4, b5 # bxc
MADD4 c14, c14, a4, b6 # bxd MADD4 c14, c14, a4, b6 # bxd
NOP
MADD1 c31, c31, a3, b7 # A1xB2 MADD1 c31, c31, a3, b7 # A1xB2
MADD3 c33, c33, a3, b8 MADD3 c33, c33, a3, b8
FETCH $0, 8 * SIZE(PREB)
MADD2 c32, c32, a4, b7 MADD2 c32, c32, a4, b7
MADD4 c34, c34, a4, b8 MADD4 c34, c34, a4, b8
daddiu L, L, -1 daddiu L, L, -1
gsLQC1(R12, F11, F10, 3) gsLQC1(R12, F11, F10, 3)
gsLQC1(R13, F13, F12, 6) gsLQC1(R13, F13, F12, 6)
gsLQC1(R13, F16, F15, 7)
MADD1 c11, c11, a5, b1 # axc A1xB1 MADD1 c11, c11, a5, b1 # axc A1xB1
MADD3 c13, c13, a5, b2 # axd MADD3 c13, c13, a5, b2 # axd
gsLQC1(R13, F16, F15, 7)
MADD2 c12, c12, a6, b1 # bxc MADD2 c12, c12, a6, b1 # bxc
MADD4 c14, c14, a6, b2 # bxd MADD4 c14, c14, a6, b2 # bxd
daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx
daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx
MADD1 c31, c31, a5, b3 # A1xB2 MADD1 c31, c31, a5, b3 # A1xB2
MADD3 c33, c33, a5, b4 MADD3 c33, c33, a5, b4
FETCH $0, 12 * SIZE(PREB)
MADD2 c32, c32, a6, b3 MADD2 c32, c32, a6, b3
MADD4 c34, c34, a6, b4 MADD4 c34, c34, a6, b4
daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD1 c11, c11, a7, b5 # axc A1xB1 MADD1 c11, c11, a7, b5 # axc A1xB1
MADD3 c13, c13, a7, b6 # axd MADD3 c13, c13, a7, b6 # axd
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD2 c12, c12, a8, b5 # bxc MADD2 c12, c12, a8, b5 # bxc
MADD4 c14, c14, a8, b6 # bxd MADD4 c14, c14, a8, b6 # bxd
daddiu PREB, PREB, 16 * SIZE
MADD1 c31, c31, a7, b7 # A1xB2 MADD1 c31, c31, a7, b7 # A1xB2
MADD3 c33, c33, a7, b8 MADD3 c33, c33, a7, b8
FETCH $0, 0 * SIZE(PREB)
MADD2 c32, c32, a8, b7 MADD2 c32, c32, a8, b7
bgtz L, .L32
MADD4 c34, c34, a8, b8 MADD4 c34, c34, a8, b8
bgtz L, .L32
NOP
.align 3
.L35: .L35:
andi L, K, 3 andi L, K, 3
LD ALPHA_R, 128($sp) LD ALPHA_R, 128($sp)
LD ALPHA_I, 136($sp)
blez L, .L38
NOP NOP
.align 3 blez L, .L38
LD ALPHA_I, 136($sp)
.align 5
.L36: .L36:
daddiu L, L, -1
daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx
daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx
daddiu L, L, -1
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
MADD2 c12, c12, a2, b1 # bxc MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd MADD4 c14, c14, a2, b2 # bxd
daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx
MADD1 c31, c31, a1, b3 # A1xB2 MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4 MADD3 c33, c33, a1, b4
daddiu PREB, PREB, 4 * SIZE
MADD2 c32, c32, a2, b3 MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4 MADD4 c34, c34, a2, b4
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 NOP
bgtz L, .L36 bgtz L, .L36
NOP gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
.L38: .L38:
ADD c11, c14, c11 ADD c11, c14, c11
ADD c12, c13, c12
ADD c31, c34, c31
ADD c32, c33, c32
LD a1, 0 * SIZE(CO1) LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12
LD a2, 1 * SIZE(CO1) LD a2, 1 * SIZE(CO1)
ADD c31, c34, c31
LD a3, 0 * SIZE(CO2) LD a3, 0 * SIZE(CO2)
ADD c32, c33, c32
LD a4, 1 * SIZE(CO2) LD a4, 1 * SIZE(CO2)
MADD a1, a1, ALPHA_R, c11 MADD a1, a1, ALPHA_R, c11
@ -613,43 +646,48 @@
daddiu CO1,CO1, 2 * SIZE daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE daddiu CO2,CO2, 2 * SIZE
.align 3 .align 5
.L19: .L19:
bgtz J, .L10 bgtz J, .L10
move B, BO move B, BO
.align 3 .align 5
.L20: .L20:
andi J, N, 1 andi J, N, 1
blez J, .L999 blez J, .L999
NOP dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4
move CO1, C
move AO, A # Reset AO
dsra I, M, 1 # I=M/2 dsra I, M, 1 # I=M/2
move CO1, C
move AO, A # Reset AO
blez I, .L29 blez I, .L29
NOP daddu PREA, PREA, A
.L21: .L21:
dsra L, K, 2 # Unroll K 4 times dsra L, K, 2 # Unroll K 4 times
move BO, B move BO, B
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs MTC $0, c11 # Clear results regs
MOV c12, c11 MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11 MOV c13, c11
MOV c14, c11 MOV c14, c11
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
MOV c21, c11 MOV c21, c11
MOV c22, c11 MOV c22, c11
FETCH $0, 0 * SIZE(PREA)
MOV c23, c11 MOV c23, c11
MOV c24, c11 MOV c24, c11
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 FETCH $0, 0 * SIZE(CO1)
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 FETCH $0, 4 * SIZE(CO1)
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
blez L, .L25 blez L, .L25
NOP NOP
@ -658,110 +696,116 @@
.L22: .L22:
gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R12, F9, F8, 2) # Unroll K=1
gsLQC1(R12, F11, F10, 3)
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD2 c12, c12, a2, b1 # bxc MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd MADD4 c14, c14, a2, b2 # bxd
gsLQC1(R12, F11, F10, 3)
MADD1 c21, c21, a3, b1 # A2xB1 MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2 MADD3 c23, c23, a3, b2
FETCH $0, 4 * SIZE(PREA)
MADD2 c22, c22, a4, b1 MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2 MADD4 c24, c24, a4, b2
gsLQC1(R12, F1, F0, 4) # Unroll K=2 gsLQC1(R12, F1, F0, 4) # Unroll K=2
gsLQC1(R12, F3, F2, 5)
gsLQC1(R13, F13, F12, 2)
MADD1 c11, c11, a5, b3 # axc A1xB1 MADD1 c11, c11, a5, b3 # axc A1xB1
MADD3 c13, c13, a5, b4 # axd MADD3 c13, c13, a5, b4 # axd
gsLQC1(R13, F13, F12, 2)
MADD2 c12, c12, a6, b3 # bxc MADD2 c12, c12, a6, b3 # bxc
MADD4 c14, c14, a6, b4 # bxd MADD4 c14, c14, a6, b4 # bxd
gsLQC1(R12, F3, F2, 5)
MADD1 c21, c21, a7, b3 # A2xB1 MADD1 c21, c21, a7, b3 # A2xB1
MADD3 c23, c23, a7, b4 MADD3 c23, c23, a7, b4
FETCH $0, 8 * SIZE(PREA)
MADD2 c22, c22, a8, b3 MADD2 c22, c22, a8, b3
MADD4 c24, c24, a8, b4 MADD4 c24, c24, a8, b4
daddiu L, L, -1 daddiu L, L, -1
gsLQC1(R12, F9, F8, 6) # Unroll K=3
gsLQC1(R12, F11, F10, 7)
gsLQC1(R13, F16, F15, 3)
gsLQC1(R12, F9, F8, 6) # Unroll K=3
MADD1 c11, c11, a1, b5 # axc A1xB1 MADD1 c11, c11, a1, b5 # axc A1xB1
MADD3 c13, c13, a1, b6 # axd MADD3 c13, c13, a1, b6 # axd
gsLQC1(R13, F16, F15, 3)
MADD2 c12, c12, a2, b5 # bxc MADD2 c12, c12, a2, b5 # bxc
MADD4 c14, c14, a2, b6 # bxd MADD4 c14, c14, a2, b6 # bxd
daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx gsLQC1(R12, F11, F10, 7)
daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx
MADD1 c21, c21, a3, b5 # A2xB1 MADD1 c21, c21, a3, b5 # A2xB1
MADD3 c23, c23, a3, b6 MADD3 c23, c23, a3, b6
daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx
FETCH $0, 12 * SIZE(PREA)
MADD2 c22, c22, a4, b5 MADD2 c22, c22, a4, b5
MADD4 c24, c24, a4, b6 MADD4 c24, c24, a4, b6
daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MADD1 c11, c11, a5, b7 # axc A1xB1 MADD1 c11, c11, a5, b7 # axc A1xB1
MADD3 c13, c13, a5, b8 # axd MADD3 c13, c13, a5, b8 # axd
daddiu PREA, PREA, 16 * SIZE
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MADD2 c12, c12, a6, b7 # bxc MADD2 c12, c12, a6, b7 # bxc
MADD4 c14, c14, a6, b8 # bxd MADD4 c14, c14, a6, b8 # bxd
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
MADD1 c21, c21, a7, b7 # A2xB1 MADD1 c21, c21, a7, b7 # A2xB1
MADD3 c23, c23, a7, b8 MADD3 c23, c23, a7, b8
FETCH $0, 0 * SIZE(PREA)
MADD2 c22, c22, a8, b7 MADD2 c22, c22, a8, b7
bgtz L, .L22
MADD4 c24, c24, a8, b8 MADD4 c24, c24, a8, b8
bgtz L, .L22
NOP
.align 3
.L25: .L25:
andi L, K, 3 andi L, K, 3
LD ALPHA_R, 128($sp) LD ALPHA_R, 128($sp)
LD ALPHA_I, 136($sp)
blez L, .L28 blez L, .L28
NOP LD ALPHA_I, 136($sp)
.align 3 .align 3
.L26: .L26:
daddiu L, L, -1
daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx
daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx
daddiu L, L, -1
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx
MADD2 c12, c12, a2, b1 # bxc MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd MADD4 c14, c14, a2, b2 # bxd
daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
MADD1 c21, c21, a3, b1 # A2xB1 MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2 MADD3 c23, c23, a3, b2
daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx
MADD2 c22, c22, a4, b1 MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2 MADD4 c24, c24, a4, b2
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
bgtz L, .L26 bgtz L, .L26
NOP FETCH $0, 0 * SIZE(PREA)
.L28: .L28:
ADD c11, c14, c11 ADD c11, c14, c11
ADD c12, c13, c12
ADD c21, c24, c21
ADD c22, c23, c22
LD a1, 0 * SIZE(CO1) LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12
LD a2, 1 * SIZE(CO1) LD a2, 1 * SIZE(CO1)
ADD c21, c24, c21
LD b1, 2 * SIZE(CO1) LD b1, 2 * SIZE(CO1)
ADD c22, c23, c22
LD b2, 3 * SIZE(CO1) LD b2, 3 * SIZE(CO1)
daddiu I, I, -1 daddiu I, I, -1
@ -792,15 +836,16 @@
dsra L, K, 2 # Unroll K 4 times dsra L, K, 2 # Unroll K 4 times
move BO, B move BO, B
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs MTC $0, c11 # Clear results regs
MOV c12, c11 MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11 MOV c13, c11
MOV c14, c11 MOV c14, c11
FETCH $0, 0 * SIZE(PREA)
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 FETCH $0, 4 * SIZE(PREA)
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
blez L, .L45 blez L, .L45
NOP NOP
@ -808,53 +853,49 @@
.L42: .L42:
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD2 c12, c12, a2, b1 # bxc MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd MADD4 c14, c14, a2, b2 # bxd
gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R12, F9, F8, 2) # Unroll K=1
gsLQC1(R13, F13, F12, 2)
MADD1 c11, c11, a3, b3 # axc A1xB1 MADD1 c11, c11, a3, b3 # axc A1xB1
MADD3 c13, c13, a3, b4 # axd MADD3 c13, c13, a3, b4 # axd
gsLQC1(R13, F13, F12, 2)
MADD2 c12, c12, a4, b3 # bxc MADD2 c12, c12, a4, b3 # bxc
MADD4 c14, c14, a4, b4 # bxd MADD4 c14, c14, a4, b4 # bxd
daddiu L, L, -1 daddiu L, L, -1
gsLQC1(R12, F11, F10, 3)
gsLQC1(R13, F16, F15, 3)
gsLQC1(R12, F11, F10, 3)
MADD1 c11, c11, a5, b5 # axc A1xB1 MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd MADD3 c13, c13, a5, b6 # axd
MADD2 c12, c12, a6, b5 # bxc
MADD4 c14, c14, a6, b6 # bxd
daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx
daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F16, F15, 3)
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a6, b5 # bxc
MADD4 c14, c14, a6, b6 # bxd
daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
MADD1 c11, c11, a7, b7 # axc A1xB1 MADD1 c11, c11, a7, b7 # axc A1xB1
MADD3 c13, c13, a7, b8 # axd MADD3 c13, c13, a7, b8 # axd
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MADD2 c12, c12, a8, b7 # bxc MADD2 c12, c12, a8, b7 # bxc
bgtz L, .L42
MADD4 c14, c14, a8, b8 # bxd MADD4 c14, c14, a8, b8 # bxd
bgtz L, .L42
NOP
.align 3 .align 5
.L45: .L45:
andi L, K, 3 andi L, K, 3
LD ALPHA_R, 128($sp) LD ALPHA_R, 128($sp)
LD ALPHA_I, 136($sp)
blez L, .L48 blez L, .L48
NOP LD ALPHA_I, 136($sp)
.align 3
.L46: .L46:
daddiu L, L, -1 daddiu L, L, -1
@ -892,7 +933,7 @@
.align 3 .align 5
.L999: .L999:
LDARG $16, 0($sp) LDARG $16, 0($sp)