Add the ztrmm and ztrsm parts for Loongson3a. The average performance is 2.2G.

This commit is contained in:
traz 2011-06-23 21:11:00 +00:00
parent 14f81da375
commit e72113f06a
4 changed files with 438 additions and 25 deletions

View File

@ -128,10 +128,21 @@ CTRSMKERNEL_LT = ztrsm_kernel_LT.S
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
# Default ZTRSM kernel sources. Each variable is assigned only when it has not
# already been defined, so an earlier definition takes precedence.
# LN defaults to the LT source file.
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_LT
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
# RN also defaults to the LT source file.
ifndef ZTRSMKERNEL_RN
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
# RT is the only variant that defaults to a different source file.
ifndef ZTRSMKERNEL_RT
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
CGEMM3MKERNEL = zgemm3m_kernel.S
ZGEMM3MKERNEL = zgemm3m_kernel.S

View File

@ -28,3 +28,12 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# ZTRSM kernels: all four variants (LN, LT, RN, RT) use the C sources under
# ../generic, the same files assigned to the DTRSM kernels above.
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -1,12 +1,10 @@
#define ASSEMBLER
#include "common.h"
## FETCH is a plain ld. The kernel issues it with $0 as the destination
## register, so only the memory access itself matters, not the loaded value.
#define FETCH ld
## gsLQC1 and gsSQC1 emit a hand-encoded instruction word through .word.
## Field layout of the emitted word:
##   opcode (0x32 or 0x3A) << 26 | base << 21 | ft << 16 | 1 << 15
##   | offset << 6 | 1 << 5 | fq
## base, fq and ft are numeric register constants (the kernel passes names
## such as R12 and F1), not $-prefixed register names.
## NOTE(review): presumably the Loongson quad-word FP load (gsLQC1) and
## store (gsSQC1) that move the register pair fq/ft in one instruction, with
## offset counted in 16-byte units -- confirm against the Loongson-3A manual.
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define STACKSIZE 160
#define M $4
#define N $5
@ -116,12 +114,12 @@
## MADD3 a*d
## MADD4 d*b
##################################
####if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 MADD
#define MADD4 NMSUB
###endif
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1 MADD
@ -175,6 +173,9 @@
dsra J, N, 1 # J=N/2
ST ALPHA_R, 128($sp) # store alpha_r & alpha_i
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE
blez J, .L20
@ -183,6 +184,10 @@
.align 5
.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
daddiu J, J, -1
dsra I, M, 1 # I=M/2
@ -193,12 +198,66 @@
daddu CO2, C, LDC
move AO, A # Reset AO
daddu PREB, PREB, B # PREA=A+panel size
blez I, .L30
daddu PREA, PREA, A # PREA=A+panel size
.L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 1 + ZBASE_SHIFT # MR=NR=2
dsll TEMP, KK, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MOV c13, c11
MOV c14, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c21, c11
MOV c22, c11
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
MOV c23, c11
MOV c24, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
FETCH $0, 0 * SIZE(CO2)
MOV c31, c11
MOV c32, c11
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11
MOV c34, c11
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11
MOV c42, c11
FETCH $0, 4 * SIZE(CO1)
MOV c43, c11
MOV c44, c11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 2
#endif
dsra L, TEMP, 2
daddu PREB, PREB, B # PREA=A+panel size
blez L, .L15
NOP
#else
dsra L, K, 2 # Unroll K 4 times
move BO, B
@ -218,18 +277,25 @@
MOV c24, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
FETCH $0, 0 * SIZE(CO2)
MOV c31, c11
MOV c32, c11
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11
MOV c34, c11
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11
MOV c42, c11
FETCH $0, 4 * SIZE(CO1)
MOV c43, c11
daddu PREB, PREB, B # PREA=A+panel size
blez L, .L15
MOV c44, c11
#endif
.align 5
@ -361,8 +427,13 @@
.align 5
.L15:
#ifndef TRMMKERNEL
andi L, K, 3
LD ALPHA_R, 128($sp)
#else
andi L, TEMP, 3
LD ALPHA_R, 128($sp)
#endif
blez L, .L18
LD ALPHA_I, 136($sp)
@ -408,7 +479,7 @@
NOP
.L18:
#ifndef TRMMKERNEL
ADD c11, c14, c11
LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12
@ -458,20 +529,75 @@
ST b3, 2 * SIZE(CO2)
ST b4, 3 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO2)
FETCH $0, 8 * SIZE(CO1)
FETCH $0, 12 * SIZE(CO2)
FETCH $0, 12 * SIZE(CO1)
FETCH $0, 16 * SIZE(CO2)
FETCH $0, 16 * SIZE(CO1)
#else
ADD c11, c14, c11
ADD c12, c13, c12
ADD c21, c24, c21
ADD c22, c23, c22
ADD c31, c34, c31
ADD c32, c33, c32
ADD c41, c44, c41
ADD c42, c43, c42
daddiu I, I, -1
MUL a1, ALPHA_R, c11
MUL a2, ALPHA_R, c12
MUL b1, ALPHA_R, c21
MUL b2, ALPHA_R, c22
NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11
NMSUB b1, b1, ALPHA_I, c22
MADD b2, b2, ALPHA_I, c21
MUL a3, ALPHA_R, c31
MUL a4, ALPHA_R, c32
MUL b3, ALPHA_R, c41
MUL b4, ALPHA_R, c42
NMSUB a3, a3, ALPHA_I, c32
MADD a4, a4, ALPHA_I, c31
NMSUB b3, b3, ALPHA_I, c42
MADD b4, b4, ALPHA_I, c41
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
ST b1, 2 * SIZE(CO1)
ST b2, 3 * SIZE(CO1)
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
ST b3, 2 * SIZE(CO2)
ST b4, 3 * SIZE(CO2)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -2
#endif
dsll L, TEMP, 1 + ZBASE_SHIFT
dsll TEMP, TEMP, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
daddiu CO1,CO1, 4 * SIZE
bgtz I, .L11
daddiu CO2,CO2, 4 * SIZE
.align 5
.L30:
andi I, M, 1
daddu C, C, LDC # Change C to next panel
@ -480,22 +606,69 @@
blez I, .L19
daddu C, C, LDC # Change C to next panel
dsra L, K, 2 # Unroll K 4 times
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, ZBASE_SHIFT # MR=1
dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2
daddu AO, AO, L
daddu BO, B, TEMP
#endif
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
move BO, B
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MOV c13, c11
MOV c14, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
FETCH $0, 0 * SIZE(PREB)
MOV c31, c11
MOV c32, c11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO1)
FETCH $0, 4 * SIZE(CO2)
MOV c33, c11
MOV c34, c11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # MR=1
#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
blez L, .L35
NOP
#else
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
dsra L, K, 2 # Unroll K 4 times
move BO, B
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MOV c13, c11
MOV c14, c11
FETCH $0, 0 * SIZE(PREB)
MOV c31, c11
MOV c32, c11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO1)
@ -504,6 +677,7 @@
MOV c33, c11
blez L, .L35
MOV c34, c11
#endif
.align 5
@ -582,15 +756,18 @@
.L35:
#ifndef TRMMKERNEL
andi L, K, 3
LD ALPHA_R, 128($sp)
NOP
#else
andi L, TEMP, 3
LD ALPHA_R, 128($sp)
#endif
blez L, .L38
LD ALPHA_I, 136($sp)
.align 5
.L36:
daddiu L, L, -1
MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
@ -615,6 +792,7 @@
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
.L38:
#ifndef TRMMKERNEL
ADD c11, c14, c11
LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12
@ -645,10 +823,60 @@
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
#else
ADD c11, c14, c11
ADD c12, c13, c12
ADD c31, c34, c31
ADD c32, c33, c32
MUL a1, ALPHA_R, c11
MUL a2, ALPHA_R, c12
MUL a3, ALPHA_R, c31
MUL a4, ALPHA_R, c32
NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11
NMSUB a3, a3, ALPHA_I, c32
MADD a4, a4, ALPHA_I, c31
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -2
#endif
dsll L, TEMP, ZBASE_SHIFT
dsll TEMP, TEMP, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
.align 5
.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 2
#endif
bgtz J, .L10
move B, BO
@ -662,11 +890,56 @@
dsra I, M, 1 # I=M/2
move CO1, C
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
move AO, A # Reset AO
blez I, .L29
daddu PREA, PREA, A
.L21:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 1 + ZBASE_SHIFT
dsll TEMP, KK, ZBASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11
MOV c14, c11
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
MOV c21, c11
MOV c22, c11
FETCH $0, 0 * SIZE(PREA)
MOV c23, c11
MOV c24, c11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 4 * SIZE(CO1)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2 # define Mr=2
#else
daddiu TEMP, KK, 1 # define NR=1
#endif
dsra L, TEMP, 2
blez L, .L25
NOP
#else
dsra L, K, 2 # Unroll K 4 times
move BO, B
@ -691,8 +964,9 @@
blez L, .L25
NOP
#endif
.align 3
.align 5
.L22:
gsLQC1(R12, F9, F8, 2) # Unroll K=1
@ -766,15 +1040,18 @@
.L25:
#ifndef TRMMKERNEL
andi L, K, 3
LD ALPHA_R, 128($sp)
#else
andi L, TEMP, 3
LD ALPHA_R, 128($sp)
#endif
blez L, .L28
LD ALPHA_I, 136($sp)
.align 3
.L26:
daddiu L, L, -1
MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
@ -799,6 +1076,7 @@
FETCH $0, 0 * SIZE(PREA)
.L28:
#ifndef TRMMKERNEL
ADD c11, c14, c11
LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12
@ -824,6 +1102,48 @@
ST b1, 2 * SIZE(CO1)
ST b2, 3 * SIZE(CO1)
#else
ADD c11, c14, c11
ADD c12, c13, c12
ADD c21, c24, c21
ADD c22, c23, c22
daddiu I, I, -1
MUL a1, ALPHA_R, c11
MUL a2, ALPHA_R, c12
MUL b1, ALPHA_R, c21
MUL b2, ALPHA_R, c22
NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11
NMSUB b1, b1, ALPHA_I, c22
MADD b2, b2, ALPHA_I, c21
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
ST b1, 2 * SIZE(CO1)
ST b2, 3 * SIZE(CO1)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -1
#endif
dsll L, TEMP, 1 + ZBASE_SHIFT
dsll TEMP, TEMP, ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
daddiu CO1,CO1, 4 * SIZE
bgtz I, .L21
NOP
@ -833,6 +1153,39 @@
blez I, .L999
NOP
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll TEMP, KK, ZBASE_SHIFT
daddu AO, AO, TEMP
daddu BO, B, TEMP
#endif
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11
MOV c14, c11
FETCH $0, 0 * SIZE(PREA)
FETCH $0, 4 * SIZE(PREA)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 1
#endif
dsra L, TEMP, 2
blez L, .L45
NOP
#else
dsra L, K, 2 # Unroll K 4 times
move BO, B
@ -848,6 +1201,7 @@
FETCH $0, 4 * SIZE(PREA)
blez L, .L45
NOP
#endif
.align 3
@ -892,8 +1246,13 @@
.align 5
.L45:
#ifndef TRMMKERNEL
andi L, K, 3
LD ALPHA_R, 128($sp)
#else
andi L, TEMP, 3
LD ALPHA_R, 128($sp)
#endif
blez L, .L48
LD ALPHA_I, 136($sp)
@ -914,6 +1273,7 @@
NOP
.L48:
#ifndef TRMMKERNEL
ADD c11, c14, c11
ADD c12, c13, c12
@ -929,7 +1289,40 @@
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
#else
ADD c11, c14, c11
ADD c12, c13, c12
MUL a1, ALPHA_R, c11
MUL a2, ALPHA_R, c12
NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -1
#endif
dsll TEMP, TEMP, ZBASE_SHIFT
daddu AO, AO, TEMP
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
daddiu CO1,CO1, 2 * SIZE
#endif

View File

@ -1500,7 +1500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_Q 116
#define DGEMM_DEFAULT_Q 116
#define CGEMM_DEFAULT_Q 144
#define ZGEMM_DEFAULT_Q 60
#define ZGEMM_DEFAULT_Q 80
#define SGEMM_DEFAULT_R 1000
#define DGEMM_DEFAULT_R 1000