Add the ztrmm and ztrsm parts for loongson3a. The average performance is 2.2G.

This commit is contained in:
traz 2011-06-23 21:11:00 +00:00
parent 14f81da375
commit e72113f06a
4 changed files with 438 additions and 25 deletions

View File

@ -128,10 +128,21 @@ CTRSMKERNEL_LT = ztrsm_kernel_LT.S
CTRSMKERNEL_RN = ztrsm_kernel_LT.S CTRSMKERNEL_RN = ztrsm_kernel_LT.S
CTRSMKERNEL_RT = ztrsm_kernel_RT.S CTRSMKERNEL_RT = ztrsm_kernel_RT.S
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_LT
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RN
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RT
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
CGEMM3MKERNEL = zgemm3m_kernel.S CGEMM3MKERNEL = zgemm3m_kernel.S
ZGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S

View File

@ -28,3 +28,12 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -1,12 +1,10 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#define FETCH ld #define FETCH ld
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define STACKSIZE 160 #define STACKSIZE 160
#define M $4 #define M $4
#define N $5 #define N $5
@ -116,12 +114,12 @@
## MADD3 a*d ## MADD3 a*d
## MADD4 d*b ## MADD4 d*b
################################## ##################################
####if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1 MADD #define MADD1 MADD
#define MADD2 MADD #define MADD2 MADD
#define MADD3 MADD #define MADD3 MADD
#define MADD4 NMSUB #define MADD4 NMSUB
###endif #endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1 MADD #define MADD1 MADD
@ -175,6 +173,9 @@
dsra J, N, 1 # J=N/2 dsra J, N, 1 # J=N/2
ST ALPHA_R, 128($sp) # store alpha_r & alpha_i ST ALPHA_R, 128($sp) # store alpha_r & alpha_i
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE
blez J, .L20 blez J, .L20
@ -183,6 +184,10 @@
.align 5 .align 5
.L10: .L10:
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
daddiu J, J, -1 daddiu J, J, -1
dsra I, M, 1 # I=M/2 dsra I, M, 1 # I=M/2
@ -193,12 +198,66 @@
daddu CO2, C, LDC daddu CO2, C, LDC
move AO, A # Reset AO move AO, A # Reset AO
daddu PREB, PREB, B # PREB=B+panel size
blez I, .L30 blez I, .L30
daddu PREA, PREA, A # PREA=A+panel size daddu PREA, PREA, A # PREA=A+panel size
.L11: .L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 1 + ZBASE_SHIFT # MR=NR=2
dsll TEMP, KK, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MOV c13, c11
MOV c14, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c21, c11
MOV c22, c11
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
MOV c23, c11
MOV c24, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
FETCH $0, 0 * SIZE(CO2)
MOV c31, c11
MOV c32, c11
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11
MOV c34, c11
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11
MOV c42, c11
FETCH $0, 4 * SIZE(CO1)
MOV c43, c11
MOV c44, c11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 2
#endif
dsra L, TEMP, 2
daddu PREB, PREB, B # PREB=B+panel size
blez L, .L15
NOP
#else
dsra L, K, 2 # Unroll K 4 times dsra L, K, 2 # Unroll K 4 times
move BO, B move BO, B
@ -218,18 +277,25 @@
MOV c24, c11 MOV c24, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
FETCH $0, 0 * SIZE(CO2)
MOV c31, c11 MOV c31, c11
MOV c32, c11 MOV c32, c11
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11 MOV c33, c11
MOV c34, c11 MOV c34, c11
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11 MOV c41, c11
MOV c42, c11 MOV c42, c11
FETCH $0, 4 * SIZE(CO1)
MOV c43, c11 MOV c43, c11
daddu PREB, PREB, B # PREB=B+panel size
blez L, .L15 blez L, .L15
MOV c44, c11 MOV c44, c11
#endif
.align 5 .align 5
@ -361,8 +427,13 @@
.align 5 .align 5
.L15: .L15:
#ifndef TRMMKERNEL
andi L, K, 3 andi L, K, 3
LD ALPHA_R, 128($sp) LD ALPHA_R, 128($sp)
#else
andi L, TEMP, 3
LD ALPHA_R, 128($sp)
#endif
blez L, .L18 blez L, .L18
LD ALPHA_I, 136($sp) LD ALPHA_I, 136($sp)
@ -408,7 +479,7 @@
NOP NOP
.L18: .L18:
#ifndef TRMMKERNEL
ADD c11, c14, c11 ADD c11, c14, c11
LD a1, 0 * SIZE(CO1) LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12 ADD c12, c13, c12
@ -458,20 +529,75 @@
ST b3, 2 * SIZE(CO2) ST b3, 2 * SIZE(CO2)
ST b4, 3 * SIZE(CO2) ST b4, 3 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO2) #else
FETCH $0, 4 * SIZE(CO1) ADD c11, c14, c11
FETCH $0, 8 * SIZE(CO2) ADD c12, c13, c12
FETCH $0, 8 * SIZE(CO1) ADD c21, c24, c21
FETCH $0, 12 * SIZE(CO2) ADD c22, c23, c22
FETCH $0, 12 * SIZE(CO1)
FETCH $0, 16 * SIZE(CO2)
FETCH $0, 16 * SIZE(CO1)
ADD c31, c34, c31
ADD c32, c33, c32
ADD c41, c44, c41
ADD c42, c43, c42
daddiu I, I, -1
MUL a1, ALPHA_R, c11
MUL a2, ALPHA_R, c12
MUL b1, ALPHA_R, c21
MUL b2, ALPHA_R, c22
NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11
NMSUB b1, b1, ALPHA_I, c22
MADD b2, b2, ALPHA_I, c21
MUL a3, ALPHA_R, c31
MUL a4, ALPHA_R, c32
MUL b3, ALPHA_R, c41
MUL b4, ALPHA_R, c42
NMSUB a3, a3, ALPHA_I, c32
MADD a4, a4, ALPHA_I, c31
NMSUB b3, b3, ALPHA_I, c42
MADD b4, b4, ALPHA_I, c41
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
ST b1, 2 * SIZE(CO1)
ST b2, 3 * SIZE(CO1)
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
ST b3, 2 * SIZE(CO2)
ST b4, 3 * SIZE(CO2)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -2
#endif
dsll L, TEMP, 1 + ZBASE_SHIFT
dsll TEMP, TEMP, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
dsll PREB, K, 1 + ZBASE_SHIFT # PREB=K*2*2^4
daddiu CO1,CO1, 4 * SIZE daddiu CO1,CO1, 4 * SIZE
bgtz I, .L11 bgtz I, .L11
daddiu CO2,CO2, 4 * SIZE daddiu CO2,CO2, 4 * SIZE
.align 5
.L30: .L30:
andi I, M, 1 andi I, M, 1
daddu C, C, LDC # Change C to next panel daddu C, C, LDC # Change C to next panel
@ -480,22 +606,69 @@
blez I, .L19 blez I, .L19
daddu C, C, LDC # Change C to next panel daddu C, C, LDC # Change C to next panel
dsra L, K, 2 # Unroll K 4 times #if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B move BO, B
#else
dsll L, KK, ZBASE_SHIFT # MR=1
dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2
daddu AO, AO, L
daddu BO, B, TEMP
#endif
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
move BO, B
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MTC $0, c11 # Clear results regs MTC $0, c11 # Clear results regs
MOV c12, c11 MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MOV c13, c11 MOV c13, c11
MOV c14, c11 MOV c14, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 FETCH $0, 0 * SIZE(PREB)
MOV c31, c11 MOV c31, c11
MOV c32, c11 MOV c32, c11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO1)
FETCH $0, 4 * SIZE(CO2)
MOV c33, c11
MOV c34, c11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # MR=1
#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
blez L, .L35
NOP
#else
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
dsra L, K, 2 # Unroll K 4 times
move BO, B
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MOV c13, c11
MOV c14, c11
FETCH $0, 0 * SIZE(PREB) FETCH $0, 0 * SIZE(PREB)
MOV c31, c11
MOV c32, c11
FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO1)
FETCH $0, 0 * SIZE(CO2) FETCH $0, 0 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1)
@ -504,6 +677,7 @@
MOV c33, c11 MOV c33, c11
blez L, .L35 blez L, .L35
MOV c34, c11 MOV c34, c11
#endif
.align 5 .align 5
@ -582,15 +756,18 @@
.L35: .L35:
#ifndef TRMMKERNEL
andi L, K, 3 andi L, K, 3
LD ALPHA_R, 128($sp) LD ALPHA_R, 128($sp)
NOP #else
andi L, TEMP, 3
LD ALPHA_R, 128($sp)
#endif
blez L, .L38 blez L, .L38
LD ALPHA_I, 136($sp) LD ALPHA_I, 136($sp)
.align 5 .align 5
.L36: .L36:
daddiu L, L, -1 daddiu L, L, -1
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
@ -615,6 +792,7 @@
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
.L38: .L38:
#ifndef TRMMKERNEL
ADD c11, c14, c11 ADD c11, c14, c11
LD a1, 0 * SIZE(CO1) LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12 ADD c12, c13, c12
@ -645,10 +823,60 @@
daddiu CO1,CO1, 2 * SIZE daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE daddiu CO2,CO2, 2 * SIZE
#else
ADD c11, c14, c11
ADD c12, c13, c12
ADD c31, c34, c31
ADD c32, c33, c32
MUL a1, ALPHA_R, c11
MUL a2, ALPHA_R, c12
MUL a3, ALPHA_R, c31
MUL a4, ALPHA_R, c32
NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11
NMSUB a3, a3, ALPHA_I, c32
MADD a4, a4, ALPHA_I, c31
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -2
#endif
dsll L, TEMP, ZBASE_SHIFT
dsll TEMP, TEMP, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
.align 5 .align 5
.L19: .L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 2
#endif
bgtz J, .L10 bgtz J, .L10
move B, BO move B, BO
@ -662,11 +890,56 @@
dsra I, M, 1 # I=M/2 dsra I, M, 1 # I=M/2
move CO1, C move CO1, C
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
move AO, A # Reset AO move AO, A # Reset AO
blez I, .L29 blez I, .L29
daddu PREA, PREA, A daddu PREA, PREA, A
.L21: .L21:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 1 + ZBASE_SHIFT
dsll TEMP, KK, ZBASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11
MOV c14, c11
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
MOV c21, c11
MOV c22, c11
FETCH $0, 0 * SIZE(PREA)
MOV c23, c11
MOV c24, c11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 4 * SIZE(CO1)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2 # define Mr=2
#else
daddiu TEMP, KK, 1 # define NR=1
#endif
dsra L, TEMP, 2
blez L, .L25
NOP
#else
dsra L, K, 2 # Unroll K 4 times dsra L, K, 2 # Unroll K 4 times
move BO, B move BO, B
@ -691,8 +964,9 @@
blez L, .L25 blez L, .L25
NOP NOP
#endif
.align 3 .align 5
.L22: .L22:
gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R12, F9, F8, 2) # Unroll K=1
@ -766,15 +1040,18 @@
.L25: .L25:
#ifndef TRMMKERNEL
andi L, K, 3 andi L, K, 3
LD ALPHA_R, 128($sp) LD ALPHA_R, 128($sp)
#else
andi L, TEMP, 3
LD ALPHA_R, 128($sp)
#endif
blez L, .L28 blez L, .L28
LD ALPHA_I, 136($sp) LD ALPHA_I, 136($sp)
.align 3 .align 3
.L26: .L26:
daddiu L, L, -1 daddiu L, L, -1
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd MADD3 c13, c13, a1, b2 # axd
@ -799,6 +1076,7 @@
FETCH $0, 0 * SIZE(PREA) FETCH $0, 0 * SIZE(PREA)
.L28: .L28:
#ifndef TRMMKERNEL
ADD c11, c14, c11 ADD c11, c14, c11
LD a1, 0 * SIZE(CO1) LD a1, 0 * SIZE(CO1)
ADD c12, c13, c12 ADD c12, c13, c12
@ -824,6 +1102,48 @@
ST b1, 2 * SIZE(CO1) ST b1, 2 * SIZE(CO1)
ST b2, 3 * SIZE(CO1) ST b2, 3 * SIZE(CO1)
#else
ADD c11, c14, c11
ADD c12, c13, c12
ADD c21, c24, c21
ADD c22, c23, c22
daddiu I, I, -1
MUL a1, ALPHA_R, c11
MUL a2, ALPHA_R, c12
MUL b1, ALPHA_R, c21
MUL b2, ALPHA_R, c22
NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11
NMSUB b1, b1, ALPHA_I, c22
MADD b2, b2, ALPHA_I, c21
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
ST b1, 2 * SIZE(CO1)
ST b2, 3 * SIZE(CO1)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -1
#endif
dsll L, TEMP, 1 + ZBASE_SHIFT
dsll TEMP, TEMP, ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
daddiu CO1,CO1, 4 * SIZE daddiu CO1,CO1, 4 * SIZE
bgtz I, .L21 bgtz I, .L21
NOP NOP
@ -833,6 +1153,39 @@
blez I, .L999 blez I, .L999
NOP NOP
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll TEMP, KK, ZBASE_SHIFT
daddu AO, AO, TEMP
daddu BO, B, TEMP
#endif
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs
MOV c12, c11
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11
MOV c14, c11
FETCH $0, 0 * SIZE(PREA)
FETCH $0, 4 * SIZE(PREA)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 1
#endif
dsra L, TEMP, 2
blez L, .L45
NOP
#else
dsra L, K, 2 # Unroll K 4 times dsra L, K, 2 # Unroll K 4 times
move BO, B move BO, B
@ -848,6 +1201,7 @@
FETCH $0, 4 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA)
blez L, .L45 blez L, .L45
NOP NOP
#endif
.align 3 .align 3
@ -892,8 +1246,13 @@
.align 5 .align 5
.L45: .L45:
#ifndef TRMMKERNEL
andi L, K, 3 andi L, K, 3
LD ALPHA_R, 128($sp) LD ALPHA_R, 128($sp)
#else
andi L, TEMP, 3
LD ALPHA_R, 128($sp)
#endif
blez L, .L48 blez L, .L48
LD ALPHA_I, 136($sp) LD ALPHA_I, 136($sp)
@ -914,6 +1273,7 @@
NOP NOP
.L48: .L48:
#ifndef TRMMKERNEL
ADD c11, c14, c11 ADD c11, c14, c11
ADD c12, c13, c12 ADD c12, c13, c12
@ -929,7 +1289,40 @@
ST a1, 0 * SIZE(CO1) ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1) ST a2, 1 * SIZE(CO1)
#else
ADD c11, c14, c11
ADD c12, c13, c12
MUL a1, ALPHA_R, c11
MUL a2, ALPHA_R, c12
NMSUB a1, a1, ALPHA_I, c12
MADD a2, a2, ALPHA_I, c11
ST a1, 0 * SIZE(CO1)
ST a2, 1 * SIZE(CO1)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -1
#endif
dsll TEMP, TEMP, ZBASE_SHIFT
daddu AO, AO, TEMP
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
daddiu CO1,CO1, 2 * SIZE daddiu CO1,CO1, 2 * SIZE
#endif

View File

@ -1500,7 +1500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_Q 116 #define SGEMM_DEFAULT_Q 116
#define DGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_Q 116
#define CGEMM_DEFAULT_Q 144 #define CGEMM_DEFAULT_Q 144
#define ZGEMM_DEFAULT_Q 60 #define ZGEMM_DEFAULT_Q 80
#define SGEMM_DEFAULT_R 1000 #define SGEMM_DEFAULT_R 1000
#define DGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_R 1000