SGEMM pipeline improved; ZGEMM rewritten without inner packing; ABI fix: the vector save/restore (stxv/lxv) now uses vs52–vs63 instead of v20–v31
This commit is contained in:
parent
8fe794f059
commit
a469b32cf4
|
@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_power9.S
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||||
|
|
|
@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
std r14, 280(SP)
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
|
||||||
stxv v20, 288(SP)
|
stxv vs52, 288(SP)
|
||||||
stxv v21, 304(SP)
|
stxv vs53, 304(SP)
|
||||||
stxv v22, 320(SP)
|
stxv vs54, 320(SP)
|
||||||
stxv v23, 336(SP)
|
stxv vs55, 336(SP)
|
||||||
stxv v24, 352(SP)
|
stxv vs56, 352(SP)
|
||||||
stxv v25, 368(SP)
|
stxv vs57, 368(SP)
|
||||||
stxv v26, 384(SP)
|
stxv vs58, 384(SP)
|
||||||
stxv v27, 400(SP)
|
stxv vs59, 400(SP)
|
||||||
stxv v28, 416(SP)
|
stxv vs60, 416(SP)
|
||||||
stxv v29, 432(SP)
|
stxv vs61, 432(SP)
|
||||||
stxv v30, 448(SP)
|
stxv vs62, 448(SP)
|
||||||
stxv v31, 464(SP)
|
stxv vs63, 464(SP)
|
||||||
|
|
||||||
|
|
||||||
stfd f1, ALPHA_SP
|
stfd f1, ALPHA_SP
|
||||||
|
@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ld r15, 272(SP)
|
ld r15, 272(SP)
|
||||||
ld r14, 280(SP)
|
ld r14, 280(SP)
|
||||||
|
|
||||||
lxv v20, 288(SP)
|
lxv vs52, 288(SP)
|
||||||
lxv v21, 304(SP)
|
lxv vs53, 304(SP)
|
||||||
lxv v22, 320(SP)
|
lxv vs54, 320(SP)
|
||||||
lxv v23, 336(SP)
|
lxv vs55, 336(SP)
|
||||||
lxv v24, 352(SP)
|
lxv vs56, 352(SP)
|
||||||
lxv v25, 368(SP)
|
lxv vs57, 368(SP)
|
||||||
lxv v26, 384(SP)
|
lxv vs58, 384(SP)
|
||||||
lxv v27, 400(SP)
|
lxv vs59, 400(SP)
|
||||||
lxv v28, 416(SP)
|
lxv vs60, 416(SP)
|
||||||
lxv v29, 432(SP)
|
lxv vs61, 432(SP)
|
||||||
lxv v30, 448(SP)
|
lxv vs62, 448(SP)
|
||||||
lxv v31, 464(SP)
|
lxv vs63, 464(SP)
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
blr
|
blr
|
||||||
|
|
|
@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define LOAD ld
|
#define LOAD ld
|
||||||
#define STACKSIZE (512 )
|
#define STACKSIZE (512 )
|
||||||
|
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||||
#define M r3
|
#define M r3
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
addi SP, SP, -STACKSIZE
|
addi SP, SP, -STACKSIZE
|
||||||
li r0, 0
|
mflr r0
|
||||||
|
|
||||||
|
|
||||||
stfd f14, 0(SP)
|
stfd f14, 0(SP)
|
||||||
stfd f15, 8(SP)
|
stfd f15, 8(SP)
|
||||||
|
@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
std r14, 280(SP)
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
|
||||||
stxv v20, 288(SP)
|
stxv vs52, 288(SP)
|
||||||
stxv v21, 304(SP)
|
stxv vs53, 304(SP)
|
||||||
stxv v22, 320(SP)
|
stxv vs54, 320(SP)
|
||||||
stxv v23, 336(SP)
|
stxv vs55, 336(SP)
|
||||||
stxv v24, 352(SP)
|
stxv vs56, 352(SP)
|
||||||
stxv v25, 368(SP)
|
stxv vs57, 368(SP)
|
||||||
stxv v26, 384(SP)
|
stxv vs58, 384(SP)
|
||||||
stxv v27, 400(SP)
|
stxv vs59, 400(SP)
|
||||||
stxv v28, 416(SP)
|
stxv vs60, 416(SP)
|
||||||
stxv v29, 432(SP)
|
stxv vs61, 432(SP)
|
||||||
stxv v30, 448(SP)
|
stxv vs62, 448(SP)
|
||||||
stxv v31, 464(SP)
|
stxv vs63, 464(SP)
|
||||||
|
std r0, FLINK_SAVE(SP)
|
||||||
|
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
|
@ -158,71 +159,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
slwi LDC, LDC, 2
|
slwi LDC, LDC, 2
|
||||||
|
|
||||||
|
|
||||||
/* cmpwi cr0, M, 0
|
|
||||||
ble .L999_H1
|
|
||||||
cmpwi cr0, N, 0
|
|
||||||
ble .L999_H1
|
|
||||||
cmpwi cr0, K, 0
|
|
||||||
ble .L999_H1
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/*alpha is stored in f1. convert to single and splat*/
|
/*alpha is stored in f1. convert to single and splat*/
|
||||||
xscvdpspn alpha_r,vs1
|
xscvdpspn alpha_r,vs1
|
||||||
xxspltw alpha_r,alpha_r,0
|
xxspltw alpha_r,alpha_r,0
|
||||||
|
|
||||||
|
|
||||||
/*load reverse permute mask for big endian
|
/*load reverse permute mask for big endian
|
||||||
uint128 = 0xc0d0e0f08090a0b0405060700010203
|
uint128 = 0xc0d0e0f08090a0b0405060700010203
|
||||||
*/
|
*/
|
||||||
|
|
||||||
lis T2, perm_const2@highest
|
lis T2, perm_const2@highest
|
||||||
ori T2, T2, perm_const2@higher
|
|
||||||
rldicr T2, T2, 32, 31
|
|
||||||
oris T2, T2, perm_const2@h
|
|
||||||
ori T2, T2, perm_const2@l
|
|
||||||
|
|
||||||
lis T1, perm_const1@highest
|
lis T1, perm_const1@highest
|
||||||
|
lis T3, save_permute_12@highest
|
||||||
|
lis T4, save_permute_11@highest
|
||||||
|
lis T5, save_permute_22@highest
|
||||||
|
lis T6, save_permute_21@highest
|
||||||
|
ori T2, T2, perm_const2@higher
|
||||||
ori T1, T1, perm_const1@higher
|
ori T1, T1, perm_const1@higher
|
||||||
|
ori T3, T3, save_permute_12@higher
|
||||||
|
ori T4, T4, save_permute_11@higher
|
||||||
|
ori T5, T5, save_permute_22@higher
|
||||||
|
ori T6, T6, save_permute_21@higher
|
||||||
|
rldicr T2, T2, 32, 31
|
||||||
rldicr T1, T1, 32, 31
|
rldicr T1, T1, 32, 31
|
||||||
|
rldicr T3, T3, 32, 31
|
||||||
|
rldicr T4, T4, 32, 31
|
||||||
|
rldicr T5, T5, 32, 31
|
||||||
|
rldicr T6, T6, 32, 31
|
||||||
|
oris T2, T2, perm_const2@h
|
||||||
oris T1, T1, perm_const1@h
|
oris T1, T1, perm_const1@h
|
||||||
|
oris T3, T3, save_permute_12@h
|
||||||
|
oris T4, T4, save_permute_11@h
|
||||||
|
oris T5, T5, save_permute_22@h
|
||||||
|
oris T6, T6, save_permute_21@h
|
||||||
|
ori T2, T2, perm_const2@l
|
||||||
ori T1, T1, perm_const1@l
|
ori T1, T1, perm_const1@l
|
||||||
|
ori T3, T3, save_permute_12@l
|
||||||
|
ori T4, T4, save_permute_11@l
|
||||||
|
ori T5, T5, save_permute_22@l
|
||||||
|
ori T6, T6, save_permute_21@l
|
||||||
|
li r0,0
|
||||||
mtvsrdd permute_mask,T2,T1
|
mtvsrdd permute_mask,T2,T1
|
||||||
|
mtvsrdd save_permute_1,T3,T4
|
||||||
lis T2, save_permute_12@highest
|
mtvsrdd save_permute_2,T5,T6
|
||||||
ori T2, T2, save_permute_12@higher
|
|
||||||
rldicr T2, T2, 32, 31
|
|
||||||
oris T2, T2, save_permute_12@h
|
|
||||||
ori T2, T2, save_permute_12@l
|
|
||||||
|
|
||||||
lis T1, save_permute_11@highest
|
|
||||||
ori T1, T1, save_permute_11@higher
|
|
||||||
rldicr T1, T1, 32, 31
|
|
||||||
oris T1, T1, save_permute_11@h
|
|
||||||
ori T1, T1, save_permute_11@l
|
|
||||||
|
|
||||||
mtvsrdd save_permute_1,T2,T1
|
|
||||||
|
|
||||||
lis T2, save_permute_22@highest
|
|
||||||
ori T2, T2, save_permute_22@higher
|
|
||||||
rldicr T2, T2, 32, 31
|
|
||||||
oris T2, T2, save_permute_22@h
|
|
||||||
ori T2, T2, save_permute_22@l
|
|
||||||
|
|
||||||
lis T1, save_permute_21@highest
|
|
||||||
ori T1, T1, save_permute_21@higher
|
|
||||||
rldicr T1, T1, 32, 31
|
|
||||||
oris T1, T1, save_permute_21@h
|
|
||||||
ori T1, T1, save_permute_21@l
|
|
||||||
|
|
||||||
mtvsrdd save_permute_2,T2,T1
|
|
||||||
|
|
||||||
#include "sgemm_logic_power9.S"
|
#include "sgemm_logic_power9.S"
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
addi r3, 0, 0
|
|
||||||
|
|
||||||
lfd f14, 0(SP)
|
lfd f14, 0(SP)
|
||||||
lfd f15, 8(SP)
|
lfd f15, 8(SP)
|
||||||
lfd f16, 16(SP)
|
lfd f16, 16(SP)
|
||||||
|
@ -265,22 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ld r15, 272(SP)
|
ld r15, 272(SP)
|
||||||
ld r14, 280(SP)
|
ld r14, 280(SP)
|
||||||
|
|
||||||
lxv v20, 288(SP)
|
ld r0, FLINK_SAVE(SP)
|
||||||
lxv v21, 304(SP)
|
|
||||||
lxv v22, 320(SP)
|
|
||||||
lxv v23, 336(SP)
|
|
||||||
lxv v24, 352(SP)
|
|
||||||
lxv v25, 368(SP)
|
|
||||||
lxv v26, 384(SP)
|
|
||||||
lxv v27, 400(SP)
|
|
||||||
lxv v28, 416(SP)
|
|
||||||
lxv v29, 432(SP)
|
|
||||||
lxv v30, 448(SP)
|
|
||||||
lxv v31, 464(SP)
|
|
||||||
|
|
||||||
|
lxv vs52, 288(SP)
|
||||||
|
lxv vs53, 304(SP)
|
||||||
|
lxv vs54, 320(SP)
|
||||||
|
lxv vs55, 336(SP)
|
||||||
|
lxv vs56, 352(SP)
|
||||||
|
lxv vs57, 368(SP)
|
||||||
|
lxv vs58, 384(SP)
|
||||||
|
lxv vs59, 400(SP)
|
||||||
|
mtlr r0
|
||||||
|
lxv vs60, 416(SP)
|
||||||
|
lxv vs61, 432(SP)
|
||||||
|
lxv vs62, 448(SP)
|
||||||
|
lxv vs63, 464(SP)
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
blr
|
blr
|
||||||
|
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1,5 +1,94 @@
|
||||||
#define MY_ALIGN .align 3
|
#define MY_ALIGN .align 3
|
||||||
|
b L8
|
||||||
|
|
||||||
|
MY_ALIGN
|
||||||
|
LSGEMM_L8x16_LMAIN_SUB:
|
||||||
|
LOAD8x16_0
|
||||||
|
mtctr L
|
||||||
|
MY_ALIGN
|
||||||
|
|
||||||
|
LSGEMM_L8x16_LOOP:
|
||||||
|
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 0,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 1,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 2,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 3,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 4,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 5,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 6,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 7,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 8,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 9,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 10,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 11,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 12,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 13,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 14,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 15,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 16,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 17,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 18,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 19,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 20,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 21,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 22,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 23,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 24,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 25,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 26,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 27,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 28,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 29,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 30,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 31,1
|
||||||
|
bdnz LSGEMM_L8x16_LOOP
|
||||||
|
|
||||||
|
MY_ALIGN
|
||||||
|
LSGEMM_L8x16_LOOP_END:
|
||||||
|
END8x16 0, AO, BO, 64, 32
|
||||||
|
blr
|
||||||
|
|
||||||
|
MY_ALIGN
|
||||||
|
LSGEMM_L8x16_L64_SUB:
|
||||||
|
LOAD8x16_0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 0,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 1,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 2,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 3,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 4,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 5,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 6,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 7,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 8,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 9,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 10,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 11,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 12,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 13,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 14,0
|
||||||
|
KERNEL8x16_I1_L4_3 64,32, 15,1
|
||||||
|
blr
|
||||||
|
LSGEMM_L8x16_L32_SUB:
|
||||||
|
LOAD8x16_0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 0,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 1,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 2,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 3,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 4,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 5,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 6,0
|
||||||
|
KERNEL8x16_I1_L4_3 64,32, 7,1
|
||||||
|
blr
|
||||||
|
|
||||||
|
LSGEMM_L8x16_L16_SUB:
|
||||||
|
LOAD8x16_0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 0,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 1,0
|
||||||
|
KERNEL8x16_I1_L4_2 64,32, 2,0
|
||||||
|
KERNEL8x16_I1_L4_3 64,32, 3,1
|
||||||
|
blr
|
||||||
|
|
||||||
|
L8:
|
||||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
neg TEMP_REG, OFFSET
|
neg TEMP_REG, OFFSET
|
||||||
#endif
|
#endif
|
||||||
|
@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN:
|
||||||
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
|
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
|
||||||
mr T12, T11
|
mr T12, T11
|
||||||
addi T12,T12, -1
|
addi T12,T12, -1
|
||||||
srawi. L, T12, 6 /**(T11-1) % 64x */
|
srawi. L, T12, 7 /**(T11-1) % 128x */
|
||||||
#else
|
#else
|
||||||
mr T12, K
|
mr T12, K
|
||||||
addi T12,T12, -1
|
addi T12,T12, -1
|
||||||
srawi. L, T12, 6 /**(K-1) % 64x */
|
srawi. L, T12, 7 /**(K-1) % 128x */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ZERO8x16
|
ZERO8x16
|
||||||
ble LSGEMM_L8x16_SUB0
|
ble LSGEMM_L8x16_SUB0
|
||||||
|
bl LSGEMM_L8x16_LMAIN_SUB
|
||||||
MY_ALIGN
|
andi. L, T12, 127
|
||||||
LSGEMM_L8x16_LOOP_START:
|
ble LSGEMM_L8x16_SAVE
|
||||||
|
b LSGEMM_L8x16_SUB2
|
||||||
LOAD8x16_0 /*we already zeroed */
|
|
||||||
/*##OffsetA=64 OffsetB=32
|
|
||||||
#addi AO,AO,2112
|
|
||||||
#addi BO,BO,32 */
|
|
||||||
|
|
||||||
mtctr L
|
|
||||||
|
|
||||||
MY_ALIGN
|
|
||||||
|
|
||||||
LSGEMM_L8x16_LOOP:
|
|
||||||
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 1,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 2,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 3,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 4,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 5,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 6,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 7,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 8,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 9,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 10,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 11,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 12,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 13,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 14,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 15,1
|
|
||||||
|
|
||||||
bdnz LSGEMM_L8x16_LOOP
|
|
||||||
|
|
||||||
MY_ALIGN
|
|
||||||
LSGEMM_L8x16_LOOP_END:
|
|
||||||
|
|
||||||
END8x16 0, AO, BO, 64, 32
|
|
||||||
|
|
||||||
b LSGEMM_L8x16_SUB1
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB0:
|
LSGEMM_L8x16_SUB0:
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
andi. L, T11, 127
|
andi. L, T11, 255
|
||||||
|
cmpwi T11,128
|
||||||
#else
|
#else
|
||||||
andi. L, K, 127
|
andi. L, K, 255
|
||||||
|
cmpwi K,128
|
||||||
#endif
|
#endif
|
||||||
b LSGEMM_L8x16_SUB2
|
|
||||||
|
bne LSGEMM_L8x16_SUB2
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB1:
|
LSGEMM_L8x16_SUB2_128:
|
||||||
#if defined(TRMMKERNEL)
|
bl LSGEMM_L8x16_L64_SUB
|
||||||
andi. L, T12, 63
|
bl LSGEMM_L8x16_L64_SUB
|
||||||
#else
|
b LSGEMM_L8x16_SAVE
|
||||||
andi. L, T12, 63
|
|
||||||
#endif
|
|
||||||
ble LSGEMM_L8x16_SAVE
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB2:
|
LSGEMM_L8x16_SUB2:
|
||||||
|
andi. T10,L,64
|
||||||
srawi. T10,L, 5
|
ble LSGEMM_L8x16_SUB2_32
|
||||||
ble LSGEMM_L8x16_SUB2_16
|
bl LSGEMM_L8x16_L64_SUB
|
||||||
mtctr T10
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB2_LOOP:
|
LSGEMM_L8x16_SUB2_32:
|
||||||
LOAD8x16_0
|
andi. T10,L, 32
|
||||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
ble LSGEMM_L8x16_SUB2_16
|
||||||
KERNEL8x16_I1_L4_2 64,32, 1,0
|
bl LSGEMM_L8x16_L32_SUB
|
||||||
KERNEL8x16_I1_L4_2 64,32, 2,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 3,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 4,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 5,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 6,0
|
|
||||||
KERNEL8x16_I1_L4_3 64,32, 7,1
|
|
||||||
bdnz LSGEMM_L8x16_SUB2_LOOP
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB2_16:
|
LSGEMM_L8x16_SUB2_16:
|
||||||
andi. T10,L, 16
|
andi. T10,L, 16
|
||||||
ble LSGEMM_L8x16_SUB2_8
|
ble LSGEMM_L8x16_SUB2_8
|
||||||
LOAD8x16_0
|
bl LSGEMM_L8x16_L16_SUB
|
||||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 1,0
|
|
||||||
KERNEL8x16_I1_L4_2 64,32, 2,0
|
|
||||||
KERNEL8x16_I1_L4_3 64,32, 3,1
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB2_8:
|
LSGEMM_L8x16_SUB2_8:
|
||||||
andi. T10,L, 8
|
andi. T10,L, 8
|
||||||
|
@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1:
|
||||||
andi. T10,L, 1
|
andi. T10,L, 1
|
||||||
ble LSGEMM_L8x16_SAVE
|
ble LSGEMM_L8x16_SAVE
|
||||||
KERNEL8x16 0
|
KERNEL8x16 0
|
||||||
# addic. L, L, -1
|
|
||||||
# bgt LSGEMM_L8x16_SUB2
|
|
||||||
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SAVE:
|
LSGEMM_L8x16_SAVE:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -30,10 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define LOAD ld
|
#define LOAD ld
|
||||||
|
|
||||||
#define STACKSIZE 32192
|
#define STACKSIZE 512
|
||||||
|
|
||||||
#define FZERO 312+192(SP)
|
#define FZERO 312+192(SP)
|
||||||
|
|
||||||
|
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||||
|
|
||||||
#define M r3
|
#define M r3
|
||||||
#define N r4
|
#define N r4
|
||||||
|
@ -56,20 +57,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define FRAMEPOINTER r12
|
#define FRAMEPOINTER r12
|
||||||
|
|
||||||
#define BBUFFER r14
|
#define T10 r14
|
||||||
|
|
||||||
#define L r15
|
#define L r15
|
||||||
#define ALPHA r16
|
#define T8 r16
|
||||||
#define T5 r17
|
#define T5 r17
|
||||||
#define T2 r19
|
#define T2 r19
|
||||||
#define BBO r20
|
#define T9 r20
|
||||||
#define o8 r21
|
#define T6 r21
|
||||||
#define I r22
|
#define I r22
|
||||||
#define J r23
|
#define J r23
|
||||||
#define AO r24
|
#define AO r24
|
||||||
#define BO r25
|
#define BO r25
|
||||||
#define CO r26
|
#define CO r26
|
||||||
#define o16 r27
|
#define T7 r27
|
||||||
#define T3 r28
|
#define T3 r28
|
||||||
#define T4 r29
|
#define T4 r29
|
||||||
|
|
||||||
|
@ -83,11 +84,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
mr FRAMEPOINTER, SP
|
mr FRAMEPOINTER, SP
|
||||||
addi SP, SP, -STACKSIZE
|
addi SP, SP, -STACKSIZE
|
||||||
addi SP, SP, -STACKSIZE
|
mflr r0
|
||||||
addi SP, SP, -STACKSIZE
|
|
||||||
addi SP, SP, -STACKSIZE
|
|
||||||
li r0, 0
|
|
||||||
|
|
||||||
stfd f14, 0(SP)
|
stfd f14, 0(SP)
|
||||||
stfd f15, 8(SP)
|
stfd f15, 8(SP)
|
||||||
stfd f16, 16(SP)
|
stfd f16, 16(SP)
|
||||||
|
@ -111,6 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stfd f30, 128(SP)
|
stfd f30, 128(SP)
|
||||||
stfd f31, 136(SP)
|
stfd f31, 136(SP)
|
||||||
|
|
||||||
|
xxspltd alpha_r,vs1,0 /*copy from register f1 */
|
||||||
|
xxspltd alpha_i,vs2,0 /*copy from register f2 */
|
||||||
|
|
||||||
std r31, 144(SP)
|
std r31, 144(SP)
|
||||||
std r30, 152(SP)
|
std r30, 152(SP)
|
||||||
|
@ -132,21 +131,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
std r14, 280(SP)
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
|
||||||
stxv v20, 288(SP)
|
stxv vs52, 288(SP)
|
||||||
stxv v21, 304(SP)
|
stxv vs53, 304(SP)
|
||||||
stxv v22, 320(SP)
|
stxv vs54, 320(SP)
|
||||||
stxv v23, 336(SP)
|
stxv vs55, 336(SP)
|
||||||
stxv v24, 352(SP)
|
stxv vs56, 352(SP)
|
||||||
stxv v25, 368(SP)
|
stxv vs57, 368(SP)
|
||||||
stxv v26, 384(SP)
|
stxv vs58, 384(SP)
|
||||||
stxv v27, 400(SP)
|
stxv vs59, 400(SP)
|
||||||
stxv v28, 416(SP)
|
stxv vs60, 416(SP)
|
||||||
stxv v29, 432(SP)
|
stxv vs61, 432(SP)
|
||||||
stxv v30, 448(SP)
|
stxv vs62, 448(SP)
|
||||||
stxv v31, 464(SP)
|
stxv vs63, 464(SP)
|
||||||
|
|
||||||
|
std r0, FLINK_SAVE(SP)
|
||||||
|
|
||||||
stw r0, FZERO
|
|
||||||
|
|
||||||
#ifdef linux
|
#ifdef linux
|
||||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||||
|
@ -162,34 +161,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "zgemm_macros_power9.S"
|
#include "zgemm_macros_power9.S"
|
||||||
|
|
||||||
cmpwi cr0, M, 0
|
|
||||||
ble L999
|
|
||||||
cmpwi cr0, N, 0
|
|
||||||
ble L999
|
|
||||||
cmpwi cr0, K, 0
|
|
||||||
ble L999
|
|
||||||
|
|
||||||
slwi LDC, LDC, ZBASE_SHIFT
|
slwi LDC, LDC, ZBASE_SHIFT
|
||||||
li PRE, 512
|
li PRE, 512
|
||||||
li o8 , 8
|
li r0, 0
|
||||||
li o16 , 16
|
|
||||||
|
|
||||||
addi BBUFFER, SP, 512+4096
|
|
||||||
li T1, -4096
|
|
||||||
and BBUFFER, BBUFFER, T1
|
|
||||||
|
|
||||||
|
|
||||||
addi ALPHA, SP, 296+192
|
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
|
||||||
|
/*negate for this case as we will use addition -1*(a+b) */
|
||||||
xxlor alpha_r,vs1,vs1 /*copy from register f1 */
|
xvnegdp alpha_r,alpha_r
|
||||||
xxlor alpha_i,vs2,vs2 /*copy from register f2 */
|
xvnegdp alpha_i,alpha_i
|
||||||
|
#endif
|
||||||
.align 4
|
.align 4
|
||||||
|
|
||||||
#include "zgemm_logic_power9.S"
|
#include "zgemm_logic_power9.S"
|
||||||
|
|
||||||
L999:
|
L999:
|
||||||
addi r3, 0, 0
|
|
||||||
|
|
||||||
lfd f14, 0(SP)
|
lfd f14, 0(SP)
|
||||||
lfd f15, 8(SP)
|
lfd f15, 8(SP)
|
||||||
|
@ -234,22 +222,22 @@ L999:
|
||||||
ld r15, 272(SP)
|
ld r15, 272(SP)
|
||||||
ld r14, 280(SP)
|
ld r14, 280(SP)
|
||||||
|
|
||||||
lxv v20, 288(SP)
|
ld r0, FLINK_SAVE(SP)
|
||||||
lxv v21, 304(SP)
|
|
||||||
lxv v22, 320(SP)
|
lxv vs52, 288(SP)
|
||||||
lxv v23, 336(SP)
|
lxv vs53, 304(SP)
|
||||||
lxv v24, 352(SP)
|
lxv vs54, 320(SP)
|
||||||
lxv v25, 368(SP)
|
lxv vs55, 336(SP)
|
||||||
lxv v26, 384(SP)
|
lxv vs56, 352(SP)
|
||||||
lxv v27, 400(SP)
|
lxv vs57, 368(SP)
|
||||||
lxv v28, 416(SP)
|
lxv vs58, 384(SP)
|
||||||
lxv v29, 432(SP)
|
lxv vs59, 400(SP)
|
||||||
lxv v30, 448(SP)
|
mtlr r0
|
||||||
lxv v31, 464(SP)
|
lxv vs60, 416(SP)
|
||||||
|
lxv vs61, 432(SP)
|
||||||
|
lxv vs62, 448(SP)
|
||||||
|
lxv vs63, 464(SP)
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
|
||||||
addi SP, SP, STACKSIZE
|
|
||||||
addi SP, SP, STACKSIZE
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
blr
|
blr
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
8
param.h
8
param.h
|
@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 8
|
#define ZGEMM_DEFAULT_UNROLL_M 8
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 640
|
#define SGEMM_DEFAULT_P 832
|
||||||
#define DGEMM_DEFAULT_P 128
|
#define DGEMM_DEFAULT_P 128
|
||||||
#define CGEMM_DEFAULT_P 640
|
#define CGEMM_DEFAULT_P 640
|
||||||
#define ZGEMM_DEFAULT_P 512
|
#define ZGEMM_DEFAULT_P 256
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_Q 1408
|
#define SGEMM_DEFAULT_Q 1025
|
||||||
#define DGEMM_DEFAULT_Q 384
|
#define DGEMM_DEFAULT_Q 384
|
||||||
#define CGEMM_DEFAULT_Q 640
|
#define CGEMM_DEFAULT_Q 640
|
||||||
#define ZGEMM_DEFAULT_Q 1152
|
#define ZGEMM_DEFAULT_Q 1025
|
||||||
|
|
||||||
#define SYMV_P 8
|
#define SYMV_P 8
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue