Fixed #276. Merge branch 'wernsaar-develop' into bulldozer

This commit is contained in:
Zhang Xianyi 2013-08-09 10:49:44 +08:00
commit 49faee1a51
3 changed files with 1085 additions and 20 deletions

View File

@@ -54,9 +54,8 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
#DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
@@ -69,21 +68,4 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#STRMMKERNEL = ../generic/trmmkernel_16x2.c
STRMMKERNEL = sgemm_kernel_16x2_bulldozer.S
#STRMMKERNEL_RT = ../generic/trmmkernel_16x2.c
#STRMMKERNEL_RN = ../generic/trmmkernel_16x2.c
DTRMMKERNEL = dgemm_kernel_8x2_bulldozer.S
#DTRMMKERNEL_RT = ../generic/trmmkernel_8x2.c
#DTRMMKERNEL_RN = ../generic/trmmkernel_8x2.c
CTRMMKERNEL = cgemm_kernel_4x2_bulldozer.S
ZTRMMKERNEL = zgemm_kernel_2x2_bulldozer.S
#ZTRMMKERNEL = ../generic/ztrmmkernel_4x2.c
#ZTRMMKERNEL_RR = ../generic/ztrmmkernel_2x2.c
#ZTRMMKERNEL_RC = ../generic/ztrmmkernel_2x2.c

View File

@@ -84,6 +84,9 @@
#endif #endif
#define A_PR1 384
#define B_PR1 192
.macro KERNEL8x2_SUB .macro KERNEL8x2_SUB
vmovddup -16*SIZE(BO,%rax,2), %xmm1 vmovddup -16*SIZE(BO,%rax,2), %xmm1
@@ -708,9 +711,14 @@
ALIGN_4 ALIGN_4
.L52: .L52:
prefetcht0 A_PR1(AO,%rax,8)
prefetcht0 B_PR1(BO,%rax,2)
KERNEL8x2_SUB KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB KERNEL8x2_SUB
jl .L52 jl .L52

File diff suppressed because it is too large. Load Diff