dtrsm_kernel_LT_8x2_bulldozer.S performance optimization

This commit is contained in:
wernsaar
2013-08-05 11:27:16 +02:00
parent aaeb8eaecd
commit 44d23881b5

View File

@@ -84,6 +84,9 @@
#endif
/*
 * Prefetch displacements, used as the constant offset in the
 * prefetcht0 operands of the .L52 loop:
 *   A_PR1 is added to the AO-based address (index %rax scaled by 8),
 *   B_PR1 is added to the BO-based address (index %rax scaled by 2).
 * NOTE(review): the values look like empirically tuned prefetch
 * distances for the target CPU; the rationale is not shown here.
 */
#define A_PR1 384
#define B_PR1 192
.macro KERNEL8x2_SUB
vmovddup -16*SIZE(BO,%rax,2), %xmm1
@@ -708,9 +711,14 @@
/*
 * .L52: loop body that expands KERNEL8x2_SUB four times per pass, with a
 * prefetcht0 hint issued ahead of every expansion.
 * NOTE(review): only the first line of KERNEL8x2_SUB is visible in this
 * excerpt, so how %rax is advanced between expansions and which
 * instruction produces the flags tested by jl cannot be confirmed here.
 */
ALIGN_4
.L52:
/* Hint the line A_PR1 bytes past AO + %rax*8 into cache. */
prefetcht0 A_PR1(AO,%rax,8)
/* B side is hinted once per pass: B_PR1 bytes past BO + %rax*2. */
prefetcht0 B_PR1(BO,%rax,2)
KERNEL8x2_SUB
/* Same operand as above; presumably %rax has been advanced by the macro,
   so this targets a later address -- TODO confirm against the macro body. */
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB
/* Loop while the flags left by the last KERNEL8x2_SUB expansion mean
   signed "less"; presumably %rax is a negative index counting up to
   zero -- verify against the macro and the loop setup. */
jl .L52