dtrsm_kernel_LT_8x2_bulldozer.S performance optimization

This commit is contained in:
wernsaar 2013-08-05 11:27:16 +02:00
parent aaeb8eaecd
commit 44d23881b5
1 changed file with 8 additions and 0 deletions

View File

@@ -84,6 +84,9 @@
#endif
#define A_PR1 384
#define B_PR1 192
.macro KERNEL8x2_SUB
vmovddup -16*SIZE(BO,%rax,2), %xmm1
@@ -708,9 +711,14 @@
/*
 * .L52 -- loop body unrolled four times.
 *
 * Each pass through the loop expands KERNEL8x2_SUB four times. A
 * software prefetch of the A stream is issued before every expansion,
 * and a single prefetch of the B stream is issued once per pass.
 *
 * Prefetch addresses (AT&T disp(base,index,scale) form):
 *   A:  AO + %rax*8 + A_PR1   (A_PR1 is #defined as 384 in this file)
 *   B:  BO + %rax*2 + B_PR1   (B_PR1 is #defined as 192 in this file)
 *
 * NOTE(review): the index scales 8 and 2 presumably mirror the 8x2
 * tile shape in the file name -- confirm against the packing layout.
 * NOTE(review): the same A prefetch operand is written four times; it
 * only targets different lines if KERNEL8x2_SUB advances %rax. The
 * macro body is not visible here -- confirm.
 */
ALIGN_4
.L52:
/* Start of pass: hint both the A and B streams into cache (t0 hint). */
prefetcht0 A_PR1(AO,%rax,8)
prefetcht0 B_PR1(BO,%rax,2)
KERNEL8x2_SUB
/* Unrolled step 2: A-stream prefetch only. */
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB
/* Unrolled step 3: A-stream prefetch only. */
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB
/* Unrolled step 4: A-stream prefetch only. */
prefetcht0 A_PR1(AO,%rax,8)
KERNEL8x2_SUB
/*
 * Repeat while the signed "less than" condition holds (SF != OF).
 * prefetcht0 does not modify flags, so the condition tested here must
 * be left by the last KERNEL8x2_SUB expansion -- presumably an update
 * of the %rax index; verify against the macro definition.
 */
jl .L52