performance optimizations in sgemm_kernel_16x2_bulldozer.S
commit 0ded1fcc1c
parent a789b588cd
@@ -127,17 +127,18 @@
*******************************************************************************************/
#define KERNEL16x3_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -146,20 +147,21 @@
vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
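A note for readers new to FMA4 (Bulldozer's 4-operand fused multiply-add): in the AT&T operand order used here, `vfmaddps %xmm4,%xmm1,%xmm0,%xmm4` computes %xmm4 = %xmm0 * %xmm1 + %xmm4 lane-wise, so each accumulator register is both addend and destination. One `vbroadcastss` splats a single B value across a register and each `vmovups` brings in four A values, so every vfmaddps updates a 4-row-by-1-column patch of the 16x3 block. A scalar C sketch of one such step (the function name and explicit 4-lane width are illustrative, not from the source):

    /* One vfmaddps of the 16x3 kernel, in scalar C.
       acc stands for one of %xmm4..%xmm15, b for the broadcast
       B element, a_vec for the four A values in %xmm0. */
    static void fma4_step(float acc[4], const float a_vec[4], float b)
    {
        for (int lane = 0; lane < 4; lane++)
            acc[lane] = a_vec[lane] * b + acc[lane];
    }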
#define KERNEL16x3_2(xx) \
prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -168,20 +170,21 @@
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\

#define KERNEL16x3_3(xx) \
prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -190,31 +193,32 @@
vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\

#define KERNEL16x3_4(xx) \
prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
addq $12, BI ;\
vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
addq $12, BI ;\
addq $64, %rax ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
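Taken together, the four macros unroll the k loop by 4: xmm4-xmm15 hold a 16x3 accumulator block (four 4-wide row vectors times three B columns), and per pass BI advances by 12 B elements (4 steps x 3) while %rax advances by 64 A elements (4 steps x 16). The duplicated `addq $12, BI` and trailing `vfmaddps %xmm15` lines above are this hunk's before-and-after text: the pointer bump appears hoisted between FMAs so the integer add can issue in the shadow of the FMA chain, a plausible reading of the change rather than anything stated in the commit. A scalar C sketch of the arithmetic performed by one unrolled pass (the accumulator layout is inferred from the register usage):

    #include <stddef.h>

    /* The work of KERNEL16x3_1..4, flattened to scalar C.
       acc[3*i + j] mirrors xmm4..xmm15: A-vector i (rows 4i..4i+3),
       B column j. */
    static void kernel16x3_pass(const float *a, const float *b,
                                float acc[12][4], size_t k4)
    {
        size_t ai = 0, bi = 0;
        for (size_t k = 0; k < 4 * k4; k++) {
            for (int i = 0; i < 4; i++)         /* four vmovups of A  */
                for (int j = 0; j < 3; j++)     /* three broadcast Bs */
                    for (int lane = 0; lane < 4; lane++)
                        acc[3*i + j][lane] += a[ai + 4*i + lane] * b[bi + j];
            ai += 16;  /* the asm batches four of these as addq $64, %rax */
            bi += 3;   /* and four of these as addq $12, BI */
        }
    }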

#define KERNEL16x3_SUB(xx) \
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
@@ -223,6 +227,7 @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -248,6 +253,7 @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -261,6 +267,7 @@
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -275,6 +282,7 @@
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -288,6 +296,7 @@
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -303,6 +312,7 @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -1072,15 +1082,74 @@
leaq (B,%rax, SIZE), BO2 // next offset to BO2
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
sarq $3 , %rax // K / 8
jz .L6_01a_2
ALIGN_4

.L6_01a_1:

prefetcht0 512(BO1)
prefetcht0 512(BO2)
prefetchw 512(BO)

vmovsd 0 * SIZE(BO1), %xmm0
vmovsd 2 * SIZE(BO1), %xmm2
vmovsd 4 * SIZE(BO1), %xmm4
vmovsd 6 * SIZE(BO1), %xmm6
vmovss 0 * SIZE(BO2), %xmm1
vmovss 2 * SIZE(BO2), %xmm3
vmovss 4 * SIZE(BO2), %xmm5
vmovss 6 * SIZE(BO2), %xmm7
vmovsd %xmm0, 0*SIZE(BO)
vmovss %xmm1, 2*SIZE(BO)
vmovsd %xmm2, 3*SIZE(BO)
vmovss %xmm3, 5*SIZE(BO)
vmovsd %xmm4, 6*SIZE(BO)
vmovss %xmm5, 8*SIZE(BO)
vmovsd %xmm6, 9*SIZE(BO)
vmovss %xmm7,11*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO2
addq $12*SIZE,BO

vmovsd 0 * SIZE(BO1), %xmm0
vmovsd 2 * SIZE(BO1), %xmm2
vmovsd 4 * SIZE(BO1), %xmm4
vmovsd 6 * SIZE(BO1), %xmm6
vmovss 0 * SIZE(BO2), %xmm1
vmovss 2 * SIZE(BO2), %xmm3
vmovss 4 * SIZE(BO2), %xmm5
vmovss 6 * SIZE(BO2), %xmm7
vmovsd %xmm0, 0*SIZE(BO)
vmovss %xmm1, 2*SIZE(BO)
vmovsd %xmm2, 3*SIZE(BO)
vmovss %xmm3, 5*SIZE(BO)
vmovsd %xmm4, 6*SIZE(BO)
vmovss %xmm5, 8*SIZE(BO)
vmovsd %xmm6, 9*SIZE(BO)
vmovss %xmm7,11*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO2
addq $12*SIZE,BO

decq %rax
jnz .L6_01a_1
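This copy loop packs B for the 3-column kernels: each row of the packed buffer takes a pair of floats from the BO1 stream (one `vmovsd`) and one float from BO2 (one `vmovss`), written out contiguously 3 floats per row. The body is unrolled twice, so each trip emits 8 rows (matching the K/8 trip count), and the prefetcht0/prefetchw pair keeps the source and destination streams about 512 bytes ahead of the copies. A rough C equivalent follows; the 2-float row strides of both source streams are my reading of the `2 * SIZE` address steps, not something the source states:

    #include <string.h>
    #include <stddef.h>

    /* Sketch of .L6_01a_1: interleave two B streams into a
       3-floats-per-row buffer. */
    static void pack_b_buffer1(const float *bo1, const float *bo2,
                               float *bo, size_t k)
    {
        for (size_t r = 0; r < k; r++) {
            memcpy(&bo[3*r], &bo1[2*r], 2 * sizeof(float)); /* vmovsd */
            bo[3*r + 2] = bo2[2*r];                         /* vmovss */
        }
    }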

.L6_01a_2:

movq K, %rax
andq $7, %rax // K % 8
jz .L6_02c
ALIGN_4

.L6_02b:
vmovss 0 * SIZE(BO1), %xmm0
vmovss 1 * SIZE(BO1), %xmm1
vmovsd 0 * SIZE(BO1), %xmm0
vmovss 0 * SIZE(BO2), %xmm2
vmovss %xmm0, 0*SIZE(BO)
vmovss %xmm1, 1*SIZE(BO)
vmovsd %xmm0, 0*SIZE(BO)
vmovss %xmm2, 2*SIZE(BO)
addq $2*SIZE,BO1
addq $2*SIZE,BO2
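This K%8 remainder is where the commit's copy-path change is easiest to see: the adjacent `vmovss` pairs and the single `vmovsd` above are this hunk's old and new lines side by side. Two 4-byte scalar moves to consecutive addresses are fused into one 8-byte move, halving the load and store count for those elements. In C terms (illustrative names; it assumes the two floats are contiguous, as the offsets show):

    #include <string.h>

    /* Before: two 4-byte moves (vmovss x2). */
    static void copy_pair_old(float *dst, const float *src)
    {
        dst[0] = src[0];
        dst[1] = src[1];
    }

    /* After: one 8-byte move (vmovsd); same bytes, half the memory ops. */
    static void copy_pair_new(float *dst, const float *src)
    {
        memcpy(dst, src, 2 * sizeof(float));
    }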
@@ -1096,17 +1165,73 @@
leaq (BO1,%rax, SIZE), BO2 // next offset to BO2
leaq BUFFER2, BO // second buffer to BO
movq K, %rax
sarq $3 , %rax // K / 8
jz .L6_02c_2
ALIGN_4

.L6_02c_1:

prefetcht0 512(BO2)
prefetchw 512(BO)

vmovsd 0 * SIZE(BO2), %xmm0
vmovsd 2 * SIZE(BO2), %xmm2
vmovsd 4 * SIZE(BO2), %xmm4
vmovsd 6 * SIZE(BO2), %xmm6
vmovss 1 * SIZE(BO1), %xmm1
vmovss 3 * SIZE(BO1), %xmm3
vmovss 5 * SIZE(BO1), %xmm5
vmovss 7 * SIZE(BO1), %xmm7
vmovss %xmm1, 0*SIZE(BO)
vmovsd %xmm0, 1*SIZE(BO)
vmovss %xmm3, 3*SIZE(BO)
vmovsd %xmm2, 4*SIZE(BO)
vmovss %xmm5, 6*SIZE(BO)
vmovsd %xmm4, 7*SIZE(BO)
vmovss %xmm7, 9*SIZE(BO)
vmovsd %xmm6,10*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO2
addq $12*SIZE,BO

vmovsd 0 * SIZE(BO2), %xmm0
vmovsd 2 * SIZE(BO2), %xmm2
vmovsd 4 * SIZE(BO2), %xmm4
vmovsd 6 * SIZE(BO2), %xmm6
vmovss 1 * SIZE(BO1), %xmm1
vmovss 3 * SIZE(BO1), %xmm3
vmovss 5 * SIZE(BO1), %xmm5
vmovss 7 * SIZE(BO1), %xmm7
vmovss %xmm1, 0*SIZE(BO)
vmovsd %xmm0, 1*SIZE(BO)
vmovss %xmm3, 3*SIZE(BO)
vmovsd %xmm2, 4*SIZE(BO)
vmovss %xmm5, 6*SIZE(BO)
vmovsd %xmm4, 7*SIZE(BO)
vmovss %xmm7, 9*SIZE(BO)
vmovsd %xmm6,10*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO2
addq $12*SIZE,BO

decq %rax
jnz .L6_02c_1
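The BUFFER2 pass mirrors the first copy loop with the per-row layout rotated: slot 0 of each packed row comes from the BO1 stream (odd offsets 1, 3, 5, 7) and slots 1-2 from a BO2 pair, again unrolled twice for 8 rows per trip with the same 512-byte prefetch distance. A sketch under the same stride assumptions as before:

    #include <string.h>
    #include <stddef.h>

    /* Sketch of .L6_02c_1: slot 0 from bo1, slots 1-2 from bo2. */
    static void pack_b_buffer2(const float *bo1, const float *bo2,
                               float *bo, size_t k)
    {
        for (size_t r = 0; r < k; r++) {
            bo[3*r] = bo1[2*r + 1];                             /* vmovss */
            memcpy(&bo[3*r + 1], &bo2[2*r], 2 * sizeof(float)); /* vmovsd */
        }
    }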

.L6_02c_2:

movq K, %rax
andq $7, %rax // K % 8
jz .L6_03c
ALIGN_4

.L6_03b:
vmovss 1*SIZE(BO1), %xmm0
vmovss 0*SIZE(BO2), %xmm1
vmovss 1*SIZE(BO2), %xmm2
vmovsd 0*SIZE(BO2), %xmm1
vmovss %xmm0, 0*SIZE(BO)
vmovss %xmm1, 1*SIZE(BO)
vmovss %xmm2, 2*SIZE(BO)
vmovsd %xmm1, 1*SIZE(BO)
addq $2*SIZE,BO1
addq $2*SIZE,BO2
addq $3*SIZE,BO
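As in .L6_02b, the old `vmovss` pairs and the new `vmovsd` lines of this remainder sit next to each other in the hunk; here the fused 8-byte move covers packed slots 1 and 2 (the BO2 pair stored at `1*SIZE(BO)`), while slot 0 keeps its scalar `vmovss` from BO1.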