From 0ded1fcc1c10147ff94169428d1167a833b0c9cd Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Thu, 13 Jun 2013 11:35:15 +0200
Subject: [PATCH] performance optimizations in sgemm_kernel_16x2_bulldozer.S

---
 kernel/x86_64/sgemm_kernel_16x2_bulldozer.S | 165 +++++++++++++++++---
 1 file changed, 145 insertions(+), 20 deletions(-)

diff --git a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
index 2e10fae71..f02a1dfa5 100644
--- a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
+++ b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
@@ -127,17 +127,18 @@
 *******************************************************************************************/

 #define KERNEL16x3_1(xx) \
-	prefetcht0	A_PR1(AO,%rax,SIZE)	;\
 	vbroadcastss	-6 * SIZE(BO, BI, SIZE), %xmm1 ;\
 	vmovups		-32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm4,%xmm1,%xmm0,%xmm4 ;\
 	vbroadcastss	-5 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		-28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
 	vfmaddps	%xmm8,%xmm2,%xmm0,%xmm8 ;\
+	prefetcht0	A_PR1(AO,%rax,SIZE)	;\
 	vfmaddps	%xmm9,%xmm3,%xmm0,%xmm9 ;\
 	vmovups		-24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -146,20 +147,21 @@
 	vmovups		-20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm13,%xmm1,%xmm0,%xmm13 ;\
 	vfmaddps	%xmm14,%xmm2,%xmm0,%xmm14 ;\
+	vbroadcastss	-3 * SIZE(BO, BI, SIZE), %xmm1 ;\
+	vbroadcastss	-2 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm15,%xmm3,%xmm0,%xmm15 ;\

 #define KERNEL16x3_2(xx) \
-	prefetcht0	A_PR1+64(AO,%rax,SIZE)	;\
-	vbroadcastss	-3 * SIZE(BO, BI, SIZE), %xmm1 ;\
 	vmovups		-16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm4,%xmm1,%xmm0,%xmm4 ;\
-	vbroadcastss	-2 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	-1 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		-12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
 	vfmaddps	%xmm8,%xmm2,%xmm0,%xmm8 ;\
+	prefetcht0	A_PR1+64(AO,%rax,SIZE)	;\
 	vfmaddps	%xmm9,%xmm3,%xmm0,%xmm9 ;\
 	vmovups		-8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -168,20 +170,21 @@
 	vmovups		-4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm13,%xmm1,%xmm0,%xmm13 ;\
 	vfmaddps	%xmm14,%xmm2,%xmm0,%xmm14 ;\
+	vbroadcastss	0 * SIZE(BO, BI, SIZE), %xmm1 ;\
+	vbroadcastss	1 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm15,%xmm3,%xmm0,%xmm15 ;\

 #define KERNEL16x3_3(xx) \
-	prefetcht0	A_PR1+128(AO,%rax,SIZE)	;\
-	vbroadcastss	0 * SIZE(BO, BI, SIZE), %xmm1 ;\
 	vmovups		0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm4,%xmm1,%xmm0,%xmm4 ;\
-	vbroadcastss	1 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	2 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
 	vfmaddps	%xmm8,%xmm2,%xmm0,%xmm8 ;\
+	prefetcht0	A_PR1+128(AO,%rax,SIZE)	;\
 	vfmaddps	%xmm9,%xmm3,%xmm0,%xmm9 ;\
 	vmovups		8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -190,31 +193,32 @@
 	vmovups		12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm13,%xmm1,%xmm0,%xmm13 ;\
 	vfmaddps	%xmm14,%xmm2,%xmm0,%xmm14 ;\
+	vbroadcastss	3 * SIZE(BO, BI, SIZE), %xmm1 ;\
+	vbroadcastss	4 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm15,%xmm3,%xmm0,%xmm15 ;\

 #define KERNEL16x3_4(xx) \
-	prefetcht0	A_PR1+192(AO,%rax,SIZE)	;\
-	vbroadcastss	3 * SIZE(BO, BI, SIZE), %xmm1 ;\
 	vmovups		16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm4,%xmm1,%xmm0,%xmm4 ;\
-	vbroadcastss	4 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	5 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
 	vfmaddps	%xmm8,%xmm2,%xmm0,%xmm8 ;\
+	prefetcht0	A_PR1+192(AO,%rax,SIZE)	;\
 	vfmaddps	%xmm9,%xmm3,%xmm0,%xmm9 ;\
 	vmovups		24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm10,%xmm1,%xmm0,%xmm10 ;\
 	vfmaddps	%xmm11,%xmm2,%xmm0,%xmm11 ;\
+	addq	$12, BI	;\
 	vfmaddps	%xmm12,%xmm3,%xmm0,%xmm12 ;\
 	vmovups		28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm13,%xmm1,%xmm0,%xmm13 ;\
 	vfmaddps	%xmm14,%xmm2,%xmm0,%xmm14 ;\
-	vfmaddps	%xmm15,%xmm3,%xmm0,%xmm15 ;\
-	addq	$12, BI	;\
 	addq	$64, %rax ;\
+	vfmaddps	%xmm15,%xmm3,%xmm0,%xmm15 ;\

 #define KERNEL16x3_SUB(xx) \
 	vbroadcastss	-6 * SIZE(BO, BI, SIZE), %xmm1 ;\
@@ -223,6 +227,7 @@
 	vbroadcastss	-5 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		-28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -248,6 +253,7 @@
 	vbroadcastss	-5 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		-28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -261,6 +267,7 @@
 	vbroadcastss	-2 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	-1 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		-20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -275,6 +282,7 @@
 	vbroadcastss	1 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	2 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		-12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -288,6 +296,7 @@
 	vbroadcastss	4 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	5 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		-4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -303,6 +312,7 @@
 	vbroadcastss	-5 * SIZE(BO, BI, SIZE), %xmm2 ;\
 	vfmaddps	%xmm5,%xmm2,%xmm0,%xmm5 ;\
 	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %xmm3 ;\
+	nop		;\
 	vfmaddps	%xmm6,%xmm3,%xmm0,%xmm6 ;\
 	vmovups		-28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
 	vfmaddps	%xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -1072,15 +1082,74 @@
 	leaq	(B,%rax, SIZE), BO2	// next offset to BO2
 	leaq	BUFFER1, BO		// first buffer to BO
 	movq	K, %rax
+	sarq	$3 , %rax		// K / 8
+	jz	.L6_01a_2
 	ALIGN_4

+.L6_01a_1:
+
+	prefetcht0	512(BO1)
+	prefetcht0	512(BO2)
+	prefetchw	512(BO)
+
+	vmovsd	0 * SIZE(BO1), %xmm0
+	vmovsd	2 * SIZE(BO1), %xmm2
+	vmovsd	4 * SIZE(BO1), %xmm4
+	vmovsd	6 * SIZE(BO1), %xmm6
+	vmovss	0 * SIZE(BO2), %xmm1
+	vmovss	2 * SIZE(BO2), %xmm3
+	vmovss	4 * SIZE(BO2), %xmm5
+	vmovss	6 * SIZE(BO2), %xmm7
+	vmovsd	%xmm0, 0*SIZE(BO)
+	vmovss	%xmm1, 2*SIZE(BO)
+	vmovsd	%xmm2, 3*SIZE(BO)
+	vmovss	%xmm3, 5*SIZE(BO)
+	vmovsd	%xmm4, 6*SIZE(BO)
+	vmovss	%xmm5, 8*SIZE(BO)
+	vmovsd	%xmm6, 9*SIZE(BO)
+	vmovss	%xmm7,11*SIZE(BO)
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO2
+	addq	$12*SIZE,BO
+
+	vmovsd	0 * SIZE(BO1), %xmm0
+	vmovsd	2 * SIZE(BO1), %xmm2
+	vmovsd	4 * SIZE(BO1), %xmm4
+	vmovsd	6 * SIZE(BO1), %xmm6
+	vmovss	0 * SIZE(BO2), %xmm1
+	vmovss	2 * SIZE(BO2), %xmm3
+	vmovss	4 * SIZE(BO2), %xmm5
+	vmovss	6 * SIZE(BO2), %xmm7
+	vmovsd	%xmm0, 0*SIZE(BO)
+	vmovss	%xmm1, 2*SIZE(BO)
+	vmovsd	%xmm2, 3*SIZE(BO)
+	vmovss	%xmm3, 5*SIZE(BO)
+	vmovsd	%xmm4, 6*SIZE(BO)
+	vmovss	%xmm5, 8*SIZE(BO)
+	vmovsd	%xmm6, 9*SIZE(BO)
+	vmovss	%xmm7,11*SIZE(BO)
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO2
+	addq	$12*SIZE,BO
+
+	decq	%rax
+	jnz	.L6_01a_1
+
+
+
+.L6_01a_2:
+
+	movq	K, %rax
+	andq	$7, %rax		// K % 8
+	jz	.L6_02c
+	ALIGN_4
+
+
 .L6_02b:

-	vmovss	0 * SIZE(BO1), %xmm0
-	vmovss	1 * SIZE(BO1), %xmm1
+	vmovsd	0 * SIZE(BO1), %xmm0
 	vmovss	0 * SIZE(BO2), %xmm2
-	vmovss	%xmm0, 0*SIZE(BO)
-	vmovss	%xmm1, 1*SIZE(BO)
+	vmovsd	%xmm0, 0*SIZE(BO)
 	vmovss	%xmm2, 2*SIZE(BO)
 	addq	$2*SIZE,BO1
 	addq	$2*SIZE,BO2
@@ -1096,17 +1165,73 @@
 	leaq	(BO1,%rax, SIZE), BO2	// next offset to BO2
 	leaq	BUFFER2, BO		// second buffer to BO
 	movq	K, %rax
+	sarq	$3 , %rax		// K / 8
+	jz	.L6_02c_2
 	ALIGN_4

+.L6_02c_1:
+
+	prefetcht0	512(BO2)
+	prefetchw	512(BO)
+
+	vmovsd	0 * SIZE(BO2), %xmm0
+	vmovsd	2 * SIZE(BO2), %xmm2
+	vmovsd	4 * SIZE(BO2), %xmm4
+	vmovsd	6 * SIZE(BO2), %xmm6
+	vmovss	1 * SIZE(BO1), %xmm1
+	vmovss	3 * SIZE(BO1), %xmm3
+	vmovss	5 * SIZE(BO1), %xmm5
+	vmovss	7 * SIZE(BO1), %xmm7
+	vmovss	%xmm1, 0*SIZE(BO)
+	vmovsd	%xmm0, 1*SIZE(BO)
+	vmovss	%xmm3, 3*SIZE(BO)
+	vmovsd	%xmm2, 4*SIZE(BO)
+	vmovss	%xmm5, 6*SIZE(BO)
+	vmovsd	%xmm4, 7*SIZE(BO)
+	vmovss	%xmm7, 9*SIZE(BO)
+	vmovsd	%xmm6,10*SIZE(BO)
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO2
+	addq	$12*SIZE,BO
+
+
+	vmovsd	0 * SIZE(BO2), %xmm0
+	vmovsd	2 * SIZE(BO2), %xmm2
+	vmovsd	4 * SIZE(BO2), %xmm4
+	vmovsd	6 * SIZE(BO2), %xmm6
+	vmovss	1 * SIZE(BO1), %xmm1
+	vmovss	3 * SIZE(BO1), %xmm3
+	vmovss	5 * SIZE(BO1), %xmm5
+	vmovss	7 * SIZE(BO1), %xmm7
+	vmovss	%xmm1, 0*SIZE(BO)
+	vmovsd	%xmm0, 1*SIZE(BO)
+	vmovss	%xmm3, 3*SIZE(BO)
+	vmovsd	%xmm2, 4*SIZE(BO)
+	vmovss	%xmm5, 6*SIZE(BO)
+	vmovsd	%xmm4, 7*SIZE(BO)
+	vmovss	%xmm7, 9*SIZE(BO)
+	vmovsd	%xmm6,10*SIZE(BO)
+	addq	$8*SIZE,BO1
+	addq	$8*SIZE,BO2
+	addq	$12*SIZE,BO
+
+	decq	%rax
+	jnz	.L6_02c_1
+
+
+.L6_02c_2:
+
+	movq	K, %rax
+	andq	$7, %rax		// K % 8
+	jz	.L6_03c
+	ALIGN_4

 .L6_03b:

 	vmovss	1*SIZE(BO1), %xmm0
-	vmovss	0*SIZE(BO2), %xmm1
-	vmovss	1*SIZE(BO2), %xmm2
+	vmovsd	0*SIZE(BO2), %xmm1
 	vmovss	%xmm0, 0*SIZE(BO)
-	vmovss	%xmm1, 1*SIZE(BO)
-	vmovss	%xmm2, 2*SIZE(BO)
+	vmovsd	%xmm1, 1*SIZE(BO)
 	addq	$2*SIZE,BO1
 	addq	$2*SIZE,BO2
 	addq	$3*SIZE,BO
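
Note on the new copy loops (not part of the patch itself): .L6_01a_1 and the rewritten .L6_02b tail pack the first B buffer three floats wide, taking two floats per k-iteration from BO1 and one from BO2, unrolled by eight k-iterations (the sarq $3 / andq $7 split) with prefetcht0/prefetchw hints. The C sketch below only models that copy pattern; the names pack_b_3wide, bo1, bo2, bo and k are illustrative and do not exist in OpenBLAS, and the prefetch hints and two-float vmovsd moves are left out.

/* Illustrative model of the 3-wide B packing performed by .L6_01a_1 / .L6_02b. */
#include <stddef.h>

static void pack_b_3wide(const float *bo1, const float *bo2,
                         float *bo, size_t k)
{
    size_t i = 0;

    /* Unrolled part: 8 k-iterations per trip, as selected by sarq $3, %rax. */
    for (; i + 8 <= k; i += 8) {
        for (int j = 0; j < 8; j++) {
            bo[3 * j + 0] = bo1[2 * j + 0];   /* vmovsd from BO1: two floats */
            bo[3 * j + 1] = bo1[2 * j + 1];
            bo[3 * j + 2] = bo2[2 * j + 0];   /* vmovss from BO2: one float  */
        }
        bo1 += 16;   /* two addq $8*SIZE,BO1 per trip  */
        bo2 += 16;   /* two addq $8*SIZE,BO2 per trip  */
        bo  += 24;   /* two addq $12*SIZE,BO per trip  */
    }

    /* Remainder: K % 8 iterations, matching the new .L6_02b tail loop. */
    for (; i < k; i++) {
        bo[0] = bo1[0];
        bo[1] = bo1[1];
        bo[2] = bo2[0];
        bo1 += 2;
        bo2 += 2;
        bo  += 3;
    }
}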