performance optimizations in sgemm_kernel_16x2_bulldozer.S

wernsaar 2013-06-13 11:35:15 +02:00
parent a789b588cd
commit 0ded1fcc1c
1 changed file with 145 additions and 20 deletions
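The kernel hunks below adjust the instruction schedule inside the KERNEL16x3_x macros (placement of the A-stream prefetcht0, nop padding, and of the vbroadcastss loads of the B values), while the copy hunks further down unroll the B packing loops eight-fold with prefetcht0/prefetchw hints. For orientation, a minimal C sketch of the arithmetic that one macro performs per K-step; the function name and array shapes are illustrative only. In the assembly, the 16x3 accumulator block lives in %xmm4-%xmm15 (four floats per register) and the multiply-adds are FMA4 vfmaddps instructions.

/* Illustrative only: one K-step of the 16x3 micro-kernel that each
 * KERNEL16x3_x macro implements with four 4-wide loads of A, three
 * broadcasts of B and twelve vfmaddps instructions. */
void kernel16x3_step(const float *a,      /* 16 packed A values */
                     const float *b,      /* 3 packed B values  */
                     float c[16][3])      /* accumulator block  */
{
    for (int j = 0; j < 3; j++)           /* the three B broadcasts        */
        for (int i = 0; i < 16; i++)      /* 16 A elements, 4 per register */
            c[i][j] += a[i] * b[j];
}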


@@ -127,17 +127,18 @@
*******************************************************************************************/
#define KERNEL16x3_1(xx) \
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
prefetcht0 A_PR1(AO,%rax,SIZE) ;\
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -146,20 +147,21 @@
vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
#define KERNEL16x3_2(xx) \
prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -168,20 +170,21 @@
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
#define KERNEL16x3_3(xx) \
prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ -190,31 +193,32 @@
vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
#define KERNEL16x3_4(xx) \
prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\
vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
addq $12, BI ;\
vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
addq $12, BI ;\
addq $64, %rax ;\
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
#define KERNEL16x3_SUB(xx) \
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
@@ -223,6 +227,7 @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -248,6 +253,7 @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -261,6 +267,7 @@
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -275,6 +282,7 @@
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -288,6 +296,7 @@
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -303,6 +312,7 @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
nop ;\
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ -1072,15 +1082,74 @@
leaq (B,%rax, SIZE), BO2 // next offset to BO2
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
sarq $3 , %rax // K / 8
jz .L6_01a_2
ALIGN_4
.L6_01a_1:
prefetcht0 512(BO1)
prefetcht0 512(BO2)
prefetchw 512(BO)
vmovsd 0 * SIZE(BO1), %xmm0
vmovsd 2 * SIZE(BO1), %xmm2
vmovsd 4 * SIZE(BO1), %xmm4
vmovsd 6 * SIZE(BO1), %xmm6
vmovss 0 * SIZE(BO2), %xmm1
vmovss 2 * SIZE(BO2), %xmm3
vmovss 4 * SIZE(BO2), %xmm5
vmovss 6 * SIZE(BO2), %xmm7
vmovsd %xmm0, 0*SIZE(BO)
vmovss %xmm1, 2*SIZE(BO)
vmovsd %xmm2, 3*SIZE(BO)
vmovss %xmm3, 5*SIZE(BO)
vmovsd %xmm4, 6*SIZE(BO)
vmovss %xmm5, 8*SIZE(BO)
vmovsd %xmm6, 9*SIZE(BO)
vmovss %xmm7,11*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO2
addq $12*SIZE,BO
vmovsd 0 * SIZE(BO1), %xmm0
vmovsd 2 * SIZE(BO1), %xmm2
vmovsd 4 * SIZE(BO1), %xmm4
vmovsd 6 * SIZE(BO1), %xmm6
vmovss 0 * SIZE(BO2), %xmm1
vmovss 2 * SIZE(BO2), %xmm3
vmovss 4 * SIZE(BO2), %xmm5
vmovss 6 * SIZE(BO2), %xmm7
vmovsd %xmm0, 0*SIZE(BO)
vmovss %xmm1, 2*SIZE(BO)
vmovsd %xmm2, 3*SIZE(BO)
vmovss %xmm3, 5*SIZE(BO)
vmovsd %xmm4, 6*SIZE(BO)
vmovss %xmm5, 8*SIZE(BO)
vmovsd %xmm6, 9*SIZE(BO)
vmovss %xmm7,11*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO2
addq $12*SIZE,BO
decq %rax
jnz .L6_01a_1
.L6_01a_2:
movq K, %rax
andq $7, %rax // K % 8
jz .L6_02c
ALIGN_4
.L6_02b:
vmovss 0 * SIZE(BO1), %xmm0
vmovsd 0 * SIZE(BO1), %xmm0
vmovss 1 * SIZE(BO1), %xmm1
vmovss 0 * SIZE(BO2), %xmm2
vmovss %xmm0, 0*SIZE(BO)
vmovsd %xmm0, 0*SIZE(BO)
vmovss %xmm1, 1*SIZE(BO)
vmovss %xmm2, 2*SIZE(BO)
addq $2*SIZE,BO1
addq $2*SIZE,BO2
@@ -1096,17 +1165,73 @@
leaq (BO1,%rax, SIZE), BO2 // next offset to BO2
leaq BUFFER2, BO // second buffer to BO
movq K, %rax
sarq $3 , %rax // K / 8
jz .L6_02c_2
ALIGN_4
.L6_02c_1:
prefetcht0 512(BO2)
prefetchw 512(BO)
vmovsd 0 * SIZE(BO2), %xmm0
vmovsd 2 * SIZE(BO2), %xmm2
vmovsd 4 * SIZE(BO2), %xmm4
vmovsd 6 * SIZE(BO2), %xmm6
vmovss 1 * SIZE(BO1), %xmm1
vmovss 3 * SIZE(BO1), %xmm3
vmovss 5 * SIZE(BO1), %xmm5
vmovss 7 * SIZE(BO1), %xmm7
vmovss %xmm1, 0*SIZE(BO)
vmovsd %xmm0, 1*SIZE(BO)
vmovss %xmm3, 3*SIZE(BO)
vmovsd %xmm2, 4*SIZE(BO)
vmovss %xmm5, 6*SIZE(BO)
vmovsd %xmm4, 7*SIZE(BO)
vmovss %xmm7, 9*SIZE(BO)
vmovsd %xmm6,10*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO2
addq $12*SIZE,BO
vmovsd 0 * SIZE(BO2), %xmm0
vmovsd 2 * SIZE(BO2), %xmm2
vmovsd 4 * SIZE(BO2), %xmm4
vmovsd 6 * SIZE(BO2), %xmm6
vmovss 1 * SIZE(BO1), %xmm1
vmovss 3 * SIZE(BO1), %xmm3
vmovss 5 * SIZE(BO1), %xmm5
vmovss 7 * SIZE(BO1), %xmm7
vmovss %xmm1, 0*SIZE(BO)
vmovsd %xmm0, 1*SIZE(BO)
vmovss %xmm3, 3*SIZE(BO)
vmovsd %xmm2, 4*SIZE(BO)
vmovss %xmm5, 6*SIZE(BO)
vmovsd %xmm4, 7*SIZE(BO)
vmovss %xmm7, 9*SIZE(BO)
vmovsd %xmm6,10*SIZE(BO)
addq $8*SIZE,BO1
addq $8*SIZE,BO2
addq $12*SIZE,BO
decq %rax
jnz .L6_02c_1
.L6_02c_2:
movq K, %rax
andq $7, %rax // K % 8
jz .L6_03c
ALIGN_4
.L6_03b: .L6_03b:
vmovss 1*SIZE(BO1), %xmm0
vmovss 0*SIZE(BO2), %xmm1
vmovsd 0*SIZE(BO2), %xmm1
vmovss 1*SIZE(BO2), %xmm2
vmovss %xmm0, 0*SIZE(BO)
vmovss %xmm1, 1*SIZE(BO)
vmovsd %xmm1, 1*SIZE(BO)
vmovss %xmm2, 2*SIZE(BO)
addq $2*SIZE,BO1
addq $2*SIZE,BO2
addq $3*SIZE,BO
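For reference, a minimal C sketch (the function name and plain-pointer interface are illustrative, not part of the file) of the packing done by the scalar tail loop .L6_02b above: each K-step copies two floats from BO1 and one float from BO2 into a three-float group of the packed buffer, then advances BO1/BO2 by two elements and BO by three. The new .L6_01a_1 loop performs the same copies eight K-steps per iteration behind prefetcht0/prefetchw hints, and .L6_02c_1/.L6_03b build the second buffer with the mirrored layout (one float from BO1 followed by two from BO2).

/* Illustrative only: packing performed by the .L6_02b tail loop;
 * the unrolled .L6_01a_1 loop does the same work, 8 steps per iteration. */
void pack_b_buffer1(const float *bo1, const float *bo2, float *bo, long k)
{
    for (long i = 0; i < k; i++) {
        bo[0] = bo1[0];   /* vmovsd 0*SIZE(BO1) -> 0*SIZE(BO) (two floats) */
        bo[1] = bo1[1];
        bo[2] = bo2[0];   /* vmovss 0*SIZE(BO2) -> 2*SIZE(BO)              */
        bo1 += 2;
        bo2 += 2;         /* BO2 advances by 2 but contributes one value   */
        bo  += 3;
    }
}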