From f76f9525477785bb452699c07d1985ec14dc2b61 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 16:17:43 +0800 Subject: [PATCH 01/16] Refs #83 #53. Adding Intel Sandy Bridge (AVX supported) kernel codes for BLAS level 3 functions. --- kernel/generic/zgemm_ncopy_4_sandy.c | 235 ++ kernel/generic/zgemm_ncopy_8_sandy.c | 401 +++ kernel/generic/zgemm_tcopy_4_sandy.c | 237 ++ kernel/generic/zgemm_tcopy_8_sandy.c | 370 ++ kernel/x86_64/KERNEL.SANDYBRIDGE | 97 +- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 4478 ++++++++++++++++++++++++ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 3186 +++++++++++++++++ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 3736 ++++++++++++++++++++ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 3257 +++++++++++++++++ param.h | 100 +- 10 files changed, 15982 insertions(+), 115 deletions(-) create mode 100644 kernel/generic/zgemm_ncopy_4_sandy.c create mode 100644 kernel/generic/zgemm_ncopy_8_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_4_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_8_sandy.c create mode 100644 kernel/x86_64/cgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/dgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/sgemm_kernel_8x8_sandy.S create mode 100644 kernel/x86_64/zgemm_kernel_4x4_sandy.S diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c new file mode 100644 index 000000000..839bd5939 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -0,0 +1,235 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + **********************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2,*dest4; + ii = col&-8; + ii = ii*(2*row); + dest4 = dest+ii; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j Date: Tue, 19 Jun 2012 17:29:06 +0800 Subject: [PATCH 02/16] Fixed dynamic_arch building bug. --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index db9ec6a3b..27aeeb6ac 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define SNUMOPT 8 #define DNUMOPT 4 From 3ef96aa567e27ab76f07701b37da1ca0c0c59f39 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 20:37:22 +0800 Subject: [PATCH 03/16] Fixed bug in MOVQ redefine and ALIGN SIZE problem. --- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 137 +++++++++--------- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 163 +++++++++++----------- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 185 +++++++++++++------------ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 118 ++++++++-------- 4 files changed, 304 insertions(+), 299 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 2b4e4dc64..56ebee120 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
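## Note on the hunk below: it adds "#undef MOVQ" ahead of "#define MOVQ movq".
## MOVQ appears to be defined already by one of the shared OpenBLAS headers, so
## redefining it here without the #undef can trigger a preprocessor redefinition
## clash; the #undef lets the kernel-local alias (the plain 64-bit GPR move)
## take effect cleanly.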
#define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -305,7 +306,7 @@ movq %r11, kk; MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -320,7 +321,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -367,7 +368,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; # Computing kernel @@ -591,7 +592,7 @@ ADD2_SY yvec7, yvec8, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -599,7 +600,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_loopB: ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) @@ -717,7 +718,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_loopB:; ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) @@ -875,7 +876,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 32 +ALIGN_5 EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; EXTRA_SY $1,yvec13,xvec5; @@ -934,7 +935,7 @@ ADDQ $16*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32 +ALIGN_5 .L4_loopEx: EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL @@ -1077,11 +1078,11 @@ ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32; +ALIGN_5; .L1_loopE:; TEST $4, bm; JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1113,7 +1114,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1242,7 +1243,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L8_bodyB; -.align 32 +ALIGN_5 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1250,7 +1251,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1323,7 +1324,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L10_loopE; -.align 32 +ALIGN_5 .L10_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1494,7 +1495,7 @@ ADDQ $8*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; -.align 32 +ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1527,7 +1528,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1652,7 +1653,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; -.align 32 +ALIGN_5 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1660,7 +1661,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1731,7 +1732,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L13_loopE; -.align 32 +ALIGN_5 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1875,7 +1876,7 @@ ADDQ $4*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; 
-.align 32 +ALIGN_5 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1905,7 +1906,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 32 +ALIGN_5 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1978,7 +1979,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 32 +ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1986,7 +1987,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L15_loopE; -.align 32 +ALIGN_5 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2031,7 +2032,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L16_loopE; -.align 32 +ALIGN_5 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2129,11 +2130,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32 +ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2145,7 +2146,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; -.align 32 +ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2181,7 +2182,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32 +ALIGN_5 .L211_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2430,7 +2431,7 @@ ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2438,7 +2439,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L212_loopE; -.align 32 +ALIGN_5 .L212_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2571,7 +2572,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2825,11 +2826,11 @@ ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 32 +ALIGN_5 .L21_loopE: TEST $4, bm; JLE .L22_loopE; -.align 32 +ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2862,7 +2863,7 @@ MOVQ %rax, kkk; SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3002,7 +3003,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3010,7 +3011,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3089,7 +3090,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3237,7 +3238,7 @@ ADDQ $8*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; -.align 32 +ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3267,7 +3268,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3351,7 +3352,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3359,7 +3360,7 
@@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3409,7 +3410,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3503,7 +3504,7 @@ ADDQ $4*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; -.align 32 +ALIGN_5 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3532,7 +3533,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 32 +ALIGN_5 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3585,7 +3586,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L242_loopE; -.align 32 +ALIGN_5 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3616,7 +3617,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L243_loopE; -.align 32 +ALIGN_5 .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3684,7 +3685,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -3695,7 +3696,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: MOVQ bb, ptrbb; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) @@ -3727,7 +3728,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3800,7 +3801,7 @@ ADDQ $64*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3808,7 +3809,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3853,7 +3854,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3941,11 +3942,11 @@ ADDQ $8, kk; ADDQ $16*SIZE, C0; DECQ i; JG .L31_bodyB; -.align 32 +ALIGN_5 .L31_loopE: TEST $4, bm; JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3974,7 +3975,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4023,7 +4024,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4031,7 +4032,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4064,7 +4065,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4128,7 +4129,7 @@ ADDQ $8*SIZE, C0; .L32_loopE: TEST $2, bm; JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -4157,7 +4158,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4202,7 +4203,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ 
k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4210,7 +4211,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4241,7 +4242,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4300,7 +4301,7 @@ ADDQ $4*SIZE, C0; .L33_loopE: TEST $1, bm; JLE .L34_loopE; -.align 32 +ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -4329,7 +4330,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 32 +ALIGN_5 .L341_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -4354,7 +4355,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L341_bodyB; -.align 32 +ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4362,7 +4363,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L342_loopE; -.align 32 +ALIGN_5 .L342_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -4383,7 +4384,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L343_loopE; -.align 32 +ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index fea5ecb4a..c98879d7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -140,6 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JNE jne #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -265,7 +266,7 @@ movq %r11, kk MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -281,7 +282,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -328,7 +329,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; # Computing kernel @@ -448,7 +449,7 @@ ADD_DY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; PREFETCH2 0*SIZE(prebb); ADDQ $8*SIZE, prebb; @@ -459,7 +460,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_bodyB: #### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) @@ -529,7 +530,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_bodyB:; #### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) @@ -588,7 +589,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; # Unalign part write back -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; @@ -648,7 +649,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32; +ALIGN_5; .L4_loopEx:; EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL @@ -776,11 +777,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32 +ALIGN_5 .L1_loopE:; TEST $4, bm; # Rm = 4 JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -816,7 +817,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L6_loopE; -.align 32; +ALIGN_5; .L6_bodyB:; # Computing 
kernel @@ -887,7 +888,7 @@ MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; DECQ k; JG .L6_bodyB; -.align 32 +ALIGN_5 .L6_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -896,7 +897,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB:; #### Untoll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; @@ -940,7 +941,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB:; #### Untoll time 1 #### MUL_DY yvec0, yvec2, yvec6; @@ -977,7 +978,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L8_loopEx; # Unalign part write back -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; @@ -1014,7 +1015,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 32 +ALIGN_5 .L8_loopEx:; EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; @@ -1080,7 +1081,7 @@ ADDQ $4*SIZE, C1; .L5_loopE:; TEST $2, bm; JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1117,7 +1118,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; -.align 32; +ALIGN_5; .L10_bodyB:; # Computing kernel @@ -1192,7 +1193,7 @@ MUL_DX xvec1, xvec5; ADD_DX xvec5, xvec9; DECQ k; JG .L10_bodyB; -.align 32 +ALIGN_5 .L10_loopE:; #ifndef TRMMKERNEL TEST $2, bk @@ -1201,7 +1202,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB:; ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; @@ -1248,7 +1249,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB:; SHUF_DX $0x4e, xvec3, xvec5; MUL_DX xvec0, xvec2; @@ -1285,7 +1286,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L12_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec13; @@ -1310,7 +1311,7 @@ ADDQ $2, kk ADDQ $2*SIZE, C0 ADDQ $2*SIZE, C1 JMP .L9_loopE; -.align 32 +ALIGN_5 .L12_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec14; @@ -1349,7 +1350,7 @@ ADDQ $2*SIZE, C1; .L9_loopE:; TEST $1, bm JLE .L13_loopE; -.align 32 +ALIGN_5 .L13_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1379,7 +1380,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 32 +ALIGN_5 .L14_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1404,7 +1405,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 32 +ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1413,7 +1414,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L15_loopE; -.align 32 +ALIGN_5 .L15_bodyB: BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1434,7 +1435,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L16_loopE; -.align 32 +ALIGN_5 .L16_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1485,11 +1486,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32; +ALIGN_5; .L20_loopB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1501,7 +1502,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; # Rm = 8 JLE .L21_loopE; -.align 32; +ALIGN_5; .L21_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1538,7 +1539,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; 
-.align 32; +ALIGN_5; .L211_bodyB: # Computing kernel #### Unroll time 1 #### @@ -1692,7 +1693,7 @@ MUL_DX xvec3, xvec7; ADD_DX xvec7, xvec8; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1701,7 +1702,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L212_loopE; -.align 32; +ALIGN_5; .L212_bodyB: # Computing kernel #### Unroll time 1 #### @@ -1788,7 +1789,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -1858,7 +1859,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -1893,7 +1894,7 @@ ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; -.align 32 +ALIGN_5 .L213_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -1956,7 +1957,7 @@ JG .L21_bodyB; .L21_loopE:; TEST $4, bm; # Rm = 4 JLE .L22_loopE; -.align 32; +ALIGN_5; .L22_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1989,7 +1990,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB:; # Computing kernel #### Unroll time 1 #### @@ -2071,7 +2072,7 @@ MUL_DX xvec1, xvec5; ADD_DX xvec5, xvec10; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -2080,7 +2081,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2129,7 +2130,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2171,7 +2172,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L223_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -2196,7 +2197,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L22_loopE; -.align 32 +ALIGN_5 .L223_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2237,7 +2238,7 @@ ADDQ $4*SIZE, C1; .L22_loopE:; TEST $2, bm; # Rm = 2 JLE .L23_loopE; -.align 32; +ALIGN_5; .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2267,7 +2268,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: # Computing kernel #### Unroll time 1 #### @@ -2309,7 +2310,7 @@ ADD_DX xvec5, xvec11; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2318,7 +2319,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2347,7 +2348,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2373,7 +2374,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L233_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -2394,7 +2395,7 @@ ADDQ $2, kk; ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; JMP .L23_loopE; -.align 32 +ALIGN_5 .L233_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2425,7 +2426,7 @@ ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; # Rm = 1 JLE .L24_loopE; -.align 32; +ALIGN_5; .L24_bodyB: #if 
!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2454,7 +2455,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 32 +ALIGN_5 .L241_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2479,7 +2480,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align 32 +ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2488,7 +2489,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L242_loopE; -.align 32 +ALIGN_5 .L242_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2509,7 +2510,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L243_loopE; -.align 32 +ALIGN_5 .L243_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2551,7 +2552,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE:; TEST $1, bn; # Rn = 1 JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; @@ -2562,7 +2563,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2593,7 +2594,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2634,7 +2635,7 @@ ADD_DY yvec4, yvec14, yvec14; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2643,7 +2644,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2673,7 +2674,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2696,7 +2697,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L313_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; @@ -2724,7 +2725,7 @@ ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; JMP .L31_loopE; -.align 32 +ALIGN_5 .L313_loopEx: EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; @@ -2766,7 +2767,7 @@ JG .L31_bodyB; .L31_loopE: TEST $4, bm JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2796,7 +2797,7 @@ MOVQ %rax, kkk #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2821,7 +2822,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2830,7 +2831,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2852,7 +2853,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2870,7 +2871,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L323_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL @@ -2891,7 +2892,7 @@ ADDQ $4, kk #endif ADDQ $4*SIZE, C0; JMP .L32_loopE; -.align 32 +ALIGN_5 .L323_loopEx: #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; @@ -2921,7 +2922,7 @@ ADDQ $4*SIZE, C0; 
.L32_loopE: TEST $2, bm JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -2951,7 +2952,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -2976,7 +2977,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2,bk; @@ -2985,7 +2986,7 @@ MOVQ kkk, %rax; TEST $2, %rax #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3006,7 +3007,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3039,7 +3040,7 @@ ADDQ $2*SIZE, C0; .L33_loopE: TEST $1, bm JLE .L34_loopE; -.align 32 +ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3068,7 +3069,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 32 +ALIGN_5 .L341_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; @@ -3093,7 +3094,7 @@ addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L341_bodyB; -.align 32 +ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3102,7 +3103,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L342_loopE; -.align 32 +ALIGN_5 .L342_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; @@ -3124,7 +3125,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L343_loopE; -.align 32 +ALIGN_5 .L343_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 44f8f1802..4d16a60d0 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -273,7 +274,7 @@ movq %r11, kk MOVQ bn,j; SARQ $3,j; JLE .L0_loopE; -.align 16; +ALIGN_4; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -289,7 +290,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; JLE .L1_loopE; -.align 16; +ALIGN_4; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -342,7 +343,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; -.align 16; +ALIGN_4; .L2_bodyB:; # Computing kernel @@ -472,7 +473,7 @@ ADD_SY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_4 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -480,7 +481,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_4 .L3_loobB: #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; @@ -550,7 +551,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_4 .L4_loopB:; #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; @@ -609,7 +610,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 16 +ALIGN_4 LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; @@ -669,7 +670,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 16; +ALIGN_4; .L4_loopEx: LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1, yvec15, xvec7; @@ -813,11 +814,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 16 +ALIGN_4 .L1_loopE:; TEST $4, bm; JLE .L5_loopE; -.align 16 +ALIGN_4 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -857,7 +858,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; -.align 16 +ALIGN_4 .L8_bodyB: #### Unroll time 1 #### @@ -983,7 +984,7 @@ MUL_SX xvec1, xvec5; ADD_SX xvec5, xvec8; DECQ k; JG .L8_bodyB; -.align 16 +ALIGN_4 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -991,7 +992,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L9_loopE; -.align 16 +ALIGN_4 .L9_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; @@ -1062,7 +1063,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L10_loopE; -.align 16 +ALIGN_4 .L10_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; @@ -1122,7 +1123,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L10_loopEx; -.align 16 +ALIGN_4 LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL ADD_SX 0*SIZE(C0), xvec15; @@ -1155,7 +1156,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 16 +ALIGN_4 .L10_loopEx: LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL @@ -1215,7 +1216,7 @@ ADDQ $4*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; -.align 16 +ALIGN_4 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1249,7 +1250,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; -.align 16 +ALIGN_4 .L11_bodyB: #### Computing kernel LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 @@ -1318,7 +1319,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; -.align 16 +ALIGN_4 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1326,7 +1327,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L12_loopE; -.align 16 +ALIGN_4 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 @@ -1368,7 +1369,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L13_loopE; -.align 16 +ALIGN_4 
.L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 @@ -1433,7 +1434,7 @@ ADDQ $2*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; -.align 16 +ALIGN_4 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1465,7 +1466,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 16 +ALIGN_4 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1503,7 +1504,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 16 +ALIGN_4 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1511,7 +1512,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L15_loopE; -.align 16 +ALIGN_4 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1538,7 +1539,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L16_loopE; -.align 16 +ALIGN_4 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1611,11 +1612,11 @@ LEAQ (C,ldc,8),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 16; +ALIGN_4; .L0_loopE:; TEST $4, bn; # Rn = 4 JLE .L20_loopE; -.align 16; +ALIGN_4; .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1628,7 +1629,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; -.align 16 +ALIGN_4 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1668,7 +1669,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L211_loopE; -.align 16 +ALIGN_4 .L211_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; @@ -1800,7 +1801,7 @@ ADD_SX xvec7, xvec8; LD_SX 4*SIZE(ptrba), xvec1; DECQ k; JG .L211_bodyB; -.align 16 +ALIGN_4 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk @@ -1808,7 +1809,7 @@ TEST $2, bk TEST $2, kkk; #endif JLE .L212_loopE; -.align 16 +ALIGN_4 .L212_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; @@ -1882,7 +1883,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 16 +ALIGN_4 .L213_bodyB: ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; @@ -1982,11 +1983,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 16 +ALIGN_4 .L21_loopE: TEST $4, bm; JLE .L22_loopE; -.align 16 +ALIGN_4 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2019,7 +2020,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 16 +ALIGN_4 .L221_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2089,7 +2090,7 @@ ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; -.align 16 +ALIGN_4 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2097,7 +2098,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 16 +ALIGN_4 .L222_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2139,7 +2140,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 16 +ALIGN_4 .L223_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2203,7 +2204,7 @@ ADDQ $4*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; -.align 16 +ALIGN_4 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2234,7 +2235,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 16 +ALIGN_4 .L231_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ 
-2274,7 +2275,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 16 +ALIGN_4 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2282,7 +2283,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 16 +ALIGN_4 .L232_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2310,7 +2311,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 16 +ALIGN_4 .L233_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2356,7 +2357,7 @@ ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; -.align 16 +ALIGN_4 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2386,7 +2387,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 16 +ALIGN_4 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2419,7 +2420,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L242_loopE; -.align 16 +ALIGN_4 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2440,7 +2441,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L243_loopE; -.align 16; +ALIGN_4; .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2491,7 +2492,7 @@ LEAQ (C, ldc, 4), C; .L20_loopE: TEST $2, bn; JLE .L30_loopE; -.align 16 +ALIGN_4 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2503,7 +2504,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 16 +ALIGN_4 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2536,7 +2537,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 16 +ALIGN_4 .L311_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2612,7 +2613,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 16 +ALIGN_4 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2620,7 +2621,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 16 +ALIGN_4 .L312_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2666,7 +2667,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 16 +ALIGN_4 .L313_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2731,11 +2732,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L31_bodyB; -.align 16 +ALIGN_4 .L31_loopE: TEST $4, bm; JLE .L32_loopE; -.align 16 +ALIGN_4 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2766,7 +2767,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 16 +ALIGN_4 .L321_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2806,7 +2807,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 16 +ALIGN_4 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2814,7 +2815,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 16 +ALIGN_4 .L322_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2842,7 +2843,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 16 +ALIGN_4 .L323_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2887,7 +2888,7 @@ ADDQ $4*SIZE, C1; .L32_loopE: TEST $2, bm; JLE .L33_loopE; -.align 16 +ALIGN_4 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, 
ptrbb; @@ -2920,7 +2921,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 16 +ALIGN_4 .L331_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 @@ -2943,7 +2944,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 16 +ALIGN_4 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2951,7 +2952,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 16 +ALIGN_4 .L332_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 @@ -2972,7 +2973,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 16 +ALIGN_4 .L333_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3031,7 +3032,7 @@ ADDQ $2*SIZE, C1; .L33_loopE: TEST $1, bm; JLE .L34_loopE; -.align 16 +ALIGN_4 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3062,7 +3063,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 16 +ALIGN_4 .L341_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3104,7 +3105,7 @@ addq $4*SIZE, ptrba; addq $8*SIZE, ptrbb; decq k; jg .L341_bodyB; -.align 16 +ALIGN_4 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3112,7 +3113,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L342_loopE; -.align 16 +ALIGN_4 .L342_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3140,7 +3141,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L343_loopE; -.align 16 +ALIGN_4 .L343_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3189,7 +3190,7 @@ LEAQ (C, ldc, 2), C; .L30_loopE: TEST $1, bn; JLE .L40_loopE; -.align 16 +ALIGN_4 .L40_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; @@ -3200,7 +3201,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L41_loopE; -.align 16 +ALIGN_4 .L41_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3230,7 +3231,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L411_loopE; -.align 16 +ALIGN_4 .L411_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3256,7 +3257,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L411_bodyB; -.align 16 +ALIGN_4 .L411_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3264,7 +3265,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L412_loopE; -.align 16 +ALIGN_4 .L412_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3285,7 +3286,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L413_loopE; -.align 16 +ALIGN_4 .L413_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3329,11 +3330,11 @@ ADDQ $8, kk; ADDQ $8*SIZE, C0; DECQ i; JG .L41_bodyB; -.align 16 +ALIGN_4 .L41_loopE: TEST $4, bm; JLE .L42_loopE; -.align 16 +ALIGN_4 .L42_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3362,7 +3363,7 @@ MOVQ %rax, kkk #endif SARQ $2, k; JLE .L421_loopE; -.align 16 +ALIGN_4 .L421_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3387,7 +3388,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L421_bodyB; -.align 16 +ALIGN_4 .L421_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3395,7 +3396,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L422_loopE; -.align 16 +ALIGN_4 .L422_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3416,7 +3417,7 @@ TEST $1, bk; 
TEST $1, kkk; #endif JLE .L423_loopE; -.align 16 +ALIGN_4 .L423_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3451,7 +3452,7 @@ ADDQ $4*SIZE, C0; .L42_loopE: TEST $2, bm; JLE .L43_loopE; -.align 16 +ALIGN_4 .L43_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3481,7 +3482,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L431_loopE; -.align 16 +ALIGN_4 .L431_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3518,7 +3519,7 @@ addq $8*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L431_bodyB; -.align 16 +ALIGN_4 .L431_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3526,7 +3527,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L432_loopE; -.align 16 +ALIGN_4 .L432_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3553,7 +3554,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L433_loopE; -.align 16 +ALIGN_4 .L433_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3592,7 +3593,7 @@ addq $2*SIZE, C0; .L43_loopE: TEST $1, bm; JLE .L44_loopE; -.align 16 +ALIGN_4 .L44_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3621,7 +3622,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L441_loopE; -.align 16 +ALIGN_4 .L441_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3646,7 +3647,7 @@ addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L441_bodyB; -.align 16 +ALIGN_4 .L441_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3654,7 +3655,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L442_loopE; -.align 16 +ALIGN_4 .L442_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3675,7 +3676,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L443_loopE; -.align 16 +ALIGN_4 .L443_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index 34abbb529..f6f9f707f 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -145,6 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ +#define MOVQ movq #define XOR_SY vxorps #define XOR_DY vxorpd @@ -297,7 +299,7 @@ movq %r11, kk; MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -312,7 +314,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $2,i; # Rm = 4 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -361,7 +363,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; #### Computing kernel #### @@ -584,7 +586,7 @@ ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -592,7 +594,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_bodyB: #### Unroll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; @@ -710,7 +712,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_loopB:; #### Unroll time 1 #### PREFETCH0 PRESIZE*SIZE(ptrba); @@ -852,7 +854,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 32 +ALIGN_5 #### Store Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; @@ -912,7 +914,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32 +ALIGN_5 .L4_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1024,11 +1026,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32; +ALIGN_5; .L1_loopE:; TEST $2, bm; JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1060,7 +1062,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB: #### Compute kernel #### #### Unroll times 1 #### @@ -1194,7 +1196,7 @@ ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L7_bodyB; -.align 32 +ALIGN_5 .L7_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1202,7 +1204,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -1276,7 +1278,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -1364,7 +1366,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L9_loopEx; -.align 32 +ALIGN_5 #### Writing back #### EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1401,7 +1403,7 @@ ADDQ $2, kk; ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 32 +ALIGN_5 .L9_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1466,7 +1468,7 @@ ADDQ $4*SIZE, C1; .L5_loopE: TEST $1, bm; JLE .L6_loopE; -.align 32 +ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1496,7 +1498,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; -.align 32 +ALIGN_5 .L10_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1570,7 +1572,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L10_bodyB; -.align 32 +ALIGN_5 .L10_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1578,7 +1580,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB: LD_DY 0*SIZE(ptrba), 
yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1624,7 +1626,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1722,11 +1724,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32 +ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1738,7 +1740,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $2, i; JLE .L21_loopE; -.align 32 +ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1770,7 +1772,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32 +ALIGN_5 .L211_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1891,7 +1893,7 @@ ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrba; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1899,7 +1901,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L212_loopE; -.align 32 +ALIGN_5 .L212_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1969,7 +1971,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2058,7 +2060,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; -.align 32 +ALIGN_5 #### Writing back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0),xvec15; @@ -2093,7 +2095,7 @@ ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; -.align 32 +ALIGN_5 .L213_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2153,11 +2155,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 32 +ALIGN_5 .L21_loopE: TEST $2, bm; JLE .L22_loopE; -.align 32 +ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2187,7 +2189,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2268,7 +2270,7 @@ ADD2_DY yvec6, yvec13, yvec13; ADDQ $16*SIZE, ptrba; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2276,7 +2278,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2325,7 +2327,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2419,7 +2421,7 @@ ADDQ $4*SIZE, C1; .L22_loopE: TEST $1, bm; JLE .L23_loopE; -.align 32 +ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2448,7 +2450,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2498,7 +2500,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2506,7 +2508,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2540,7 +2542,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 
32 +ALIGN_5 .L233_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2614,7 +2616,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2625,7 +2627,7 @@ MOVQ C, C0; MOVQ bm, i; SARQ $2, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2655,7 +2657,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2732,7 +2734,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2740,7 +2742,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2787,7 +2789,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2877,11 +2879,11 @@ ADDQ $4, kk; ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; -.align 32 +ALIGN_5 .L31_loopE: TEST $2, bm; JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2910,7 +2912,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2951,7 +2953,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2959,7 +2961,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2988,7 +2990,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -3049,7 +3051,7 @@ ADDQ $4*SIZE, C0; .L32_loopE: TEST $1, bm; JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3078,7 +3080,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3123,7 +3125,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3131,7 +3133,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3162,7 +3164,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; From 6cfcb54a2810b4607f9b9353e275345c2d64f27f Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 20 Jun 2012 07:38:39 +0800 Subject: [PATCH 04/16] Fixed align problem in S and C precision GEMM kernels. 
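The ALIGN_n conversion in the previous patch left a bare ".align" directive with no operand at one spot in the C kernel and one in the S kernel (both right after a "JG .L241_bodyB"); this patch substitutes the intended ALIGN_5 and ALIGN_4 macros. The macros exist because the operand of a raw ".align" is assembler-specific: GNU as on ELF targets reads it as a byte count, while the OS X (Mach-O) assembler reads it as a power of two, so a literal ".align 32" does not mean the same thing everywhere. As a minimal sketch (assuming the OS X block of common_x86_64.h, the one that also defines ffreep; the next patch adds the missing ALIGN_5 definition there), the mapping is:

    #define ALIGN_4 .align 4    /* 2^4 = 16-byte alignment under Mach-O */
    #define ALIGN_5 .align 5    /* 2^5 = 32-byte alignment under Mach-O */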
--- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 2 +- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 56ebee120..5987b8e61 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -3578,7 +3578,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align +ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 4d16a60d0..23eda3af8 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -2412,7 +2412,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align +ALIGN_4 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; From 88c272f6a739039460afbca3e47b55cd3555f585 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 20 Jun 2012 09:20:20 +0800 Subject: [PATCH 05/16] Refs #83. Added the missing ALIGN_5 macro on Mac OSX. However, it still exists SEGFAULT bug. --- common_x86_64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_x86_64.h b/common_x86_64.h index 7b6d11f7d..19b0ac53c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -425,6 +425,7 @@ REALNAME: #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 +#define ALIGN_5 .align 5 #define ffreep fstp #endif From d34fce56e4a980fefe4ddafe5d371798ad948b59 Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 20 Jun 2012 19:53:18 +0800 Subject: [PATCH 06/16] Refs #83 Fixed S/DGEMM calling conventions bug on windows. --- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 67 ++++++++++++++------------ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 1 + 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index c98879d7c..603552464 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -162,7 +162,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ST_SX movaps #define ST_DX movapd #define STL_DX movlpd +#define STL_DY vmovlpd #define STH_DX movhpd +#define STH_DY vmovhpd #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup @@ -242,6 +244,7 @@ movq %r15, 40(%rsp); #ifdef TRMMKERNEL movq old_offset, %r11 #endif + movaps %xmm3, %xmm0 #else movq old_ldc, ldc @@ -660,10 +663,10 @@ LDL_DY 2*SIZE(C1), xvec5, xvec5; LDH_DY 3*SIZE(C1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec15, 0*SIZE(C0); -STH_DX xvec15, 1*SIZE(C0); -STL_DX xvec7, 2*SIZE(C1); -STH_DX xvec7, 3*SIZE(C1); +STL_DY xvec15, 0*SIZE(C0); +STH_DY xvec15, 1*SIZE(C0); +STL_DY xvec7, 2*SIZE(C1); +STH_DY xvec7, 3*SIZE(C1); EXTRA_DY $1, yvec14, xvec4; #ifndef TRMMKERNEL @@ -674,10 +677,10 @@ LDL_DY 6*SIZE(C1), xvec2, xvec2; LDH_DY 7*SIZE(C1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec14, 4*SIZE(C0); -STH_DX xvec14, 5*SIZE(C0); -STL_DX xvec4, 6*SIZE(C1); -STH_DX xvec4, 7*SIZE(C1); +STL_DY xvec14, 4*SIZE(C0); +STH_DY xvec14, 5*SIZE(C0); +STL_DY xvec4, 6*SIZE(C1); +STH_DY xvec4, 7*SIZE(C1); EXTRA_DY $1, yvec13, xvec7; #ifndef TRMMKERNEL @@ -688,10 +691,10 @@ LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec13, 0*SIZE(C0, ldc, 1); -STH_DX xvec13, 1*SIZE(C0, ldc, 1); -STL_DX xvec7, 2*SIZE(C1, ldc, 1); -STH_DX xvec7, 3*SIZE(C1, ldc, 1); +STL_DY xvec13, 0*SIZE(C0, ldc, 1); +STH_DY xvec13, 1*SIZE(C0, ldc, 1); +STL_DY xvec7, 2*SIZE(C1, ldc, 1); +STH_DY xvec7, 3*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL @@ -702,10 +705,10 @@ LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec12, 4*SIZE(C0, ldc, 1); -STH_DX xvec12, 5*SIZE(C0, ldc ,1); -STL_DX xvec4, 6*SIZE(C1, ldc, 1); -STH_DX xvec4, 7*SIZE(C1, ldc, 1); +STL_DY xvec12, 4*SIZE(C0, ldc, 1); +STH_DY xvec12, 5*SIZE(C0, ldc ,1); +STL_DY xvec4, 6*SIZE(C1, ldc, 1); +STH_DY xvec4, 7*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec11, xvec7; #ifndef TRMMKERNEL @@ -716,10 +719,10 @@ LDL_DY 2*SIZE(C0), xvec5, xvec5; LDH_DY 3*SIZE(C0), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec11, 0*SIZE(C1); -STH_DX xvec11, 1*SIZE(C1); -STL_DX xvec7, 2*SIZE(C0); -STH_DX xvec7, 3*SIZE(C0); +STL_DY xvec11, 0*SIZE(C1); +STH_DY xvec11, 1*SIZE(C1); +STL_DY xvec7, 2*SIZE(C0); +STH_DY xvec7, 3*SIZE(C0); EXTRA_DY $1, yvec10, xvec4; #ifndef TRMMKERNEL @@ -730,10 +733,10 @@ LDL_DY 6*SIZE(C0), xvec2, xvec2; LDH_DY 7*SIZE(C0), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec10, 4*SIZE(C1); -STH_DX xvec10, 5*SIZE(C1); -STL_DX xvec4, 6*SIZE(C0); -STH_DX xvec4, 7*SIZE(C0); +STL_DY xvec10, 4*SIZE(C1); +STH_DY xvec10, 5*SIZE(C1); +STL_DY xvec4, 6*SIZE(C0); +STH_DY xvec4, 7*SIZE(C0); EXTRA_DY $1, yvec9, xvec7; #ifndef TRMMKERNEL @@ -744,10 +747,10 @@ LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec9, 0*SIZE(C1, ldc, 1); -STH_DX xvec9, 1*SIZE(C1, ldc, 1); -STL_DX xvec7, 2*SIZE(C0, ldc, 1); -STH_DX xvec7, 3*SIZE(C0, ldc, 1); +STL_DY xvec9, 0*SIZE(C1, ldc, 1); +STH_DY xvec9, 1*SIZE(C1, ldc, 1); +STL_DY xvec7, 2*SIZE(C0, ldc, 1); +STH_DY xvec7, 3*SIZE(C0, ldc, 1); EXTRA_DY $1, yvec8, xvec4; #ifndef TRMMKERNEL @@ -758,10 +761,10 @@ LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec8, 4*SIZE(C1, ldc, 1); -STH_DX xvec8, 5*SIZE(C1, ldc, 1); -STL_DX xvec4, 6*SIZE(C0, ldc, 1); -STH_DX xvec4, 7*SIZE(C0, 
ldc, 1); +STL_DY xvec8, 4*SIZE(C1, ldc, 1); +STH_DY xvec8, 5*SIZE(C1, ldc, 1); +STL_DY xvec4, 6*SIZE(C0, ldc, 1); +STH_DY xvec4, 7*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 23eda3af8..59458effe 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -251,6 +251,7 @@ movq %r15, 40(%rsp); #ifdef TRMMKERNEL movq old_offset, %r11 #endif + movaps %xmm3, %xmm0 #else movq old_ldc, ldc From fda5e0da8a0a43234ef1f70e719f4a5dd60fad0d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 21 Jun 2012 08:25:52 +0800 Subject: [PATCH 07/16] Refs #83. Clang 3.1 works fine on Sandy Bridge Mac OSX. Edit the document. --- README | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README b/README index 6372e96bd..b3f1baa79 100644 --- a/README +++ b/README @@ -34,8 +34,10 @@ Please read GotoBLAS_01Readme.txt Additional support CPU: x86_64: Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. + Intel Sandy Bridge MIPS64: - ICT Loongson 3A //Level 3 BLAS subroutines are optimized. + ICT Loongson 3A + ICT Loongson 3B (Experimental) 4.Usages Link with libopenblas.a or -lopenblas for shared library. @@ -70,10 +72,10 @@ OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas 8.ChangeLog Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. -9.Known Issues -* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit - is 64. On 32 bits, it is 32. -* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS. +9.Troubleshooting +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/Cores should less than or equal to 256. +* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. 10. Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). From 157cc5444981c60bd72e924eee0663fb96c6de48 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 08:04:58 +0800 Subject: [PATCH 08/16] Update git ignore file. 
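One more remark on patch 06 above, which adds "movaps %xmm3, %xmm0" to the Windows branch of the S/DGEMM prologues: under the Microsoft x64 calling convention a floating-point value passed as the fourth argument arrives in xmm3, while under the System V AMD64 convention alpha, being the first floating-point argument, arrives in xmm0. The kernel bodies read alpha from xmm0, so on Windows it has to be copied over before anything clobbers xmm3. A rough sketch of the mapping, with the prototype paraphrased from the kernel sources rather than copied (the function name and the BLASLONG typedef here are placeholders):

    /* Sketch only: prototype paraphrased, typedef is a placeholder. */
    typedef long BLASLONG;

    int dgemm_kernel(BLASLONG m, BLASLONG n, BLASLONG k, double alpha,
                     double *sa, double *sb, double *C, BLASLONG ldc);

    /* Where the first four arguments arrive:
     *   System V AMD64 : m=rdi, n=rsi, k=rdx, alpha=xmm0  (first FP argument)
     *   Microsoft x64  : m=rcx, n=rdx, k=r8,  alpha=xmm3  (fourth argument slot)
     * Hence the added "movaps %xmm3, %xmm0" before the compute loops start. */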
--- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 6cfc5b3c1..118205ca2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,23 @@ *.obj *.lib *.dll +*.dylib *.def *.o lapack-3.1.1 lapack-3.1.1.tgz +lapack-3.4.1 +lapack-3.4.1.tgz *.so *.a .svn *~ +lib.grd +nohup.out config.h Makefile.conf +Makefile.conf_last +config_last.h getarch getarch_2nd utest/openblas_utest From fe809c39f9b3696a45531734e85edd9ff5eb93ff Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 08:22:53 +0800 Subject: [PATCH 09/16] Update the docs for 0.2.0 --- Makefile.rule | 2 +- README => README.md | 80 ++++++++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 35 deletions(-) rename README => README.md (64%) diff --git a/Makefile.rule b/Makefile.rule index 56cd63540..299273773 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.1.1 +VERSION = 0.2.0 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/README b/README.md similarity index 64% rename from README rename to README.md index b3f1baa79..000bc4158 100644 --- a/README +++ b/README.md @@ -1,34 +1,41 @@ -OpenBLAS Readme +# OpenBLAS -1.Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) -Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki). +## Introduction +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS . -2.Intallation +Please read the documents on OpenBLAS wiki pages. + +## Intallation Download from project homepage. http://xianyi.github.com/OpenBLAS/ Or, check out codes from git://github.com/xianyi/OpenBLAS.git -1)Normal compile - (a) type "make" to detect the CPU automatically. +### Normal compile + * type "make" to detect the CPU automatically. or - (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. + * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. -2)Cross compile +### Cross compile Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. -examples: +Examples: + On X86 box, compile this library for loongson3a CPU. -make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A -3)Debug version -make DEBUG=1 + make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A + +### Debug version + + make DEBUG=1 + +### Intall to the directory (Optional) + +Example: + + make install PREFIX=your_installation_directory -4)Intall to the directory (Optional) -e.g. -make install PREFIX=your_installation_directory The default directory is /opt/OpenBLAS -3.Support CPU & OS +## Support CPU & OS Please read GotoBLAS_01Readme.txt Additional support CPU: @@ -39,45 +46,50 @@ MIPS64: ICT Loongson 3A ICT Loongson 3B (Experimental) -4.Usages +## Usages Link with libopenblas.a or -lopenblas for shared library. -4.1 Set the number of threads with environment variables. 
for example, -export OPENBLAS_NUM_THREADS=4 +### Set the number of threads with environment variables. + +Examples: + export OPENBLAS_NUM_THREADS=4 + or -export GOTO_NUM_THREADS=4 + + export GOTO_NUM_THREADS=4 + or -export OMP_NUM_THREADS=4 + + export OMP_NUM_THREADS=4 The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. -4.2 Set the number of threads with calling functions. for example, -void goto_set_num_threads(int num_threads); -or -void openblas_set_num_threads(int num_threads); +### Set the number of threads with calling functions. + +Examples: + void goto_set_num_threads(int num_threads); + void openblas_set_num_threads(int num_threads); If you compile this lib with USE_OPENMP=1, you should use the above functions, too. -5.Report Bugs +## Report Bugs Please add a issue in https://github.com/xianyi/OpenBLAS/issues -6.To-Do List: -Optimization on ICT Loongson 3A CPU - -7.Contact +## Contact OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas -8.ChangeLog +## ChangeLog Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. -9.Troubleshooting +## Troubleshooting +* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * The number of CPUs/Cores should less than or equal to 256. * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. -10. Specification of Git Branches +## Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). Now, there are 4 branches in github.com. * The master branch. This a main branch to reflect a production-ready state. From a6214c057e6b06783e08c3b450a24c3f86a63c31 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 08:34:40 +0800 Subject: [PATCH 10/16] Modified readme. --- README.md | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 000bc4158..80116c658 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,12 @@ ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS . -Please read the documents on OpenBLAS wiki pages. +Please read the documents on OpenBLAS wiki pages . ## Intallation Download from project homepage. http://xianyi.github.com/OpenBLAS/ -Or, -check out codes from git://github.com/xianyi/OpenBLAS.git + +Or, check out codes from git://github.com/xianyi/OpenBLAS.git ### Normal compile * type "make" to detect the CPU automatically. or @@ -38,13 +38,15 @@ The default directory is /opt/OpenBLAS ## Support CPU & OS Please read GotoBLAS_01Readme.txt -Additional support CPU: -x86_64: - Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. - Intel Sandy Bridge -MIPS64: - ICT Loongson 3A - ICT Loongson 3B (Experimental) +### Additional support CPU: + +#### x86/x86-64: +* Intel Xeon 56xx (Westmere). Used GotoBLAS2 Nehalem codes. +* Intel Sandy Bridge. Optimized Level-3 BLAS with AVX on x86-64. 
+* AMD Bobcat. Used GotoBLAS2 Barcelona codes. +#### MIPS64: +* ICT Loongson 3A. Optimized Level-3 BLAS and the part of Level-1,2. +* ICT Loongson 3B (Experimental) ## Usages Link with libopenblas.a or -lopenblas for shared library. @@ -52,6 +54,7 @@ Link with libopenblas.a or -lopenblas for shared library. ### Set the number of threads with environment variables. Examples: + export OPENBLAS_NUM_THREADS=4 or @@ -69,7 +72,9 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro ### Set the number of threads with calling functions. Examples: + void goto_set_num_threads(int num_threads); + void openblas_set_num_threads(int num_threads); If you compile this lib with USE_OPENMP=1, you should use the above functions, too. From 544af1efec5602e7413c1211dd0deb92d97b5b26 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 09:35:19 +0800 Subject: [PATCH 11/16] Correct the error in readme --- README.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 80116c658..a13e069ec 100644 --- a/README.md +++ b/README.md @@ -41,12 +41,19 @@ Please read GotoBLAS_01Readme.txt ### Additional support CPU: #### x86/x86-64: -* Intel Xeon 56xx (Westmere). Used GotoBLAS2 Nehalem codes. -* Intel Sandy Bridge. Optimized Level-3 BLAS with AVX on x86-64. -* AMD Bobcat. Used GotoBLAS2 Barcelona codes. +- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. +- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. +- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. + #### MIPS64: -* ICT Loongson 3A. Optimized Level-3 BLAS and the part of Level-1,2. -* ICT Loongson 3B (Experimental) +- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. +- **ICT Loongson 3B**: Experimental + +### Support OS: +- **GNU/Linux** +- **MingWin/Windows**: Please read . +- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. +- **FreeBSD**: Supportted by community. We didn't test the library on this OS. ## Usages Link with libopenblas.a or -lopenblas for shared library. From 422359d09ac28b27bb652b303318485fb2c02cca Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 11:32:43 +0800 Subject: [PATCH 12/16] Export openblas_set_num_threads in shared library. 
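Since this patch is what exposes the thread-count setters to users of the library, a minimal caller is sketched below. The two prototypes are the declarations this patch adds to cblas.h; the surrounding program is only an illustration and builds with something like "cc demo.c -lopenblas":

    #include <stdio.h>

    /* Declarations as added to cblas.h by this patch. */
    void openblas_set_num_threads(int num_threads);
    void goto_set_num_threads(int num_threads);

    int main(void) {
        openblas_set_num_threads(4);   /* choose the thread count at run time */
        /* ... call BLAS/LAPACK routines here ... */
        printf("requested 4 OpenBLAS threads\n");
        return 0;
    }

The OPENBLAS_NUM_THREADS / GOTO_NUM_THREADS / OMP_NUM_THREADS environment variables described in the README remain the alternative way to control this.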
--- cblas.h | 4 ++++ common_interface.h | 2 ++ driver/others/Makefile | 4 ++-- driver/others/openblas_set_num_threads.c | 13 ++++++++++--- exports/gensymbol | 18 +++++++++++++----- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/cblas.h b/cblas.h index f3708a994..ee8bf08b2 100644 --- a/cblas.h +++ b/cblas.h @@ -9,6 +9,10 @@ extern "C" { #include #include "common.h" +/*Set the number of threads on runtime.*/ +void openblas_set_num_threads(int num_threads); +void goto_set_num_threads(int num_threads); + #define CBLAS_INDEX size_t enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; diff --git a/common_interface.h b/common_interface.h index 898d91001..dbe0bb851 100644 --- a/common_interface.h +++ b/common_interface.h @@ -45,6 +45,8 @@ extern "C" { int BLASFUNC(xerbla)(char *, blasint *info, blasint); +void BLASFUNC(openblas_set_num_threads)(int *); + FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); diff --git a/driver/others/Makefile b/driver/others/Makefile index 75b552b65..2fdbb4a42 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,12 +1,12 @@ TOPDIR = ../.. include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP -COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) ifndef NO_AFFINITY COMMONOBJS += init.$(SUFFIX) endif diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 7ca3b7114..27de83ffc 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef SMP_SERVER -#ifdef OS_LINUX extern void openblas_set_num_threads(int num_threads) ; @@ -41,5 +40,13 @@ void NAME(int* num_threads){ openblas_set_num_threads(*num_threads); } -#endif +#else +//Single thread + +void openblas_set_num_threads(int num_threads) { +} + +void NAME(int* num_threads){ + +} #endif diff --git a/exports/gensymbol b/exports/gensymbol index dbd559473..61e7c8367 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -72,6 +72,14 @@ zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, ); +@misc_no_underscore_objs = ( + openblas_set_num_threads, goto_set_num_threads, + ); + +@misc_underscore_objs = ( + openblas_set_num_threads, + ); + @lapackobjs = ( # These routines are provided by OpenBLAS. 
sgesv, dgesv, cgesv, zgesv, @@ -2660,11 +2668,11 @@ if ($ARGV[5] == 1) { #NO_LAPACK=1 - @underscore_objs = (@blasobjs); + @underscore_objs = (@blasobjs, @misc_underscore_objs); } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1") { - @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2); + @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); } else { - @underscore_objs = (@blasobjs, @lapackobjs); + @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); } if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; @@ -2678,10 +2686,10 @@ if ($ARGV[1] eq "ia64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[4] == 0) { - @no_underscore_objs = (@cblasobjs); + @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); }else{ #NO_CBLAS=1 - @no_underscore_objs = (); + @no_underscore_objs = (@misc_no_underscore_objs); } if ($ARGV[6] == 1) { #NO_LAPACKE=1 From 853d16ed7ec9169cf03ec024f5894e9a597c7da1 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 23 Jun 2012 13:07:38 +0800 Subject: [PATCH 13/16] Added openblas_set_num_threads dummy function on Windows. We plan to implement this feature in next version. --- driver/others/blas_server_win32.c | 8 ++++++++ exports/gensymbol | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 6708509e1..c71e7c276 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -63,6 +63,14 @@ static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; +void goto_set_num_threads(int num) +{ +} + +void openblas_set_num_threads(int num) +{ +} + static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ diff --git a/exports/gensymbol b/exports/gensymbol index 61e7c8367..e09a8b6ab 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -2759,6 +2759,10 @@ if ($ARGV[0] eq "aix"){ if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; $count = 1; + + #remove openblas_set_num_threads + @underscore_objs = grep /[^openblas_set_num_threads]/,@underscore_objs; + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; @@ -2769,7 +2773,11 @@ if ($ARGV[0] eq "win2k"){ print "\t$uppercase=$objs", "_ \@", $count, "\n"; $count ++; } - + + #for openblas_set_num_threads + print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n"; + $count ++; + # if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print "\t",$objs,"=$objs"," \@", $count, "\n"; From b39c51195b0ec09d17a0bcf345fcd7873f352acc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 25 Jun 2012 14:29:17 +0800 Subject: [PATCH 14/16] Fixed the build bug about Sandy Bridge on 32-bit. We used Nehalem/Penryn codes on Sandy Bridge 32-bit. --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 27aeeb6ac..5465c1cbd 100644 --- a/param.h +++ b/param.h @@ -928,14 +928,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 From 857a0fa0df83cd3ff79a4047ad31a9f0f9e9f5da Mon Sep 17 00:00:00 2001 From: wangqian Date: Mon, 25 Jun 2012 19:00:37 +0800 Subject: [PATCH 15/16] Fixed the issue of mixing AVX and SSE codes in S/D/C/ZGEMM. --- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 1827 ++++++++++++------------ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 1033 +++++++------- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 1587 ++++++++++---------- kernel/x86_64/zgemm_kernel_4x4_sandy.S | 358 +++-- 4 files changed, 2366 insertions(+), 2439 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 5987b8e61..5a5588089 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -150,79 +150,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MOVQ movq #define XOR_SY vxorps -#define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_SX vxorps #define LD_SY vmovaps -#define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_SX movlps +#define LD_SX vmovaps +#define LDL_SX vmovlps #define LDL_SY vmovlps -#define LDH_SX movhps +#define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps -#define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_SX movlps +#define ST_SX vmovaps +#define STL_SX vmovlps #define STL_SY vmovlps -#define STH_SX movhps +#define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup -#define EDUP_DY vmovddup +#define EDUP_SX vmovsldup +#define ODUP_SX vmovshdup #define ADD_SY vaddps -#define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd -#define SUB_DY vsubpd +#define ADD_SX vaddps #define SUB_SY vsubps -#define SUB_DX subpd -#define SUB_SX subps +#define SUB_SX vsubps -#define ADDSUB_DY vaddsubpd -#define ADDSUB_DX addsubpd #define ADDSUB_SY vaddsubps -#define ADDSUB_SX addsubps +#define ADDSUB_SX vaddsubps #define MUL_SY vmulps -#define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_SX vmulps #define SHUF_SY vperm2f128 -#define SHUF_DY vperm2f128 -#define SHUF_DX pshufd -#define SHUF_SX pshufd +#define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps -#define VPERMILP_DY vpermilpd #define BROAD_SY vbroadcastss -#define BROAD_DY vbroadcastsd #define BROAD_SX vbroadcastss -#define BROAD_DX movddup #define MOV_SY vmovaps -#define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_SX vmovaps #define REVS_SY vshufps -#define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_SX vshufps #define EXTRA_SY vextractf128 -#define EXTRA_DY vextractf128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1_SY ADD_SY @@ -289,6 +264,8 @@ movq old_offset, %r11; #endif #endif +vzeroupper + vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm @@ -1417,64 +1394,64 @@ REVS_SY $0xe4,yvec7,yvec9,yvec9; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef 
TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec6; -LDH_SX 2*SIZE(C0), xvec6; -ADD_SX xvec6, xvec15; +LDL_SX 0*SIZE(C0), xvec6, xvec6; +LDH_SX 2*SIZE(C0), xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C1), xvec4; -LDH_SX 6*SIZE(C1), xvec4; -ADD_SX xvec4, xvec7; +LDL_SX 4*SIZE(C1), xvec4, xvec4; +LDH_SX 6*SIZE(C1), xvec4, xvec4; +ADD_SX xvec4, xvec7, xvec7; #endif STL_SX xvec7, 4*SIZE(C1); STH_SX xvec7, 6*SIZE(C1); EXTRA_SY $1, yvec13, xvec5; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0, ldc, 1), xvec4; -LDH_SX 2*SIZE(C0, ldc, 1), xvec4; -ADD_SX xvec4, xvec13; +LDL_SX 0*SIZE(C0, ldc, 1), xvec4, xvec4; +LDH_SX 2*SIZE(C0, ldc, 1), xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; #endif STL_SX xvec13, 0*SIZE(C0, ldc, 1); STH_SX xvec13, 2*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C1, ldc, 1), xvec2; -LDH_SX 6*SIZE(C1, ldc, 1), xvec2; -ADD_SX xvec2, xvec5; +LDL_SX 4*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SX 6*SIZE(C1, ldc, 1), xvec2, xvec2; +ADD_SX xvec2, xvec5, xvec5; #endif STL_SX xvec5, 4*SIZE(C1, ldc, 1); STH_SX xvec5, 6*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec11, xvec3; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C1), xvec2; -ADD_SX xvec2, xvec11; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C1), xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C0), xvec0; -LDH_SX 6*SIZE(C0), xvec0; -ADD_SX xvec0, xvec3; +LDL_SX 4*SIZE(C0), xvec0, xvec0; +LDH_SX 6*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec3, xvec3; #endif STL_SX xvec3, 4*SIZE(C0); STH_SX xvec3, 6*SIZE(C0); EXTRA_SY $1, yvec9, xvec1; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1, ldc, 1), xvec0; -LDH_SX 2*SIZE(C1, ldc, 1), xvec0; -ADD_SX xvec0, xvec9; +LDL_SX 0*SIZE(C1, ldc, 1), xvec0, xvec0; +LDH_SX 2*SIZE(C1, ldc, 1), xvec0, xvec0; +ADD_SX xvec0, xvec9, xvec9; #endif STL_SX xvec9, 0*SIZE(C1, ldc, 1); STH_SX xvec9, 2*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C0, ldc, 1), xvec6; -LDH_SX 6*SIZE(C0, ldc, 1), xvec6; -ADD_SX xvec6, xvec1; +LDL_SX 4*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SX 6*SIZE(C0, ldc, 1), xvec6, xvec6; +ADD_SX xvec6, xvec1, xvec1; #endif STL_SX xvec1, 4*SIZE(C0, ldc, 1); STH_SX xvec1, 6*SIZE(C0, ldc, 1); @@ -1533,122 +1510,122 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 
4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 16*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 24*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; 
+ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; @@ -1666,62 +1643,62 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1737,32 +1714,32 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, 
xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -1770,29 +1747,29 @@ ADDQ $8*SIZE, ptrbb; #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec13, xvec7; +SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec12, xvec7; +SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -1800,16 +1777,16 @@ SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -1821,35 +1798,35 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; -MUL_SX xvec7, xvec13; -MUL_SX xvec6, xvec3; -ADDSUB_SX xvec3, xvec13; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec6, xvec3, xvec3; +ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; -MUL_SX xvec7, xvec12; -MUL_SX xvec6, xvec2; -ADDSUB_SX xvec2, xvec12; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec6, xvec2, xvec2; +ADDSUB_SX xvec2, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0, ldc, 1), xvec0; -LDL_SX 0*SIZE(C0, ldc, 1), xvec1; -LDH_SX 2*SIZE(C0), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 0*SIZE(C1, ldc, 1), xvec3; -LDH_SX 2*SIZE(C1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0, ldc,1), xvec0, xvec0; +LDL_SX 0*SIZE(C0, ldc,1), xvec1, xvec1; +LDH_SX 2*SIZE(C0), xvec1, xvec1; 
+LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SX 2*SIZE(C1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0, ldc, 1); @@ -1911,70 +1888,70 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 5*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 7*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; @@ -1992,36 +1969,36 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX 
$0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2037,19 +2014,19 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2057,26 +2034,26 @@ ADDQ $8*SIZE, ptrbb; #### Handle #### #if defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -2086,21 +2063,21 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 0*SIZE(C0, ldc, 1), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -LDH_SX 0*SIZE(C1, ldc, 1), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 0*SIZE(C0, ldc, 1), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +LDH_SX 0*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 1); @@ -2191,59 +2168,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; 
+MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; @@ -2252,59 +2229,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, 
xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; @@ -2313,59 +2290,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 32*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 36*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 40*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 44*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; @@ -2374,59 +2351,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 48*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 52*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX 
$0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 56*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 60*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -2448,59 +2425,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; 
+MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; @@ -2509,59 +2486,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2581,59 +2558,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX 
xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb @@ -2641,53 +2618,53 @@ ADDQ $4*SIZE, ptrbb #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec9, xvec7; +ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec8, xvec7; +ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec13, xvec7; +SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec12, xvec7; +SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec10, xvec7; +SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec9, xvec7; +SUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec8, xvec7; +SUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -2699,28 +2676,28 @@ SHUF_SX $0xb1, xvec10, xvec10; SHUF_SX $0xb1, xvec9, xvec9; SHUF_SX $0xb1, xvec8, xvec8; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; 
MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec9, xvec7; +ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec8, xvec7; +ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -2736,50 +2713,50 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; -MUL_SX xvec7, xvec13; -MUL_SX xvec6, xvec3; -ADDSUB_SX xvec3, xvec13; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec6, xvec3, xvec3; +ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; -MUL_SX xvec7, xvec12; -MUL_SX xvec6, xvec2; -ADDSUB_SX xvec2, xvec12; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec6, xvec2, xvec2; +ADDSUB_SX xvec2, xvec12, xvec12; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; -MUL_SX xvec7, xvec10; -MUL_SX xvec6, xvec0; -ADDSUB_SX xvec0, xvec10; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec6, xvec0, xvec0; +ADDSUB_SX xvec0, xvec10, xvec10; VPERMILP_SX $0xb1,xvec9, xvec5; -MUL_SX xvec7, xvec9; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec9; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec9, xvec9; VPERMILP_SX $0xb1,xvec8, xvec4; -MUL_SX xvec7, xvec8; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec8; +MUL_SX xvec7, xvec8, xvec8; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec8, xvec8; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 8*SIZE(C0), xvec2; -LDH_SX 10*SIZE(C1), xvec2; -LDL_SX 12*SIZE(C0), xvec3; -LDH_SX 14*SIZE(C1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 8*SIZE(C0), xvec2, xvec2; +LDH_SX 10*SIZE(C1), xvec2, xvec2; +LDL_SX 12*SIZE(C0), xvec3, xvec3; +LDH_SX 14*SIZE(C1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2790,18 +2767,18 @@ STH_SX xvec13, 10*SIZE(C1); STL_SX xvec12, 12*SIZE(C0); STH_SX xvec12, 14*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -LDL_SX 8*SIZE(C1), xvec6; -LDH_SX 10*SIZE(C0), xvec6; -LDL_SX 12*SIZE(C1), xvec7; -LDH_SX 14*SIZE(C0), xvec7; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; -ADD_SX xvec6, xvec9; -ADD_SX xvec7, xvec8; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +LDL_SX 8*SIZE(C1), xvec6, xvec6; +LDH_SX 10*SIZE(C0), xvec6, xvec6; +LDL_SX 12*SIZE(C1), xvec7, xvec7; +LDH_SX 
14*SIZE(C0), xvec7, xvec7; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +ADD_SX xvec6, xvec9, xvec9; +ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -2872,31 +2849,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -2906,31 +2883,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; @@ -2940,31 +2917,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, 
xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; @@ -2974,31 +2951,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3020,31 +2997,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3054,31 +3031,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; 
ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3099,31 +3076,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3131,29 +3108,29 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec10, xvec7; +SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -3161,16 +3138,16 @@ SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -3182,40 +3159,40 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, 
xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; -MUL_SX xvec7, xvec10; -MUL_SX xvec6, xvec0; -ADDSUB_SX xvec0, xvec10; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec6, xvec0, xvec0; +ADDSUB_SX xvec0, xvec10, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -3277,17 +3254,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3297,17 +3274,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; @@ -3317,17 +3294,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; @@ -3337,17 +3314,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX 
xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3369,17 +3346,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3389,17 +3366,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3419,17 +3396,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3437,26 +3414,26 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; @@ -3466,24 +3443,24 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; 
#ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -ADD_SX xvec4, xvec11; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -3538,42 +3515,42 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 5*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 7*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3591,22 +3568,22 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3622,12 +3599,12 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3635,14 +3612,14 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(NR) || defined(NC) || defined(TR) || defined(TC) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX 
xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif @@ -3651,14 +3628,14 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 0*SIZE(C1), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 0*SIZE(C1), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C1); @@ -3908,18 +3885,18 @@ ADDSUB_SY yvec4, yvec14, yvec14; EXTRA_SY $1, yvec15, xvec7; EXTRA_SY $1, yvec14, xvec6; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C0), xvec1; -LDL_SX 8*SIZE(C0), xvec2; -LDH_SX 10*SIZE(C0), xvec2; -LDL_SX 12*SIZE(C0), xvec3; -LDH_SX 14*SIZE(C0), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec7; -ADD_SX xvec2, xvec14; -ADD_SX xvec3, xvec6; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C0), xvec1, xvec1; +LDL_SX 8*SIZE(C0), xvec2, xvec2; +LDH_SX 10*SIZE(C0), xvec2, xvec2; +LDL_SX 12*SIZE(C0), xvec3, xvec3; +LDH_SX 14*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec7, xvec7; +ADD_SX xvec2, xvec14, xvec14; +ADD_SX xvec3, xvec6, xvec6; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4103,12 +4080,12 @@ ADDSUB_SY yvec5, yvec15, yvec15; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C0), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec7; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C0), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec7, xvec7; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4163,42 +4140,42 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 3 #### LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 5*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 4 #### LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 6*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 7*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, 
xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -4216,22 +4193,22 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -4247,12 +4224,12 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -4260,14 +4237,14 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif @@ -4276,13 +4253,13 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4335,22 +4312,22 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -4368,12 +4345,12 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, 
xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -4388,15 +4365,15 @@ ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; -LDL_SX 0*SIZE(ptrba), xvec0; -LDL_SX 0*SIZE(ptrbb), xvec2; +LDL_SX 0*SIZE(ptrba), xvec0, xvec0; +LDL_SX 0*SIZE(ptrbb), xvec2, xvec2; SHUF_SX $0xe0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xe1, xvec0, xvec1; SHUF_SX $0xe5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -4404,29 +4381,29 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; VPERMILP_SX $0xb1, xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; SHUF_SX $0x44, xvec15, xvec14; SHUF_SX $0xee, xvec15, xvec13; -ADD_SX xvec13, xvec14; +ADD_SX xvec13, xvec14, xvec14; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -ADD_SX xvec0, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec14, xvec14; #endif STL_SX xvec14, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -4458,6 +4435,8 @@ movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index 603552464..3b1b2560e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -143,71 +143,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef MOVQ #define MOVQ movq -#define XOR_SY vxorps #define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_DX vxorpd -#define LD_SY vmovaps #define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_DX movlpd +#define LD_DX vmovapd +#define LDL_DX vmovlpd #define LDL_DY vmovlpd -#define LDH_DX movhpd +#define LDH_DX vmovhpd #define LDH_DY vmovhpd -#define ST_SY vmovaps #define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_DX movlpd +#define ST_DX vmovapd +#define STL_DX vmovlpd #define STL_DY vmovlpd -#define STH_DX movhpd +#define STH_DX vmovhpd #define STH_DY vmovhpd -#define EDUP_SY vmovsldup -#define ODUP_SY vmovshdup #define EDUP_DY vmovddup -#define ADD_SY vaddps #define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_DX vaddpd #define ADD1_DY vaddpd #define ADD2_DY vaddpd #define ADDSUB_DY vaddsubpd -#define ADDSUB_SY vaddsubps -#define MUL_SY vmulps #define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_DX vmulpd -#define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd +#define SHUF_DX vpshufd -#define VPERMILP_SY vpermilps #define VPERMILP_DY vpermilpd -#define BROAD_SY vbroadcastss #define BROAD_DY vbroadcastsd -#define BROAD_SX -#define BROAD_DX movddup +#define BROAD_DX vmovddup -#define MOV_SY vmovaps #define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_DX vmovapd -#define REVS_SY vshufps #define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_DX vmovsd -#define EXTRA_SY vextractf128 #define EXTRA_DY vextractf128 PROLOGUE @@ -253,6 +231,8 @@ movq old_offset, %r11 #endif #endif +vzeroupper + vmovlps ALPHA, MEMALPHA movq old_bm, bm movq old_bn, bn @@ -988,14 +968,14 @@ EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec9,xvec1; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0),xvec15; -ADD_DX 2*SIZE(C1),xvec7; -ADD_DX 0*SIZE(C0,ldc,1),xvec13; -ADD_DX 2*SIZE(C1,ldc,1),xvec5; -ADD_DX 0*SIZE(C1),xvec11; -ADD_DX 2*SIZE(C0),xvec3; -ADD_DX 0*SIZE(C1,ldc,1),xvec9; -ADD_DX 2*SIZE(C0,ldc,1),xvec1; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec7, xvec7; +ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; +ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5; +ADD_DX 0*SIZE(C1), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec3, xvec3; +ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9; +ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec7, 2*SIZE(C1); @@ -1025,18 +1005,18 @@ EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec9,xvec1; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -LDL_DX 0*SIZE(C0, ldc, 1), xvec12; -LDH_DX 1*SIZE(C0, ldc, 1), xvec12; -LDL_DX 0*SIZE(C1), xvec10; -LDH_DX 1*SIZE(C1), xvec10; -LDL_DX 0*SIZE(C1, ldc, 1), xvec8; -LDH_DX 1*SIZE(C1, ldc, 1), xvec8; -ADD_DX xvec14, xvec15; -ADD_DX xvec12, xvec13; -ADD_DX xvec10, xvec11; -ADD_DX xvec8, xvec9; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; +LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; +LDL_DX 0*SIZE(C1), xvec10, xvec10; +LDH_DX 1*SIZE(C1), xvec10, xvec10; +LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; +ADD_DX xvec14, xvec15, xvec15; +ADD_DX xvec12, xvec13, xvec13; +ADD_DX xvec10, xvec11, xvec11; +ADD_DX xvec8, xvec9, xvec9; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -1047,18 +1027,18 @@ STH_DX xvec11, 1*SIZE(C1); STL_DX xvec9, 0*SIZE(C1, ldc, 
1); STH_DX xvec9, 1*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL -LDL_DX 2*SIZE(C0), xvec0; -LDH_DX 3*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0, ldc, 1), xvec2; -LDH_DX 3*SIZE(C0, ldc, 1), xvec2; -LDL_DX 2*SIZE(C1), xvec4; -LDH_DX 3*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1, ldc, 1), xvec6; -LDH_DX 3*SIZE(C1, ldc, 1), xvec6; -ADD_DX xvec0, xvec3; -ADD_DX xvec2, xvec1; -ADD_DX xvec4, xvec7; -ADD_DX xvec6, xvec5; +LDL_DX 2*SIZE(C0), xvec0, xvec0; +LDH_DX 3*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C1), xvec4, xvec4; +LDH_DX 3*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DX xvec0, xvec3, xvec3; +ADD_DX xvec2, xvec1, xvec1; +ADD_DX xvec4, xvec7, xvec7; +ADD_DX xvec6, xvec5, xvec5; #endif STL_DX xvec3, 2*SIZE(C0); STH_DX xvec3, 3*SIZE(C0); @@ -1128,72 +1108,72 @@ ALIGN_5; ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 6*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; LD_DX 2*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 2 #### LD_DX 8*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 10*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 4*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; ##### Unroll time 3 #### LD_DX 12*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 14*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrbb; LD_DX 6*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; ADDQ $8*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 4 #### LD_DX 0*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 0*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; DECQ k; JG .L10_bodyB; ALIGN_5 @@ -1210,39 +1190,39 @@ ALIGN_5 ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 
6*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; ADDQ $4*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 2 #### LD_DX 0*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 0*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; .L11_loopE:; #ifndef TRMMKERNEL @@ -1255,35 +1235,35 @@ JLE .L12_loopE; ALIGN_5 .L12_bodyB:; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $2*SIZE, ptrba; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; .L12_loopE:; #### Load Alpha #### BROAD_DX MEMALPHA, xvec7; #### Multiply Alpha #### -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec13; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec9; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec13, xvec13; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec9, xvec9; #### Reverse the Results #### MOV_DX xvec15, xvec6; -REVS_DX xvec13, xvec15; -REVS_DX xvec6, xvec13; +REVS_DX xvec13, xvec15, xvec15; +REVS_DX xvec6, xvec13, xvec13; MOV_DX xvec11, xvec6; -REVS_DX xvec9, xvec11; -REVS_DX xvec6, xvec9; +REVS_DX xvec9, xvec11, xvec11; +REVS_DX xvec6, xvec9, xvec9; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1292,10 +1272,10 @@ JNE .L12_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec13; -ADD_DX 0*SIZE(C0, ldc, 1), xvec15; -ADD_DX 0*SIZE(C1), xvec9; -ADD_DX 0*SIZE(C1, ldc, 1), xvec11; +ADD_DX 0*SIZE(C0), xvec13, xvec13; +ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15; +ADD_DX 0*SIZE(C1), xvec9, xvec9; +ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11; #endif ST_DX xvec13, 0*SIZE(C0); ST_DX xvec15, 0*SIZE(C0, ldc, 1); @@ -1317,18 +1297,18 @@ JMP .L9_loopE; ALIGN_5 .L12_loopEx: #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -LDL_DX 0*SIZE(C0, ldc, 1), xvec12; -LDH_DX 1*SIZE(C0, ldc, 1), xvec12; -LDL_DX 0*SIZE(C1), xvec10; -LDH_DX 1*SIZE(C1), xvec10; -LDL_DX 0*SIZE(C1, ldc, 1), xvec8; -LDH_DX 1*SIZE(C1, ldc, 1), xvec8; -ADD_DX xvec14, xvec13; -ADD_DX xvec12, xvec15; -ADD_DX xvec10, xvec9; -ADD_DX xvec8, xvec11; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; +LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; +LDL_DX 0*SIZE(C1), xvec10, xvec10; +LDH_DX 1*SIZE(C1), xvec10, xvec10; +LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; +ADD_DX xvec14, xvec13, xvec13; +ADD_DX xvec12, xvec15, xvec15; +ADD_DX xvec10, xvec9, xvec9; +ADD_DX xvec8, xvec11, 
xvec11; #endif STL_DX xvec13, 0*SIZE(C0); STH_DX xvec13, 1*SIZE(C0); @@ -1455,12 +1435,12 @@ MUL_DY yvec15, yvec7, yvec15; #### Writing Back #### EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 0*SIZE(C0, ldc, 1), xvec0; -LDL_DX 0*SIZE(C1), xvec1; -LDH_DX 0*SIZE(C1, ldc, 1), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0; +LDL_DX 0*SIZE(C1), xvec1, xvec1; +LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 0*SIZE(C0, ldc, 1); @@ -1549,151 +1529,151 @@ ALIGN_5; LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 2 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 10*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 12*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 14*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 3 #### LD_DX 16*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 18*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 20*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 22*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, 
xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 4 #### LD_DX 24*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrbb; LD_DX 26*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 28*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 30*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $32*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; DECQ k; JG .L211_bodyB; ALIGN_5 @@ -1712,77 +1692,77 @@ ALIGN_5; LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 2 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; LD_DX 10*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 12*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 14*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, 
xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $16*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; .L212_loopE: #ifndef TRMMKERNEL @@ -1798,65 +1778,65 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $8*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; .L213_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec14; -MUL_DX xvec7, xvec13; -MUL_DX xvec7, xvec12; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec10; -MUL_DX xvec7, xvec9; -MUL_DX xvec7, xvec8; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec14, xvec14; +MUL_DX xvec7, xvec13, xvec13; +MUL_DX xvec7, xvec12, xvec12; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec10, xvec10; +MUL_DX xvec7, xvec9, xvec9; +MUL_DX xvec7, xvec8, xvec8; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; MOV_DX xvec14, xvec6; -REVS_DX xvec10, xvec14; -REVS_DX xvec6, xvec10; +REVS_DX xvec10, xvec14, xvec14; +REVS_DX xvec6, xvec10, xvec10; MOV_DX xvec13, xvec6; -REVS_DX xvec9, xvec13; -REVS_DX xvec6, xvec9; +REVS_DX xvec9, xvec13, xvec13; +REVS_DX xvec6, xvec9, xvec9; MOV_DX xvec12, xvec6; -REVS_DX xvec8, xvec12; -REVS_DX xvec6, xvec8; +REVS_DX xvec8, xvec12, xvec12; +REVS_DX xvec6, xvec8, xvec8; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1865,14 +1845,14 @@ JNE .L213_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 2*SIZE(C0), xvec10; -ADD_DX 4*SIZE(C0), xvec9; -ADD_DX 6*SIZE(C0), xvec8; -ADD_DX 0*SIZE(C1), xvec15; -ADD_DX 2*SIZE(C1), xvec14; -ADD_DX 4*SIZE(C1), xvec13; -ADD_DX 6*SIZE(C1), xvec12; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec10, xvec10; +ADD_DX 4*SIZE(C0), xvec9, xvec9; +ADD_DX 6*SIZE(C0), xvec8, xvec8; +ADD_DX 0*SIZE(C1), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec14, xvec14; +ADD_DX 4*SIZE(C1), xvec13, xvec13; +ADD_DX 6*SIZE(C1), xvec12, xvec12; #endif 
ST_DX xvec11, 0*SIZE(C0); ST_DX xvec10, 2*SIZE(C0); @@ -1900,18 +1880,18 @@ JMP .L21_loopE; ALIGN_5 .L213_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C0), xvec3; -LDH_DX 7*SIZE(C0), xvec3; -ADD_DX xvec0, xvec11; -ADD_DX xvec1, xvec10; -ADD_DX xvec2, xvec9; -ADD_DX xvec3, xvec8; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C0), xvec3, xvec3; +LDH_DX 7*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec11, xvec11; +ADD_DX xvec1, xvec10, xvec10; +ADD_DX xvec2, xvec9, xvec9; +ADD_DX xvec3, xvec8, xvec8; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); @@ -1922,18 +1902,18 @@ STH_DX xvec9, 5*SIZE(C0); STL_DX xvec8, 6*SIZE(C0); STH_DX xvec8, 7*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1), xvec5; -LDH_DX 3*SIZE(C1), xvec5; -LDL_DX 4*SIZE(C1), xvec6; -LDH_DX 5*SIZE(C1), xvec6; -LDL_DX 6*SIZE(C1), xvec7; -LDH_DX 7*SIZE(C1), xvec7; -ADD_DX xvec4, xvec15; -ADD_DX xvec5, xvec14; -ADD_DX xvec6, xvec13; -ADD_DX xvec7, xvec12; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1), xvec5, xvec5; +LDH_DX 3*SIZE(C1), xvec5, xvec5; +LDL_DX 4*SIZE(C1), xvec6, xvec6; +LDH_DX 5*SIZE(C1), xvec6, xvec6; +LDL_DX 6*SIZE(C1), xvec7, xvec7; +LDH_DX 7*SIZE(C1), xvec7, xvec7; +ADD_DX xvec4, xvec15, xvec15; +ADD_DX xvec5, xvec14, xvec14; +ADD_DX xvec6, xvec13, xvec13; +ADD_DX xvec7, xvec12, xvec12; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ -2000,79 +1980,79 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 2 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 3 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 10*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; 
+ADD_DX xvec5, xvec10, xvec10; #### Unroll time 4 #### LD_DX 12*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrbb; LD_DX 14*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $16*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; DECQ k; JG .L221_bodyB; ALIGN_5 @@ -2090,40 +2070,40 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 2 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; LD_DX 6*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; .L222_loopE: #ifndef TRMMKERNEL @@ -2139,37 +2119,37 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; .L223_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec14; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec10; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec14, xvec14; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec10, xvec10; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; MOV_DX xvec14, xvec6; -REVS_DX xvec10, xvec14; -REVS_DX xvec6, xvec10; +REVS_DX xvec10, xvec14, xvec14; +REVS_DX xvec6, xvec10, xvec10; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -2178,10 +2158,10 @@ JNE .L223_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 2*SIZE(C0), xvec10; -ADD_DX 0*SIZE(C1), xvec15; -ADD_DX 2*SIZE(C1), xvec14; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec10, xvec10; +ADD_DX 
0*SIZE(C1), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec14, xvec14; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec10, 2*SIZE(C0); @@ -2203,24 +2183,24 @@ JMP .L22_loopE; ALIGN_5 .L223_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -ADD_DX xvec0, xvec11; -ADD_DX xvec1, xvec10; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +ADD_DX xvec0, xvec11, xvec11; +ADD_DX xvec1, xvec10, xvec10; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); STL_DX xvec10, 2*SIZE(C0); STH_DX xvec10, 3*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1), xvec5; -LDH_DX 3*SIZE(C1), xvec5; -ADD_DX xvec4, xvec15; -ADD_DX xvec5, xvec14; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1), xvec5, xvec5; +LDH_DX 3*SIZE(C1), xvec5, xvec5; +ADD_DX xvec4, xvec15, xvec15; +ADD_DX xvec5, xvec14, xvec14; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ -2278,38 +2258,38 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 2 #### LD_DX 2*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 3 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 4 #### LD_DX 6*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L231_bodyB; @@ -2328,20 +2308,20 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 2 #### LD_DX 2*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; ADDQ $4*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL @@ -2357,21 +2337,21 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; ADDQ $2*SIZE, ptrbb; 
.L233_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec11; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec11, xvec11; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -2380,8 +2360,8 @@ JNE .L233_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 0*SIZE(C1), xvec15; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 0*SIZE(C1), xvec15, xvec15; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec15, 0*SIZE(C1); @@ -2401,16 +2381,16 @@ JMP .L23_loopE; ALIGN_5 .L233_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -ADD_DX xvec0, xvec11; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +ADD_DX xvec0, xvec11, xvec11; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -ADD_DX xvec4, xvec15; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ -2462,23 +2442,23 @@ ALIGN_5 .L241_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 1*SIZE(ptrba), xvec1; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; BROAD_DX 2*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 3*SIZE(ptrba), xvec1; LD_DX 6*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -2496,13 +2476,13 @@ ALIGN_5 .L242_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 1*SIZE(ptrba), xvec1; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L242_loopE: @@ -2517,18 +2497,18 @@ ALIGN_5 .L243_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $1*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L243_loopE: BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; +MUL_DX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 0*SIZE(C1), xvec0; -ADD_DX xvec0, xvec15; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 0*SIZE(C1), xvec0, xvec0; +ADD_DX xvec0, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 0*SIZE(C1); @@ -2705,10 +2685,10 @@ ALIGN_5 EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0), xvec13; -ADD_DX 4*SIZE(C0), xvec14; -ADD_DX 6*SIZE(C0), xvec12; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0), xvec13, xvec13; +ADD_DX 4*SIZE(C0), xvec14, xvec14; +ADD_DX 6*SIZE(C0), xvec12, xvec12; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec13, 2*SIZE(C0); @@ -2733,18 +2713,18 @@ ALIGN_5 EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; #ifndef TRMMKERNEL 
-LDL_DX 0*SIZE(C0), xvec11; -LDH_DX 1*SIZE(C0), xvec11; -LDL_DX 2*SIZE(C0), xvec10; -LDH_DX 3*SIZE(C0), xvec10; -LDL_DX 4*SIZE(C0), xvec9; -LDH_DX 5*SIZE(C0), xvec9; -LDL_DX 6*SIZE(C0), xvec8; -LDH_DX 7*SIZE(C0), xvec8; -ADD_DX xvec11, xvec15; -ADD_DX xvec10, xvec13; -ADD_DX xvec9, xvec14; -ADD_DX xvec8, xvec12; +LDL_DX 0*SIZE(C0), xvec11, xvec11; +LDH_DX 1*SIZE(C0), xvec11, xvec11; +LDL_DX 2*SIZE(C0), xvec10, xvec10; +LDH_DX 3*SIZE(C0), xvec10, xvec10; +LDL_DX 4*SIZE(C0), xvec9, xvec9; +LDH_DX 5*SIZE(C0), xvec9, xvec9; +LDL_DX 6*SIZE(C0), xvec8, xvec8; +LDH_DX 7*SIZE(C0), xvec8, xvec8; +ADD_DX xvec11, xvec15, xvec15; +ADD_DX xvec10, xvec13, xvec13; +ADD_DX xvec9, xvec14, xvec14; +ADD_DX xvec8, xvec12, xvec12; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2878,8 +2858,8 @@ ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0), xvec14; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0), xvec14, xvec14; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec14, 2*SIZE(C0); @@ -2900,12 +2880,12 @@ ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec13; -LDH_DX 1*SIZE(C0), xvec13; -LDL_DX 2*SIZE(C0), xvec12; -LDH_DX 3*SIZE(C0), xvec12; -ADD_DX xvec13, xvec15; -ADD_DX xvec12, xvec14; +LDL_DX 0*SIZE(C0), xvec13, xvec13; +LDH_DX 1*SIZE(C0), xvec13, xvec13; +LDL_DX 2*SIZE(C0), xvec12, xvec12; +LDH_DX 3*SIZE(C0), xvec12, xvec12; +ADD_DX xvec13, xvec15, xvec15; +ADD_DX xvec12, xvec14, xvec14; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2959,23 +2939,23 @@ ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; LD_DX 4*SIZE(ptrba), xvec4; BROAD_DX 2*SIZE(ptrbb), xvec5; -MUL_DX xvec4, xvec5; -ADD_DX xvec5, xvec15; +MUL_DX xvec4, xvec5, xvec5; +ADD_DX xvec5, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec6; BROAD_DX 3*SIZE(ptrbb), xvec7; -MUL_DX xvec6, xvec7; -ADD_DX xvec7, xvec15; +MUL_DX xvec6, xvec7, xvec7; +ADD_DX xvec7, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; @@ -2993,13 +2973,13 @@ ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L332_loopE: @@ -3014,18 +2994,18 @@ ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L333_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; +MUL_DX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -ADD_DX xvec14, xvec15; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +ADD_DX xvec14, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3074,25 +3054,25 @@ SARQ $2, k; JLE .L341_loopE; ALIGN_5 .L341_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd 
xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; -movsd 1*SIZE(ptrba), xvec0; -movsd 1*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 1*SIZE(ptrba), xvec0; +vmovsd 1*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; -movsd 2*SIZE(ptrba), xvec0; -movsd 2*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 2*SIZE(ptrba), xvec0; +vmovsd 2*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; -movsd 3*SIZE(ptrba), xvec0; -movsd 3*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 3*SIZE(ptrba), xvec0; +vmovsd 3*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3108,15 +3088,15 @@ TEST $2, %rax; JLE .L342_loopE; ALIGN_5 .L342_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; -movsd 1*SIZE(ptrba), xvec0; -movsd 1*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 1*SIZE(ptrba), xvec0; +vmovsd 1*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $2*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3130,20 +3110,20 @@ TEST $1, %rax; JLE .L343_loopE; ALIGN_5 .L343_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $1*SIZE, ptrba; addq $1*SIZE, ptrbb; .L343_loopE: #### Writing Back #### -movsd MEMALPHA, xvec7; -mulsd xvec7, xvec15; +vmovsd MEMALPHA, xvec7; +vmulsd xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -movsd 0*SIZE(C0), xvec0; -addsd xvec0, xvec15; +vmovsd 0*SIZE(C0), xvec0; +vaddsd xvec0, xvec15, xvec15; #endif movsd xvec15, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -3170,6 +3150,9 @@ movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; + +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 59458effe..20ddcaa8e 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -146,75 +146,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MOVQ movq #define XOR_SY vxorps -#define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_SX vxorps #define LD_SY vmovaps -#define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_SX movlps +#define LD_SX vmovaps +#define LDL_SX vmovlps #define LDL_SY vmovlps -#define LDH_SX movhps +#define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps -#define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_SX movlps +#define ST_SX vmovaps +#define STL_SX vmovlps #define STL_SY vmovlps -#define STH_SX movhps +#define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup -#define EDUP_DY vmovddup +#define EDUP_SX vmovsldup +#define ODUP_SX vmovshdup #define ADD_SY vaddps -#define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_SX vaddps #define ADD1_DY vaddpd -#define ADD2_DY vaddpd -#define ADDSUB_DY vaddsubpd #define ADDSUB_SY vaddsubps #define MUL_SY vmulps -#define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_SX vmulps #define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd -#define SHUF_SX pshufd +#define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps -#define VPERMILP_DY vpermilpd #define BROAD_SY vbroadcastss -#define BROAD_DY vbroadcastsd #define BROAD_SX vbroadcastss -#define BROAD_DX movddup #define MOV_SY vmovaps -#define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_SX vmovaps #define REVS_SY vshufps -#define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_SX vshufps #define EXTRA_SY vextractf128 -#define EXTRA_DY vextractf128 PROLOGUE @@ -260,6 +238,8 @@ movq old_offset, %r11 #endif #endif +vzeroupper + vmovlps ALPHA, MEMALPHA movq old_bm, bm movq old_bn, bn @@ -864,125 +844,125 @@ ALIGN_4 #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 8*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 2 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 12*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; 
-ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 16*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 16*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 3 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 20*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 24*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 24*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; ADDQ $16*SIZE, ptrba; #### Unroll time 4 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 28*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $32*SIZE, ptrbb; LD_SX 0*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; DECQ k; JG .L8_bodyB; ALIGN_4 @@ -997,65 +977,65 @@ ALIGN_4 .L9_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX 
xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 8*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 2 #### ADDQ $8*SIZE, ptrba; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 12*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrbb; LD_SX 0*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; .L9_loopE: #ifndef TRMMKERNEL @@ -1068,57 +1048,57 @@ ALIGN_4 .L10_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $4*SIZE, ptrba; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrbb; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; .L10_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; -MUL_SX xvec7, xvec11; -MUL_SX xvec7, xvec10; -MUL_SX xvec7, xvec9; -MUL_SX xvec7, xvec8; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec7, xvec8, xvec8; #### Reverse Result #### MOV_SX xvec15, xvec7; -REVS_SX $0xe4, xvec13, xvec15; -REVS_SX $0xe4, xvec7, xvec13; +REVS_SX $0xe4, xvec13, xvec15, xvec15; +REVS_SX $0xe4, 
xvec7, xvec13, xvec13; MOV_SX xvec14, xvec7; -REVS_SX $0xe4, xvec12, xvec14; -REVS_SX $0xe4, xvec7, xvec12; +REVS_SX $0xe4, xvec12, xvec14, xvec14; +REVS_SX $0xe4, xvec7, xvec12, xvec12; MOV_SX xvec11, xvec7; -REVS_SX $0xe4, xvec9, xvec11; -REVS_SX $0xe4, xvec7, xvec9; +REVS_SX $0xe4, xvec9, xvec11, xvec11; +REVS_SX $0xe4, xvec7, xvec9, xvec9; MOV_SX xvec10, xvec7; -REVS_SX $0xe4, xvec8, xvec10; -REVS_SX $0xe4, xvec7, xvec8; +REVS_SX $0xe4, xvec8, xvec10, xvec10; +REVS_SX $0xe4, xvec7, xvec8, xvec8; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1127,14 +1107,14 @@ JNE .L10_loopEx; ALIGN_4 LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL -ADD_SX 0*SIZE(C0), xvec15; -ADD_SX 0*SIZE(C0, ldc, 1), xvec14; -ADD_SX 0*SIZE(C0, ldc, 2), xvec13; -ADD_SX 0*SIZE(C0, %rax, 1), xvec12; -ADD_SX 0*SIZE(C1), xvec11; -ADD_SX 0*SIZE(C1, ldc, 1), xvec10; -ADD_SX 0*SIZE(C1, ldc, 2), xvec9; -ADD_SX 0*SIZE(C1, %rax, 1), xvec8; +ADD_SX 0*SIZE(C0), xvec15, xvec15; +ADD_SX 0*SIZE(C0, ldc,1), xvec14, xvec14; +ADD_SX 0*SIZE(C0, ldc,2), xvec13, xvec13; +ADD_SX 0*SIZE(C0, %rax,1), xvec12, xvec12; +ADD_SX 0*SIZE(C1), xvec11, xvec11; +ADD_SX 0*SIZE(C1, ldc,1), xvec10, xvec10; +ADD_SX 0*SIZE(C1, ldc,2), xvec9, xvec9; +ADD_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; #endif ST_SX xvec15, 0*SIZE(C0); ST_SX xvec14, 0*SIZE(C0, ldc, 1); @@ -1161,30 +1141,30 @@ ALIGN_4 .L10_loopEx: LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec7; -LDH_SX 2*SIZE(C0), xvec7; -LDL_SX 0*SIZE(C0, ldc, 1), xvec6; -LDH_SX 2*SIZE(C0, ldc, 1), xvec6; -LDL_SX 0*SIZE(C0, ldc, 2), xvec5; -LDH_SX 2*SIZE(C0, ldc, 2), xvec5; -LDL_SX 0*SIZE(C0, %rax, 1), xvec4; -LDH_SX 2*SIZE(C0, %rax, 1), xvec4; -LDL_SX 0*SIZE(C1), xvec3; -LDH_SX 2*SIZE(C1), xvec3; -LDL_SX 0*SIZE(C1, ldc, 1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 0*SIZE(C1, ldc, 2), xvec1; -LDH_SX 2*SIZE(C1, ldc, 2), xvec1; -LDL_SX 0*SIZE(C1, %rax, 1), xvec0; -LDH_SX 2*SIZE(C1, %rax, 1), xvec0; -ADD_SX xvec7, xvec15; -ADD_SX xvec6, xvec14; -ADD_SX xvec5, xvec13; -ADD_SX xvec4, xvec12; -ADD_SX xvec3, xvec11; -ADD_SX xvec2, xvec10; -ADD_SX xvec1, xvec9; -ADD_SX xvec0, xvec8; +LDL_SX 0*SIZE(C0), xvec7, xvec7; +LDH_SX 2*SIZE(C0), xvec7, xvec7; +LDL_SX 0*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; +LDL_SX 0*SIZE(C0, ldc, 2), xvec5, xvec5; +LDH_SX 2*SIZE(C0, ldc, 2), xvec5, xvec5; +LDL_SX 0*SIZE(C0, %rax, 1), xvec4, xvec4; +LDH_SX 2*SIZE(C0, %rax, 1), xvec4, xvec4; +LDL_SX 0*SIZE(C1), xvec3, xvec3; +LDH_SX 2*SIZE(C1), xvec3, xvec3; +LDL_SX 0*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 2), xvec1, xvec1; +LDH_SX 2*SIZE(C1, ldc, 2), xvec1, xvec1; +LDL_SX 0*SIZE(C1, %rax, 1), xvec0, xvec0; +LDH_SX 2*SIZE(C1, %rax, 1), xvec0, xvec0; +ADD_SX xvec7, xvec15, xvec15; +ADD_SX xvec6, xvec14, xvec14; +ADD_SX xvec5, xvec13, xvec13; +ADD_SX xvec4, xvec12, xvec12; +ADD_SX xvec3, xvec11, xvec11; +ADD_SX xvec2, xvec10, xvec10; +ADD_SX xvec1, xvec9, xvec9; +ADD_SX xvec0, xvec8, xvec8; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -1258,63 +1238,63 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, 
xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0x44, xvec0, xvec1; EDUP_SX 16*SIZE(ptrbb), xvec2; ODUP_SX 16*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; ODUP_SX 20*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 24*SIZE(ptrbb), xvec2; ODUP_SX 24*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; ODUP_SX 28*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; @@ -1334,32 +1314,32 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1376,40 +1356,40 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; 
+MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L13_loopE: LEAQ (ldc,ldc,2),%rax; #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec11; -LDH_SX 0*SIZE(C0, ldc, 2), xvec11; -LDL_SX 0*SIZE(C0, ldc, 1), xvec10; -LDH_SX 0*SIZE(C0, %rax, 1), xvec10; -LDL_SX 0*SIZE(C1), xvec9; -LDH_SX 0*SIZE(C1, ldc, 2), xvec9; -LDL_SX 0*SIZE(C1, ldc, 1), xvec8; -LDH_SX 0*SIZE(C1, %rax, 1), xvec8; -ADD_SX xvec11, xvec15; -ADD_SX xvec10, xvec14; -ADD_SX xvec9, xvec13; -ADD_SX xvec8, xvec12; +LDL_SX 0*SIZE(C0), xvec11, xvec11; +LDH_SX 0*SIZE(C0, ldc, 2), xvec11, xvec11; +LDL_SX 0*SIZE(C0, ldc, 1), xvec10, xvec10; +LDH_SX 0*SIZE(C0, %rax, 1), xvec10, xvec10; +LDL_SX 0*SIZE(C1), xvec9, xvec9; +LDH_SX 0*SIZE(C1, ldc, 2), xvec9, xvec9; +LDL_SX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; +ADD_SX xvec11, xvec15, xvec15; +ADD_SX xvec10, xvec14, xvec14; +ADD_SX xvec9, xvec13, xvec13; +ADD_SX xvec8, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 2); @@ -1471,35 +1451,35 @@ ALIGN_4 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; LD_SX 8*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; LD_SX 24*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 28*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; @@ -1517,19 +1497,19 @@ ALIGN_4 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; LD_SX 8*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1544,18 
+1524,18 @@ ALIGN_4 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $1, ptrba; ADDQ $4, ptrbb; .L16_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; LEAQ (ldc,ldc,2),%rax; SHUF_SX $0xff, xvec15, xvec13; @@ -1676,96 +1656,96 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 4*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 8*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 12*SIZE(ptrba), xvec1; #### Unroll time 2 #### ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 8*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 16*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 20*SIZE(ptrba), xvec1; #### Unroll time 3 #### ODUP_SX 8*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 12*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; ADDQ $16*SIZE, ptrbb; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX 
xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 24*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 28*SIZE(ptrba), xvec1; ADDQ $32*SIZE, ptrba; @@ -1773,32 +1753,32 @@ ADDQ $32*SIZE, ptrba; ODUP_SX -4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 0*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 4*SIZE(ptrba), xvec1; DECQ k; JG .L211_bodyB; @@ -1816,33 +1796,33 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 4*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 8*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 12*SIZE(ptrba), xvec1; ADDQ $16*SIZE, ptrba; @@ -1850,31 +1830,31 @@ ADDQ $16*SIZE, ptrba; ODUP_SX -4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX 
xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 0*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 4*SIZE(ptrba), xvec1; .L212_loopE: @@ -1889,70 +1869,70 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; MOV_SX xvec4, xvec6; ADDQ $8*SIZE, ptrba; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; .L213_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; -MUL_SX xvec7, xvec11; -MUL_SX xvec7, xvec10; -MUL_SX xvec7, xvec9; -MUL_SX xvec7, xvec8; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec7, xvec8, xvec8; #### Writing Back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 0*SIZE(C0, ldc, 1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 4*SIZE(C0, ldc, 1), xvec3; -LDH_SX 6*SIZE(C1, ldc, 1), xvec3; -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -LDL_SX 0*SIZE(C1, ldc, 1), xvec6; -LDH_SX 2*SIZE(C0, ldc, 1), xvec6; -LDL_SX 4*SIZE(C1, ldc, 1), xvec7; -LDH_SX 6*SIZE(C0, ldc, 1), xvec7; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; -ADD_SX xvec6, xvec9; -ADD_SX xvec7, xvec8; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 0*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 4*SIZE(C0, ldc, 1), xvec3, xvec3; +LDH_SX 6*SIZE(C1, ldc, 1), xvec3, xvec3; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +LDL_SX 0*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; +LDL_SX 4*SIZE(C1, ldc, 1), xvec7, xvec7; +LDH_SX 6*SIZE(C0, ldc, 1), xvec7, xvec7; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +ADD_SX xvec6, xvec9, xvec9; +ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ 
-2028,64 +2008,64 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec1; EDUP_SX 12*SIZE(ptrbb), xvec2; ODUP_SX 12*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15 +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15 SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2106,32 +2086,32 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13 -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13 +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L222_loopE: @@ -2148,39 +2128,39 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 
0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L223_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 0*SIZE(C0, ldc, 1), xvec1; -LDH_SX 2*SIZE(C1, ldc, 1), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C0), xvec2; -LDL_SX 0*SIZE(C1, ldc, 1), xvec3; -LDH_SX 2*SIZE(C0, ldc, 1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 0*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_SX 2*SIZE(C1, ldc, 1), xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C0), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SX 2*SIZE(C0, ldc, 1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2242,35 +2222,35 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 4*SIZE(ptrbb), xvec6; ODUP_SX 4*SIZE(ptrbb), xvec7; -MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 12*SIZE(ptrbb), xvec6; ODUP_SX 12*SIZE(ptrbb), xvec7; -MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2290,18 +2270,18 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 4*SIZE(ptrbb), xvec6; ODUP_SX 4*SIZE(ptrbb), xvec7; -MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX 
xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2318,10 +2298,10 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2392,23 +2372,23 @@ ALIGN_4 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec2; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec2, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec2, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec4; LD_SX 8*SIZE(ptrbb), xvec5; -MUL_SX xvec4, xvec5; -ADD_SX xvec5, xvec15; +MUL_SX xvec4, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec6; LD_SX 12*SIZE(ptrbb), xvec7; -MUL_SX xvec6, xvec7; -ADD_SX xvec7, xvec15; +MUL_SX xvec6, xvec7, xvec7; +ADD_SX xvec7, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -2425,13 +2405,13 @@ ALIGN_4 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec2; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec2, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec2, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2446,14 +2426,14 @@ ALIGN_4; .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $1*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L243_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; +MUL_SX xvec7, xvec15, xvec15; SHUF_SX $0xff, xvec15, xvec14; SHUF_SX $0xaa, xvec15, xvec13; SHUF_SX $0x55, xvec15, xvec12; @@ -2546,34 +2526,34 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2581,34 +2561,34 @@ LD_SX 16*SIZE(ptrba), xvec0; LD_SX 20*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, 
xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 24*SIZE(ptrba), xvec0; LD_SX 28*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2630,34 +2610,34 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2676,40 +2656,40 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L313_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; #### Writing Back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C0), xvec2; -LDL_SX 4*SIZE(C1), xvec3; -LDH_SX 6*SIZE(C0), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), 
xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C0), xvec2, xvec2; +LDL_SX 4*SIZE(C1), xvec3, xvec3; +LDH_SX 6*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2774,35 +2754,35 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 12*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2822,18 +2802,18 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2850,25 +2830,25 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L323_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -LDH_SX 2*SIZE(C0), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +LDH_SX 2*SIZE(C0), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2928,19 +2908,19 @@ LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 -MUL_SX xvec0, xvec2; # c00, c10 -ADD_SX xvec2, xvec15; -MUL_SX xvec0, 
xvec3; # C01, c11 -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -2959,10 +2939,10 @@ LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 -MUL_SX xvec0, xvec2; # c00, c10 -ADD_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; # C01, c11 -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2985,7 +2965,7 @@ mulss xvec0, xvec2; addss xvec2, xvec15; mulss xvec1, xvec3; SHUF_SX $0xe1, xvec3, xvec4; -ADD_SX xvec4, xvec15; +ADD_SX xvec4, xvec15, xvec15; movss 1*SIZE(ptrbb), xvec5; XOR_SY yvec6, yvec6, yvec6; @@ -2994,26 +2974,26 @@ mulss xvec0, xvec5; addss xvec5, xvec14; mulss xvec1, xvec6; SHUF_SX $0xe1, xvec6, xvec7; -ADD_SX xvec7, xvec14 +ADD_SX xvec7, xvec14, xvec14 ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L333_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; SHUF_SX $0xee, xvec15, xvec13; SHUF_SX $0xee, xvec14, xvec12; SHUF_SX $0x44, xvec15, xvec11; SHUF_SX $0x44, xvec14, xvec10; -ADD_SX xvec13, xvec11; -ADD_SX xvec12, xvec10; +ADD_SX xvec13, xvec11, xvec11; +ADD_SX xvec12, xvec10, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -ADD_SX xvec0, xvec11; -ADD_SX xvec1, xvec10; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +ADD_SX xvec0, xvec11, xvec11; +ADD_SX xvec1, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C0); STL_SX xvec10, 0*SIZE(C1); @@ -3305,14 +3285,14 @@ SHUF_SX $0xee, xvec15, xvec12; SHUF_SX $0x44, xvec14, xvec11; SHUF_SX $0xee, xvec14, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDL_SX 2*SIZE(C0), xvec1; -LDL_SX 4*SIZE(C0), xvec2; -LDL_SX 6*SIZE(C0), xvec3; -ADD_SX xvec0, xvec13; -ADD_SX xvec1, xvec12; -ADD_SX xvec2, xvec11; -ADD_SX xvec3, xvec10; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDL_SX 2*SIZE(C0), xvec1, xvec1; +LDL_SX 4*SIZE(C0), xvec2, xvec2; +LDL_SX 6*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec13, xvec13; +ADD_SX xvec1, xvec12, xvec12; +ADD_SX xvec2, xvec11, xvec11; +ADD_SX xvec3, xvec10, xvec10; #endif STL_SX xvec13, 0*SIZE(C0); STL_SX xvec12, 2*SIZE(C0); @@ -3368,23 +3348,23 @@ ALIGN_4 .L421_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 1*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 3*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; @@ -3401,13 +3381,13 @@ ALIGN_4 .L422_bodyB: LD_SX 0*SIZE(ptrba), xvec0; 
BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 1*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -3422,19 +3402,19 @@ ALIGN_4 .L423_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L423_loopE: #### Writing back #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; +MUL_SX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -3485,37 +3465,37 @@ SARQ $2, k; JLE .L431_loopE; ALIGN_4 .L431_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; -movss 2*SIZE(ptrba), xvec3; -movss 3*SIZE(ptrba), xvec4; -movss 1*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; +vmovss 2*SIZE(ptrba), xvec3; +vmovss 3*SIZE(ptrba), xvec4; +vmovss 1*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; -movss 4*SIZE(ptrba), xvec0; -movss 5*SIZE(ptrba), xvec1; -movss 2*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 4*SIZE(ptrba), xvec0; +vmovss 5*SIZE(ptrba), xvec1; +vmovss 2*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; -movss 6*SIZE(ptrba), xvec3; -movss 7*SIZE(ptrba), xvec4; -movss 3*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; +vmovss 6*SIZE(ptrba), xvec3; +vmovss 7*SIZE(ptrba), xvec4; +vmovss 3*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; addq $8*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3530,21 +3510,21 @@ TEST $2, kkk; JLE .L432_loopE; ALIGN_4 .L432_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; -movss 2*SIZE(ptrba), xvec3; -movss 3*SIZE(ptrba), xvec4; -movss 1*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; +vmovss 2*SIZE(ptrba), xvec3; +vmovss 3*SIZE(ptrba), xvec4; +vmovss 1*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; addq $4*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3557,28 +3537,28 @@ TEST $1, kkk; JLE 
.L433_loopE; ALIGN_4 .L433_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; addq $2*SIZE, ptrba; addq $1*SIZE, ptrbb; .L433_loopE: #### Writing Back #### -movss MEMALPHA, xvec7; -mulss xvec7, xvec15; -mulss xvec7, xvec14; +vmovss MEMALPHA, xvec7; +vmulss xvec7, xvec15, xvec15; +vmulss xvec7, xvec14, xvec14; #ifndef TRMMKERNEL -addss 0*SIZE(C0), xvec15; -addss 1*SIZE(C0), xvec14; +vaddss 0*SIZE(C0), xvec15, xvec15; +vaddss 1*SIZE(C0), xvec14, xvec14; #endif -movss xvec15, 0*SIZE(C0); -movss xvec14, 1*SIZE(C0); +vmovss xvec15, 0*SIZE(C0); +vmovss xvec14, 1*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; @@ -3625,25 +3605,25 @@ SARQ $2, k; JLE .L441_loopE; ALIGN_4 .L441_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; -movss 1*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 1*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; -movss 2*SIZE(ptrba), xvec0; -movss 2*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 2*SIZE(ptrba), xvec0; +vmovss 2*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; -movss 3*SIZE(ptrba), xvec0; -movss 3*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 3*SIZE(ptrba), xvec0; +vmovss 3*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3658,15 +3638,15 @@ TEST $2, kkk; JLE .L442_loopE; ALIGN_4 .L442_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; -movss 1*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 1*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $2*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3679,21 +3659,21 @@ TEST $1, kkk; JLE .L443_loopE; ALIGN_4 .L443_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $1*SIZE, ptrba; addq $1*SIZE, ptrbb; .L443_loopE: #### Writing Back #### -movss MEMALPHA, xvec7; -mulss xvec7, xvec15; +vmovss MEMALPHA, xvec7; +vmulss xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -addss 0*SIZE(C0), xvec15; +vaddss 0*SIZE(C0), xvec15, xvec15; #endif -movss xvec15, 0*SIZE(C0); +vmovss xvec15, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; @@ -3711,6 +3691,7 @@ MOV bk, k; SALQ $2, k; ADDQ k, bb; ADDQ ldc, C; + .L40_loopE: movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; @@ -3718,6 +3699,9 
@@ movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; + +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi @@ -3732,6 +3716,7 @@ movq 40(%rsp), %r15; movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif + addq $STACKSIZE, %rsp; ret diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index f6f9f707f..9f6fb8a5f 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -148,74 +148,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef MOVQ #define MOVQ movq -#define XOR_SY vxorps #define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_DX vxorpd -#define LD_SY vmovaps #define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd +#define LD_DX vmovapd #define LDL_DY vmovlpd -#define LDL_DX movlpd +#define LDL_DX vmovlpd #define LDH_DY vmovhpd -#define LDH_DX movhpd +#define LDH_DX vmovhpd -#define ST_SY vmovaps #define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd +#define ST_DX vmovapd #define STL_DY vmovlpd -#define STL_DX movlpd +#define STL_DX vmovlpd #define STH_DY vmovhpd -#define STH_DX movhpd +#define STH_DX vmovhpd -#define EDUP_SY vmovsldup -#define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup #define EDUP_DY vmovddup -#define ADD_SY vaddps #define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_DX vaddpd #define SUB_DY vsubpd -#define SUB_DX subpd +#define SUB_DX vsubpd #define ADDSUB_DY vaddsubpd -#define ADDSUB_DX addsubpd -#define ADDSUB_SY vaddsubps +#define ADDSUB_DX vaddsubpd -#define MUL_SY vmulps #define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_DX vmulpd -#define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd -#define SHUF_SX pshufd +#define SHUF_DX vpshufd -#define VPERMILP_SY vpermilps -#define VPERMILP_SX vpermilps #define VPERMILP_DY vpermilpd -#define BROAD_SY vbroadcastss #define BROAD_DY vbroadcastsd -#define BROAD_SX vbroadcastss -#define BROAD_DX movddup +#define BROAD_DX vmovddup -#define MOV_SY vmovaps #define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_DX vmovapd -#define REVS_SY vshufps #define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_DX vmovsd #define EXTRA_DY vextractf128 @@ -282,6 +257,8 @@ movq old_offset, %r11; #endif #endif +vzeroupper + vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm @@ -1373,14 +1350,14 @@ EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0, ldc, 1), xvec7; -ADD_DX 0*SIZE(C0, ldc, 1), xvec13; -ADD_DX 2*SIZE(C0), xvec5; -ADD_DX 0*SIZE(C1), xvec14; -ADD_DX 2*SIZE(C1, ldc, 1), xvec6; -ADD_DX 0*SIZE(C1, ldc, 1), xvec12; -ADD_DX 2*SIZE(C1), xvec4; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0, ldc, 1), xvec7, xvec7; +ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; +ADD_DX 2*SIZE(C0), xvec5, xvec5; +ADD_DX 0*SIZE(C1), xvec14, xvec14; +ADD_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DX 0*SIZE(C1, ldc, 1), xvec12, xvec12; +ADD_DX 2*SIZE(C1), xvec4, xvec4; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec7, 2*SIZE(C0, ldc, 1); @@ -1410,18 +1387,18 @@ EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $2, yvec12, xvec4; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0, ldc, 1), xvec1; -LDH_DX 3*SIZE(C0, ldc, 
1), xvec1; -LDL_DX 0*SIZE(C0, ldc, 1), xvec2; -LDH_DX 1*SIZE(C0, ldc, 1), xvec2; -LDL_DX 2*SIZE(C0), xvec3; -LDH_DX 3*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec13; -ADD_DX xvec3, xvec5; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DX 3*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DX 1*SIZE(C0, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C0), xvec3, xvec3; +LDH_DX 3*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec13, xvec13; +ADD_DX xvec3, xvec5, xvec5; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -1432,18 +1409,18 @@ STH_DX xvec13, 1*SIZE(C0, ldc, 1); STL_DX xvec6, 2*SIZE(C0); STH_DX xvec6, 3*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec0; -LDH_DX 1*SIZE(C1), xvec0; -LDL_DX 2*SIZE(C1, ldc, 1), xvec1; -LDH_DX 3*SIZE(C1, ldc, 1), xvec1; -LDL_DX 0*SIZE(C1, ldc, 1), xvec2; -LDH_DX 1*SIZE(C1, ldc, 1), xvec2; -LDL_DX 2*SIZE(C1), xvec3; -LDH_DX 3*SIZE(C1), xvec3; -ADD_DX xvec0, xvec14; -ADD_DX xvec1, xvec6; -ADD_DX xvec2, xvec12; -ADD_DX xvec3, xvec4; +LDL_DX 0*SIZE(C1), xvec0, xvec0; +LDH_DX 1*SIZE(C1), xvec0, xvec0; +LDL_DX 2*SIZE(C1, ldc, 1), xvec1, xvec1; +LDH_DX 3*SIZE(C1, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_DX 1*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C1), xvec3, xvec3; +LDH_DX 3*SIZE(C1), xvec3, xvec3; +ADD_DX xvec0, xvec14, xvec14; +ADD_DX xvec1, xvec6, xvec6; +ADD_DX xvec2, xvec12, xvec12; +ADD_DX xvec3, xvec4, xvec4; #endif STL_DX xvec14, 0*SIZE(C1); STH_DX xvec14, 1*SIZE(C1); @@ -1680,18 +1657,18 @@ ADD2_DY yvec4, yvec14, yvec14; EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 0*SIZE(C0, ldc, 1), xvec1; -LDH_DX 1*SIZE(C0, ldc, 1), xvec1; -LDL_DX 0*SIZE(C1), xvec2; -LDH_DX 1*SIZE(C1), xvec2; -LDL_DX 0*SIZE(C1, ldc, 1), xvec3; -LDH_DX 1*SIZE(C1, ldc, 1), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 0*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DX 1*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C1), xvec2, xvec2; +LDH_DX 1*SIZE(C1), xvec2, xvec2; +LDL_DX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_DX 1*SIZE(C1, ldc, 1), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2063,14 +2040,14 @@ JNE .L213_loopEx; ALIGN_5 #### Writing back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0),xvec15; -ADD_DX 2*SIZE(C1),xvec7; -ADD_DX 4*SIZE(C0),xvec14; -ADD_DX 6*SIZE(C1),xvec6; -ADD_DX 0*SIZE(C1),xvec13; -ADD_DX 2*SIZE(C0),xvec5; -ADD_DX 4*SIZE(C1),xvec12; -ADD_DX 6*SIZE(C0),xvec4; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec7, xvec7; +ADD_DX 4*SIZE(C0), xvec14, xvec14; +ADD_DX 6*SIZE(C1), xvec6, xvec6; +ADD_DX 0*SIZE(C1), xvec13, xvec13; +ADD_DX 2*SIZE(C0), xvec5, xvec5; +ADD_DX 4*SIZE(C1), xvec12, xvec12; +ADD_DX 6*SIZE(C0), xvec4, xvec4; #endif ST_DX xvec15,0*SIZE(C0); ST_DX xvec7,2*SIZE(C1); @@ -2098,18 +2075,18 @@ JMP .L21_loopE; ALIGN_5 .L213_loopEx: #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C1), xvec1; -LDH_DX 3*SIZE(C1), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C1), xvec3; -LDH_DX 7*SIZE(C1), xvec3; 
-ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C1), xvec1, xvec1; +LDH_DX 3*SIZE(C1), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C1), xvec3, xvec3; +LDH_DX 7*SIZE(C1), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2120,18 +2097,18 @@ STH_DX xvec14, 5*SIZE(C0); STL_DX xvec6, 6*SIZE(C1); STH_DX xvec6, 7*SIZE(C1); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec3; -LDH_DX 1*SIZE(C1), xvec3; -LDL_DX 2*SIZE(C0), xvec2; -LDH_DX 3*SIZE(C0), xvec2; -LDL_DX 4*SIZE(C1), xvec1; -LDH_DX 5*SIZE(C1), xvec1; -LDL_DX 6*SIZE(C0), xvec0; -LDH_DX 7*SIZE(C0), xvec0; -ADD_DX xvec3, xvec13; -ADD_DX xvec2, xvec5; -ADD_DX xvec1, xvec12; -ADD_DX xvec0, xvec4; +LDL_DX 0*SIZE(C1), xvec3, xvec3; +LDH_DX 1*SIZE(C1), xvec3, xvec3; +LDL_DX 2*SIZE(C0), xvec2, xvec2; +LDH_DX 3*SIZE(C0), xvec2, xvec2; +LDL_DX 4*SIZE(C1), xvec1, xvec1; +LDH_DX 5*SIZE(C1), xvec1, xvec1; +LDL_DX 6*SIZE(C0), xvec0, xvec0; +LDH_DX 7*SIZE(C0), xvec0, xvec0; +ADD_DX xvec3, xvec13, xvec13; +ADD_DX xvec2, xvec5, xvec5; +ADD_DX xvec1, xvec12, xvec12; +ADD_DX xvec0, xvec4, xvec4; #endif STL_DX xvec13, 0*SIZE(C1); STH_DX xvec13, 1*SIZE(C1); @@ -2384,18 +2361,18 @@ EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec13, xvec5; #### Write back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C1), xvec1; -LDH_DX 3*SIZE(C1), xvec1; -LDL_DX 0*SIZE(C1), xvec2; -LDH_DX 1*SIZE(C1), xvec2; -LDL_DX 2*SIZE(C0), xvec3; -LDH_DX 3*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec13; -ADD_DX xvec3, xvec5; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C1), xvec1, xvec1; +LDH_DX 3*SIZE(C1), xvec1, xvec1; +LDL_DX 0*SIZE(C1), xvec2, xvec2; +LDH_DX 1*SIZE(C1), xvec2, xvec2; +LDL_DX 2*SIZE(C0), xvec3, xvec3; +LDH_DX 3*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec13, xvec13; +ADD_DX xvec3, xvec5, xvec5; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2582,12 +2559,12 @@ ADD2_DY yvec5, yvec15, yvec15; EXTRA_DY $1, yvec15, xvec7; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 0*SIZE(C1), xvec1; -LDH_DX 1*SIZE(C1), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 0*SIZE(C1), xvec1, xvec1; +LDH_DX 1*SIZE(C1), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2845,18 +2822,18 @@ EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C0), xvec3; -LDH_DX 7*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C0), xvec3, xvec3; +LDH_DX 7*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, 
xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3026,12 +3003,12 @@ ADD2_DY yvec5, yvec15, yvec15; EXTRA_DY $1, yvec15, xvec7; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3084,43 +3061,43 @@ ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec0; BROAD_DX 2*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 3*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 4*SIZE(ptrba), xvec0; BROAD_DX 4*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 5*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec0; BROAD_DX 6*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 7*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -3137,23 +3114,23 @@ ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec0; BROAD_DX 2*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 3*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3168,13 +3145,13 @@ ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -3182,14 +3159,14 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_DX xvec15, xvec7; +ADDSUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, 
xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -SUB_DX xvec15, xvec7; +SUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_DX $0x4e, xvec15, xvec15; -ADDSUB_DX xvec15, xvec7; +ADDSUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, xvec15; SHUF_DX $0x4e, xvec15, xvec15; #endif @@ -3199,14 +3176,14 @@ BROAD_DX MEMALPHA_R,xvec7; BROAD_DX MEMALPHA_I,xvec6; #### Multiply Alpha #### SHUF_DX $0x4e, xvec15, xvec5; -MUL_DX xvec7, xvec15; -MUL_DX xvec6, xvec5; -ADDSUB_DX xvec5, xvec15; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec6, xvec5, xvec5; +ADDSUB_DX xvec5, xvec15, xvec15; #### Writing back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -ADD_DX xvec0, xvec15; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +ADD_DX xvec0, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3237,6 +3214,9 @@ movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; + +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi From 13f5f181406df3de4553d2481206df1b19a99b4a Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 26 Jun 2012 07:43:06 +0800 Subject: [PATCH 16/16] Updated the doc for 0.2.0 version. --- Changelog.txt | 15 +++++++++++++++ README.md | 6 +++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 0ed35b0e4..c222c7eee 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,17 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.0 +26-Jun-2012 +common: + * Removed the limit on the number of CPU cores (previously 64). + Now, it supports up to 256 cores. + * Supported the clang compiler. + * Fixed some build bugs on FreeBSD. +x86/x86-64: + * Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 with AVX instructions. + Please use gcc >= 4.6 or clang >= 3.1. + * Supported AMD Bobcat by using the GotoBLAS2 AMD Barcelona codes. + ==================================================================== Version 0.1.1 29-Apr-2012 @@ -7,6 +20,8 @@ common: * Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia) * Fixed the build bug (MD5 and download) on Mac OSX. * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. + * Fixed the compatibility issue for compilers without C99 complex number support + (e.g. Visual Studio) x86/x86_64: * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. * Test alpha=Nan in dscale. diff --git a/README.md b/README.md index a13e069ec..82e9f528c 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenB Please read the documents on OpenBLAS wiki pages . -## Intallation +## Installation Download from project homepage. http://xianyi.github.com/OpenBLAS/ Or, check out codes from git://github.com/xianyi/OpenBLAS.git @@ -76,9 +76,9 @@ The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. -### Set the number of threads with calling functions. +### Set the number of threads at runtime. -Examples: +We provide the functions below to control the number of threads at runtime. So far, changing the number of threads at runtime is not supported on Windows; there, these functions are dummies. void goto_set_num_threads(int num_threads);
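
As a quick illustration of the runtime thread-control interface documented in the README hunk above, here is a minimal usage sketch in C. It assumes only the goto_set_num_threads prototype shown in the README excerpt, the cblas.h header shipped with OpenBLAS, and linking against -lopenblas; the matrix size and the cblas_dgemm call are illustrative, not taken from the patches.

    /* sketch.c -- illustrative only; build with something like: gcc sketch.c -lopenblas */
    #include <stdlib.h>
    #include <cblas.h>                     /* CBLAS interface shipped with OpenBLAS */

    /* Prototype as documented in the README excerpt above. */
    void goto_set_num_threads(int num_threads);

    int main(void)
    {
        const int n = 512;                 /* illustrative problem size */
        double *A = calloc((size_t)n * n, sizeof(double));
        double *B = calloc((size_t)n * n, sizeof(double));
        double *C = calloc((size_t)n * n, sizeof(double));
        if (!A || !B || !C) return 1;

        goto_set_num_threads(4);           /* ask OpenBLAS for 4 threads at runtime */

        /* C = 1.0*A*B + 0.0*C, column-major, no transposition */
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    n, n, n, 1.0, A, n, B, n, 0.0, C, n);

        free(A); free(B); free(C);
        return 0;
    }

For builds with USE_OPENMP=1, the OMP_NUM_THREADS environment variable described earlier in the README remains the intended way to set the thread count.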