From f76f9525477785bb452699c07d1985ec14dc2b61 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 16:17:43 +0800 Subject: [PATCH 1/7] Refs #83 #53. Adding Intel Sandy Bridge (AVX supported) kernel codes for BLAS level 3 functions. --- kernel/generic/zgemm_ncopy_4_sandy.c | 235 ++ kernel/generic/zgemm_ncopy_8_sandy.c | 401 +++ kernel/generic/zgemm_tcopy_4_sandy.c | 237 ++ kernel/generic/zgemm_tcopy_8_sandy.c | 370 ++ kernel/x86_64/KERNEL.SANDYBRIDGE | 97 +- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 4478 ++++++++++++++++++++++++ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 3186 +++++++++++++++++ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 3736 ++++++++++++++++++++ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 3257 +++++++++++++++++ param.h | 100 +- 10 files changed, 15982 insertions(+), 115 deletions(-) create mode 100644 kernel/generic/zgemm_ncopy_4_sandy.c create mode 100644 kernel/generic/zgemm_ncopy_8_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_4_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_8_sandy.c create mode 100644 kernel/x86_64/cgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/dgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/sgemm_kernel_8x8_sandy.S create mode 100644 kernel/x86_64/zgemm_kernel_4x4_sandy.S diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c new file mode 100644 index 000000000..839bd5939 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -0,0 +1,235 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + **********************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2,*dest4; + ii = col&-8; + ii = ii*(2*row); + dest4 = dest+ii; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j Date: Tue, 19 Jun 2012 17:29:06 +0800 Subject: [PATCH 2/7] Fixed dynamic_arch building bug. --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index db9ec6a3b..27aeeb6ac 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define SNUMOPT 8 #define DNUMOPT 4 From 3ef96aa567e27ab76f07701b37da1ca0c0c59f39 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 20:37:22 +0800 Subject: [PATCH 3/7] Fixed bug in MOVQ redefine and ALIGN SIZE problem. --- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 137 +++++++++--------- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 163 +++++++++++----------- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 185 +++++++++++++------------ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 118 ++++++++-------- 4 files changed, 304 insertions(+), 299 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 2b4e4dc64..56ebee120 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -305,7 +306,7 @@ movq %r11, kk; MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -320,7 +321,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -367,7 +368,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; # Computing kernel @@ -591,7 +592,7 @@ ADD2_SY yvec7, yvec8, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -599,7 +600,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_loopB: ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) @@ -717,7 +718,7 @@ 
TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_loopB:; ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) @@ -875,7 +876,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 32 +ALIGN_5 EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; EXTRA_SY $1,yvec13,xvec5; @@ -934,7 +935,7 @@ ADDQ $16*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32 +ALIGN_5 .L4_loopEx: EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL @@ -1077,11 +1078,11 @@ ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32; +ALIGN_5; .L1_loopE:; TEST $4, bm; JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1113,7 +1114,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1242,7 +1243,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L8_bodyB; -.align 32 +ALIGN_5 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1250,7 +1251,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1323,7 +1324,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L10_loopE; -.align 32 +ALIGN_5 .L10_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1494,7 +1495,7 @@ ADDQ $8*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; -.align 32 +ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1527,7 +1528,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1652,7 +1653,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; -.align 32 
+ALIGN_5 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1660,7 +1661,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1731,7 +1732,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L13_loopE; -.align 32 +ALIGN_5 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1875,7 +1876,7 @@ ADDQ $4*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1905,7 +1906,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 32 +ALIGN_5 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1978,7 +1979,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 32 +ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1986,7 +1987,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L15_loopE; -.align 32 +ALIGN_5 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2031,7 +2032,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L16_loopE; -.align 32 +ALIGN_5 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2129,11 +2130,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32 +ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2145,7 +2146,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; -.align 32 +ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2181,7 +2182,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32 +ALIGN_5 .L211_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; 
ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2430,7 +2431,7 @@ ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2438,7 +2439,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L212_loopE; -.align 32 +ALIGN_5 .L212_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2571,7 +2572,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2825,11 +2826,11 @@ ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 32 +ALIGN_5 .L21_loopE: TEST $4, bm; JLE .L22_loopE; -.align 32 +ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2862,7 +2863,7 @@ MOVQ %rax, kkk; SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3002,7 +3003,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3010,7 +3011,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3089,7 +3090,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3237,7 +3238,7 @@ ADDQ $8*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; -.align 32 +ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3267,7 +3268,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3351,7 +3352,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; 
JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3359,7 +3360,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3409,7 +3410,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3503,7 +3504,7 @@ ADDQ $4*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; -.align 32 +ALIGN_5 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3532,7 +3533,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 32 +ALIGN_5 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3585,7 +3586,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L242_loopE; -.align 32 +ALIGN_5 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3616,7 +3617,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L243_loopE; -.align 32 +ALIGN_5 .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3684,7 +3685,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -3695,7 +3696,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: MOVQ bb, ptrbb; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) @@ -3727,7 +3728,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3800,7 +3801,7 @@ ADDQ $64*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3808,7 +3809,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; 
-.align 32 +ALIGN_5 .L312_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3853,7 +3854,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3941,11 +3942,11 @@ ADDQ $8, kk; ADDQ $16*SIZE, C0; DECQ i; JG .L31_bodyB; -.align 32 +ALIGN_5 .L31_loopE: TEST $4, bm; JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3974,7 +3975,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4023,7 +4024,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4031,7 +4032,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4064,7 +4065,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4128,7 +4129,7 @@ ADDQ $8*SIZE, C0; .L32_loopE: TEST $2, bm; JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -4157,7 +4158,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4202,7 +4203,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4210,7 +4211,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4241,7 +4242,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 
.L333_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4300,7 +4301,7 @@ ADDQ $4*SIZE, C0; .L33_loopE: TEST $1, bm; JLE .L34_loopE; -.align 32 +ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -4329,7 +4330,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 32 +ALIGN_5 .L341_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -4354,7 +4355,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L341_bodyB; -.align 32 +ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4362,7 +4363,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L342_loopE; -.align 32 +ALIGN_5 .L342_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -4383,7 +4384,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L343_loopE; -.align 32 +ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index fea5ecb4a..c98879d7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -140,6 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JNE jne #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -265,7 +266,7 @@ movq %r11, kk MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -281,7 +282,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -328,7 +329,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; # Computing kernel @@ -448,7 +449,7 @@ ADD_DY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; PREFETCH2 0*SIZE(prebb); ADDQ $8*SIZE, prebb; @@ -459,7 +460,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_bodyB: #### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) @@ -529,7 +530,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_bodyB:; #### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) @@ -588,7 +589,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; # Unalign part write back -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; @@ -648,7 +649,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32; +ALIGN_5; .L4_loopEx:; EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL @@ -776,11 +777,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32 +ALIGN_5 .L1_loopE:; TEST $4, bm; # Rm = 4 JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -816,7 +817,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L6_loopE; -.align 32; +ALIGN_5; .L6_bodyB:; # Computing kernel @@ -887,7 +888,7 @@ MUL_DY yvec1, yvec5, yvec7; 
ADD_DY yvec9, yvec7, yvec9; DECQ k; JG .L6_bodyB; -.align 32 +ALIGN_5 .L6_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -896,7 +897,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB:; #### Untoll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; @@ -940,7 +941,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB:; #### Untoll time 1 #### MUL_DY yvec0, yvec2, yvec6; @@ -977,7 +978,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L8_loopEx; # Unalign part write back -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; @@ -1014,7 +1015,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 32 +ALIGN_5 .L8_loopEx:; EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; @@ -1080,7 +1081,7 @@ ADDQ $4*SIZE, C1; .L5_loopE:; TEST $2, bm; JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1117,7 +1118,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; -.align 32; +ALIGN_5; .L10_bodyB:; # Computing kernel @@ -1192,7 +1193,7 @@ MUL_DX xvec1, xvec5; ADD_DX xvec5, xvec9; DECQ k; JG .L10_bodyB; -.align 32 +ALIGN_5 .L10_loopE:; #ifndef TRMMKERNEL TEST $2, bk @@ -1201,7 +1202,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB:; ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; @@ -1248,7 +1249,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB:; SHUF_DX $0x4e, xvec3, xvec5; MUL_DX xvec0, xvec2; @@ -1285,7 +1286,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L12_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec13; @@ -1310,7 +1311,7 @@ ADDQ $2, kk ADDQ $2*SIZE, C0 ADDQ $2*SIZE, C1 JMP .L9_loopE; -.align 32 +ALIGN_5 .L12_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec14; 
@@ -1349,7 +1350,7 @@ ADDQ $2*SIZE, C1; .L9_loopE:; TEST $1, bm JLE .L13_loopE; -.align 32 +ALIGN_5 .L13_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1379,7 +1380,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 32 +ALIGN_5 .L14_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1404,7 +1405,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 32 +ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1413,7 +1414,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L15_loopE; -.align 32 +ALIGN_5 .L15_bodyB: BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1434,7 +1435,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L16_loopE; -.align 32 +ALIGN_5 .L16_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1485,11 +1486,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32; +ALIGN_5; .L20_loopB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1501,7 +1502,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; # Rm = 8 JLE .L21_loopE; -.align 32; +ALIGN_5; .L21_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1538,7 +1539,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32; +ALIGN_5; .L211_bodyB: # Computing kernel #### Unroll time 1 #### @@ -1692,7 +1693,7 @@ MUL_DX xvec3, xvec7; ADD_DX xvec7, xvec8; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1701,7 +1702,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L212_loopE; -.align 32; +ALIGN_5; .L212_bodyB: # Computing kernel #### Unroll time 1 #### @@ -1788,7 +1789,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: 
#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -1858,7 +1859,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -1893,7 +1894,7 @@ ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; -.align 32 +ALIGN_5 .L213_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -1956,7 +1957,7 @@ JG .L21_bodyB; .L21_loopE:; TEST $4, bm; # Rm = 4 JLE .L22_loopE; -.align 32; +ALIGN_5; .L22_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1989,7 +1990,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB:; # Computing kernel #### Unroll time 1 #### @@ -2071,7 +2072,7 @@ MUL_DX xvec1, xvec5; ADD_DX xvec5, xvec10; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -2080,7 +2081,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2129,7 +2130,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2171,7 +2172,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L223_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -2196,7 +2197,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L22_loopE; -.align 32 +ALIGN_5 .L223_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2237,7 +2238,7 @@ ADDQ $4*SIZE, C1; .L22_loopE:; TEST $2, bm; # Rm = 2 JLE .L23_loopE; -.align 32; +ALIGN_5; .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2267,7 +2268,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 
+ALIGN_5 .L231_bodyB: # Computing kernel #### Unroll time 1 #### @@ -2309,7 +2310,7 @@ ADD_DX xvec5, xvec11; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2318,7 +2319,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2347,7 +2348,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2373,7 +2374,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L233_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -2394,7 +2395,7 @@ ADDQ $2, kk; ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; JMP .L23_loopE; -.align 32 +ALIGN_5 .L233_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2425,7 +2426,7 @@ ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; # Rm = 1 JLE .L24_loopE; -.align 32; +ALIGN_5; .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2454,7 +2455,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 32 +ALIGN_5 .L241_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2479,7 +2480,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align 32 +ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2488,7 +2489,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L242_loopE; -.align 32 +ALIGN_5 .L242_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2509,7 +2510,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L243_loopE; -.align 32 +ALIGN_5 .L243_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2551,7 +2552,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE:; TEST $1, bn; # Rn = 1 JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, 
%rax; @@ -2562,7 +2563,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2593,7 +2594,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2634,7 +2635,7 @@ ADD_DY yvec4, yvec14, yvec14; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2643,7 +2644,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2673,7 +2674,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2696,7 +2697,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L313_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; @@ -2724,7 +2725,7 @@ ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; JMP .L31_loopE; -.align 32 +ALIGN_5 .L313_loopEx: EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; @@ -2766,7 +2767,7 @@ JG .L31_bodyB; .L31_loopE: TEST $4, bm JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2796,7 +2797,7 @@ MOVQ %rax, kkk #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2821,7 +2822,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2830,7 +2831,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 
0*SIZE(ptrbb), yvec1; @@ -2852,7 +2853,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2870,7 +2871,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L323_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL @@ -2891,7 +2892,7 @@ ADDQ $4, kk #endif ADDQ $4*SIZE, C0; JMP .L32_loopE; -.align 32 +ALIGN_5 .L323_loopEx: #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; @@ -2921,7 +2922,7 @@ ADDQ $4*SIZE, C0; .L32_loopE: TEST $2, bm JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -2951,7 +2952,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -2976,7 +2977,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2,bk; @@ -2985,7 +2986,7 @@ MOVQ kkk, %rax; TEST $2, %rax #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3006,7 +3007,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3039,7 +3040,7 @@ ADDQ $2*SIZE, C0; .L33_loopE: TEST $1, bm JLE .L34_loopE; -.align 32 +ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3068,7 +3069,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 32 +ALIGN_5 .L341_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; @@ -3093,7 +3094,7 @@ addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L341_bodyB; -.align 32 
+ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3102,7 +3103,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L342_loopE; -.align 32 +ALIGN_5 .L342_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; @@ -3124,7 +3125,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L343_loopE; -.align 32 +ALIGN_5 .L343_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 44f8f1802..4d16a60d0 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -273,7 +274,7 @@ movq %r11, kk MOVQ bn,j; SARQ $3,j; JLE .L0_loopE; -.align 16; +ALIGN_4; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -289,7 +290,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; JLE .L1_loopE; -.align 16; +ALIGN_4; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -342,7 +343,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; -.align 16; +ALIGN_4; .L2_bodyB:; # Computing kernel @@ -472,7 +473,7 @@ ADD_SY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_4 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -480,7 +481,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_4 .L3_loobB: #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; @@ -550,7 +551,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_4 .L4_loopB:; #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; @@ -609,7 +610,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 16 +ALIGN_4 LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; @@ -669,7 +670,7 @@ ADDQ 
$8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 16; +ALIGN_4; .L4_loopEx: LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1, yvec15, xvec7; @@ -813,11 +814,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 16 +ALIGN_4 .L1_loopE:; TEST $4, bm; JLE .L5_loopE; -.align 16 +ALIGN_4 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -857,7 +858,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; -.align 16 +ALIGN_4 .L8_bodyB: #### Unroll time 1 #### @@ -983,7 +984,7 @@ MUL_SX xvec1, xvec5; ADD_SX xvec5, xvec8; DECQ k; JG .L8_bodyB; -.align 16 +ALIGN_4 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -991,7 +992,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L9_loopE; -.align 16 +ALIGN_4 .L9_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; @@ -1062,7 +1063,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L10_loopE; -.align 16 +ALIGN_4 .L10_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; @@ -1122,7 +1123,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L10_loopEx; -.align 16 +ALIGN_4 LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL ADD_SX 0*SIZE(C0), xvec15; @@ -1155,7 +1156,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 16 +ALIGN_4 .L10_loopEx: LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL @@ -1215,7 +1216,7 @@ ADDQ $4*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; -.align 16 +ALIGN_4 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1249,7 +1250,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; -.align 16 +ALIGN_4 .L11_bodyB: #### Computing kernel LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 @@ -1318,7 +1319,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; -.align 16 +ALIGN_4 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1326,7 +1327,7 @@ TEST $2, bk; TEST 
$2, kkk; #endif JLE .L12_loopE; -.align 16 +ALIGN_4 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 @@ -1368,7 +1369,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L13_loopE; -.align 16 +ALIGN_4 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 @@ -1433,7 +1434,7 @@ ADDQ $2*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; -.align 16 +ALIGN_4 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1465,7 +1466,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 16 +ALIGN_4 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1503,7 +1504,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 16 +ALIGN_4 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1511,7 +1512,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L15_loopE; -.align 16 +ALIGN_4 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1538,7 +1539,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L16_loopE; -.align 16 +ALIGN_4 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1611,11 +1612,11 @@ LEAQ (C,ldc,8),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 16; +ALIGN_4; .L0_loopE:; TEST $4, bn; # Rn = 4 JLE .L20_loopE; -.align 16; +ALIGN_4; .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1628,7 +1629,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; -.align 16 +ALIGN_4 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1668,7 +1669,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L211_loopE; -.align 16 +ALIGN_4 .L211_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; @@ -1800,7 +1801,7 @@ ADD_SX xvec7, xvec8; LD_SX 4*SIZE(ptrba), xvec1; DECQ 
k; JG .L211_bodyB; -.align 16 +ALIGN_4 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk @@ -1808,7 +1809,7 @@ TEST $2, bk TEST $2, kkk; #endif JLE .L212_loopE; -.align 16 +ALIGN_4 .L212_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; @@ -1882,7 +1883,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 16 +ALIGN_4 .L213_bodyB: ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; @@ -1982,11 +1983,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 16 +ALIGN_4 .L21_loopE: TEST $4, bm; JLE .L22_loopE; -.align 16 +ALIGN_4 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2019,7 +2020,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 16 +ALIGN_4 .L221_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2089,7 +2090,7 @@ ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; -.align 16 +ALIGN_4 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2097,7 +2098,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 16 +ALIGN_4 .L222_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2139,7 +2140,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 16 +ALIGN_4 .L223_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2203,7 +2204,7 @@ ADDQ $4*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; -.align 16 +ALIGN_4 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2234,7 +2235,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 16 +ALIGN_4 .L231_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2274,7 +2275,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 16 +ALIGN_4 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2282,7 +2283,7 @@ TEST $2, bk; TEST $2, kkk; 
#endif JLE .L232_loopE; -.align 16 +ALIGN_4 .L232_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2310,7 +2311,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 16 +ALIGN_4 .L233_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2356,7 +2357,7 @@ ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; -.align 16 +ALIGN_4 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2386,7 +2387,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 16 +ALIGN_4 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2419,7 +2420,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L242_loopE; -.align 16 +ALIGN_4 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2440,7 +2441,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L243_loopE; -.align 16; +ALIGN_4; .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2491,7 +2492,7 @@ LEAQ (C, ldc, 4), C; .L20_loopE: TEST $2, bn; JLE .L30_loopE; -.align 16 +ALIGN_4 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2503,7 +2504,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 16 +ALIGN_4 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2536,7 +2537,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 16 +ALIGN_4 .L311_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2612,7 +2613,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 16 +ALIGN_4 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2620,7 +2621,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 16 +ALIGN_4 .L312_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2666,7 +2667,7 @@ TEST $1, bk; 
TEST $1, kkk; #endif JLE .L313_loopE; -.align 16 +ALIGN_4 .L313_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2731,11 +2732,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L31_bodyB; -.align 16 +ALIGN_4 .L31_loopE: TEST $4, bm; JLE .L32_loopE; -.align 16 +ALIGN_4 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2766,7 +2767,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 16 +ALIGN_4 .L321_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2806,7 +2807,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 16 +ALIGN_4 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2814,7 +2815,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 16 +ALIGN_4 .L322_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2842,7 +2843,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 16 +ALIGN_4 .L323_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2887,7 +2888,7 @@ ADDQ $4*SIZE, C1; .L32_loopE: TEST $2, bm; JLE .L33_loopE; -.align 16 +ALIGN_4 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2920,7 +2921,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 16 +ALIGN_4 .L331_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 @@ -2943,7 +2944,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 16 +ALIGN_4 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2951,7 +2952,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 16 +ALIGN_4 .L332_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 @@ -2972,7 +2973,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 16 
+ALIGN_4 .L333_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3031,7 +3032,7 @@ ADDQ $2*SIZE, C1; .L33_loopE: TEST $1, bm; JLE .L34_loopE; -.align 16 +ALIGN_4 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3062,7 +3063,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 16 +ALIGN_4 .L341_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3104,7 +3105,7 @@ addq $4*SIZE, ptrba; addq $8*SIZE, ptrbb; decq k; jg .L341_bodyB; -.align 16 +ALIGN_4 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3112,7 +3113,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L342_loopE; -.align 16 +ALIGN_4 .L342_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3140,7 +3141,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L343_loopE; -.align 16 +ALIGN_4 .L343_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3189,7 +3190,7 @@ LEAQ (C, ldc, 2), C; .L30_loopE: TEST $1, bn; JLE .L40_loopE; -.align 16 +ALIGN_4 .L40_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; @@ -3200,7 +3201,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L41_loopE; -.align 16 +ALIGN_4 .L41_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3230,7 +3231,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L411_loopE; -.align 16 +ALIGN_4 .L411_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3256,7 +3257,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L411_bodyB; -.align 16 +ALIGN_4 .L411_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3264,7 +3265,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L412_loopE; -.align 16 +ALIGN_4 .L412_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3285,7 +3286,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L413_loopE; -.align 16 +ALIGN_4 
.L413_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3329,11 +3330,11 @@ ADDQ $8, kk; ADDQ $8*SIZE, C0; DECQ i; JG .L41_bodyB; -.align 16 +ALIGN_4 .L41_loopE: TEST $4, bm; JLE .L42_loopE; -.align 16 +ALIGN_4 .L42_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3362,7 +3363,7 @@ MOVQ %rax, kkk #endif SARQ $2, k; JLE .L421_loopE; -.align 16 +ALIGN_4 .L421_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3387,7 +3388,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L421_bodyB; -.align 16 +ALIGN_4 .L421_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3395,7 +3396,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L422_loopE; -.align 16 +ALIGN_4 .L422_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3416,7 +3417,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L423_loopE; -.align 16 +ALIGN_4 .L423_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3451,7 +3452,7 @@ ADDQ $4*SIZE, C0; .L42_loopE: TEST $2, bm; JLE .L43_loopE; -.align 16 +ALIGN_4 .L43_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3481,7 +3482,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L431_loopE; -.align 16 +ALIGN_4 .L431_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3518,7 +3519,7 @@ addq $8*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L431_bodyB; -.align 16 +ALIGN_4 .L431_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3526,7 +3527,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L432_loopE; -.align 16 +ALIGN_4 .L432_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3553,7 +3554,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L433_loopE; -.align 16 +ALIGN_4 .L433_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3592,7 +3593,7 @@ addq $2*SIZE, 
C0; .L43_loopE: TEST $1, bm; JLE .L44_loopE; -.align 16 +ALIGN_4 .L44_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3621,7 +3622,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L441_loopE; -.align 16 +ALIGN_4 .L441_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3646,7 +3647,7 @@ addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L441_bodyB; -.align 16 +ALIGN_4 .L441_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3654,7 +3655,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L442_loopE; -.align 16 +ALIGN_4 .L442_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3675,7 +3676,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L443_loopE; -.align 16 +ALIGN_4 .L443_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index 34abbb529..f6f9f707f 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -145,6 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ +#define MOVQ movq #define XOR_SY vxorps #define XOR_DY vxorpd @@ -297,7 +299,7 @@ movq %r11, kk; MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -312,7 +314,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $2,i; # Rm = 4 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -361,7 +363,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; #### Computing kernel #### @@ -584,7 +586,7 @@ ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -592,7 +594,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_bodyB: #### Unroll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; @@ -710,7 +712,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_loopB:; #### Unroll time 1 #### PREFETCH0 PRESIZE*SIZE(ptrba); @@ -852,7 +854,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 32 +ALIGN_5 #### Store Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; @@ -912,7 +914,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32 +ALIGN_5 .L4_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1024,11 +1026,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32; +ALIGN_5; .L1_loopE:; TEST $2, bm; JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1060,7 +1062,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB: #### Compute kernel #### #### Unroll times 1 
#### @@ -1194,7 +1196,7 @@ ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L7_bodyB; -.align 32 +ALIGN_5 .L7_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1202,7 +1204,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -1276,7 +1278,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -1364,7 +1366,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L9_loopEx; -.align 32 +ALIGN_5 #### Writing back #### EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1401,7 +1403,7 @@ ADDQ $2, kk; ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 32 +ALIGN_5 .L9_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1466,7 +1468,7 @@ ADDQ $4*SIZE, C1; .L5_loopE: TEST $1, bm; JLE .L6_loopE; -.align 32 +ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1496,7 +1498,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; -.align 32 +ALIGN_5 .L10_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1570,7 +1572,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L10_bodyB; -.align 32 +ALIGN_5 .L10_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1578,7 +1580,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1624,7 +1626,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1722,11 +1724,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32 +ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && 
defined(LEFT) MOVQ OFFSET, %rax; @@ -1738,7 +1740,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $2, i; JLE .L21_loopE; -.align 32 +ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1770,7 +1772,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32 +ALIGN_5 .L211_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1891,7 +1893,7 @@ ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrba; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1899,7 +1901,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L212_loopE; -.align 32 +ALIGN_5 .L212_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1969,7 +1971,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2058,7 +2060,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; -.align 32 +ALIGN_5 #### Writing back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0),xvec15; @@ -2093,7 +2095,7 @@ ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; -.align 32 +ALIGN_5 .L213_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2153,11 +2155,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 32 +ALIGN_5 .L21_loopE: TEST $2, bm; JLE .L22_loopE; -.align 32 +ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2187,7 +2189,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2268,7 +2270,7 @@ ADD2_DY yvec6, yvec13, yvec13; ADDQ $16*SIZE, ptrba; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2276,7 +2278,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; 
-.align 32 +ALIGN_5 .L222_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2325,7 +2327,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2419,7 +2421,7 @@ ADDQ $4*SIZE, C1; .L22_loopE: TEST $1, bm; JLE .L23_loopE; -.align 32 +ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2448,7 +2450,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2498,7 +2500,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2506,7 +2508,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2540,7 +2542,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2614,7 +2616,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2625,7 +2627,7 @@ MOVQ C, C0; MOVQ bm, i; SARQ $2, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2655,7 +2657,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2732,7 +2734,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2740,7 +2742,7 @@ 
TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2787,7 +2789,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2877,11 +2879,11 @@ ADDQ $4, kk; ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; -.align 32 +ALIGN_5 .L31_loopE: TEST $2, bm; JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2910,7 +2912,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2951,7 +2953,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2959,7 +2961,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2988,7 +2990,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -3049,7 +3051,7 @@ ADDQ $4*SIZE, C0; .L32_loopE: TEST $1, bm; JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3078,7 +3080,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3123,7 +3125,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3131,7 +3133,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; 
BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3162,7 +3164,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; From 6cfcb54a2810b4607f9b9353e275345c2d64f27f Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 20 Jun 2012 07:38:39 +0800 Subject: [PATCH 4/7] Fixed align problem in S and C precision GEMM kernels. --- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 2 +- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 56ebee120..5987b8e61 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -3578,7 +3578,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align +ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 4d16a60d0..23eda3af8 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -2412,7 +2412,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align +ALIGN_4 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; From 88c272f6a739039460afbca3e47b55cd3555f585 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 20 Jun 2012 09:20:20 +0800 Subject: [PATCH 5/7] Refs #83. Added the missing ALIGN_5 macro on Mac OSX. However, the SEGFAULT bug still exists. 
--- common_x86_64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_x86_64.h b/common_x86_64.h index 7b6d11f7d..19b0ac53c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -425,6 +425,7 @@ REALNAME: #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 +#define ALIGN_5 .align 5 #define ffreep fstp #endif From d34fce56e4a980fefe4ddafe5d371798ad948b59 Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 20 Jun 2012 19:53:18 +0800 Subject: [PATCH 6/7] Refs #83 Fixed S/DGEMM calling conventions bug on windows. --- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 67 ++++++++++++++------------ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 1 + 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index c98879d7c..603552464 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -162,7 +162,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ST_SX movaps #define ST_DX movapd #define STL_DX movlpd +#define STL_DY vmovlpd #define STH_DX movhpd +#define STH_DY vmovhpd #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup @@ -242,6 +244,7 @@ movq %r15, 40(%rsp); #ifdef TRMMKERNEL movq old_offset, %r11 #endif + movaps %xmm3, %xmm0 #else movq old_ldc, ldc @@ -660,10 +663,10 @@ LDL_DY 2*SIZE(C1), xvec5, xvec5; LDH_DY 3*SIZE(C1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec15, 0*SIZE(C0); -STH_DX xvec15, 1*SIZE(C0); -STL_DX xvec7, 2*SIZE(C1); -STH_DX xvec7, 3*SIZE(C1); +STL_DY xvec15, 0*SIZE(C0); +STH_DY xvec15, 1*SIZE(C0); +STL_DY xvec7, 2*SIZE(C1); +STH_DY xvec7, 3*SIZE(C1); EXTRA_DY $1, yvec14, xvec4; #ifndef TRMMKERNEL @@ -674,10 +677,10 @@ LDL_DY 6*SIZE(C1), xvec2, xvec2; LDH_DY 7*SIZE(C1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec14, 4*SIZE(C0); -STH_DX xvec14, 5*SIZE(C0); -STL_DX xvec4, 6*SIZE(C1); -STH_DX xvec4, 7*SIZE(C1); +STL_DY xvec14, 4*SIZE(C0); +STH_DY xvec14, 5*SIZE(C0); +STL_DY xvec4, 6*SIZE(C1); +STH_DY xvec4, 7*SIZE(C1); EXTRA_DY $1, yvec13, xvec7; #ifndef TRMMKERNEL @@ -688,10 +691,10 @@ LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec13, 0*SIZE(C0, ldc, 1); -STH_DX xvec13, 1*SIZE(C0, ldc, 1); -STL_DX xvec7, 2*SIZE(C1, ldc, 1); -STH_DX xvec7, 3*SIZE(C1, ldc, 1); +STL_DY xvec13, 0*SIZE(C0, ldc, 1); +STH_DY xvec13, 1*SIZE(C0, ldc, 1); +STL_DY xvec7, 2*SIZE(C1, ldc, 1); +STH_DY xvec7, 3*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL @@ -702,10 +705,10 @@ LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec12, 4*SIZE(C0, ldc, 1); -STH_DX xvec12, 5*SIZE(C0, ldc ,1); -STL_DX xvec4, 6*SIZE(C1, ldc, 1); -STH_DX xvec4, 7*SIZE(C1, ldc, 1); +STL_DY xvec12, 4*SIZE(C0, ldc, 1); +STH_DY xvec12, 5*SIZE(C0, ldc ,1); +STL_DY xvec4, 6*SIZE(C1, ldc, 1); +STH_DY xvec4, 7*SIZE(C1, ldc, 1); 
EXTRA_DY $1, yvec11, xvec7; #ifndef TRMMKERNEL @@ -716,10 +719,10 @@ LDL_DY 2*SIZE(C0), xvec5, xvec5; LDH_DY 3*SIZE(C0), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec11, 0*SIZE(C1); -STH_DX xvec11, 1*SIZE(C1); -STL_DX xvec7, 2*SIZE(C0); -STH_DX xvec7, 3*SIZE(C0); +STL_DY xvec11, 0*SIZE(C1); +STH_DY xvec11, 1*SIZE(C1); +STL_DY xvec7, 2*SIZE(C0); +STH_DY xvec7, 3*SIZE(C0); EXTRA_DY $1, yvec10, xvec4; #ifndef TRMMKERNEL @@ -730,10 +733,10 @@ LDL_DY 6*SIZE(C0), xvec2, xvec2; LDH_DY 7*SIZE(C0), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec10, 4*SIZE(C1); -STH_DX xvec10, 5*SIZE(C1); -STL_DX xvec4, 6*SIZE(C0); -STH_DX xvec4, 7*SIZE(C0); +STL_DY xvec10, 4*SIZE(C1); +STH_DY xvec10, 5*SIZE(C1); +STL_DY xvec4, 6*SIZE(C0); +STH_DY xvec4, 7*SIZE(C0); EXTRA_DY $1, yvec9, xvec7; #ifndef TRMMKERNEL @@ -744,10 +747,10 @@ LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec9, 0*SIZE(C1, ldc, 1); -STH_DX xvec9, 1*SIZE(C1, ldc, 1); -STL_DX xvec7, 2*SIZE(C0, ldc, 1); -STH_DX xvec7, 3*SIZE(C0, ldc, 1); +STL_DY xvec9, 0*SIZE(C1, ldc, 1); +STH_DY xvec9, 1*SIZE(C1, ldc, 1); +STL_DY xvec7, 2*SIZE(C0, ldc, 1); +STH_DY xvec7, 3*SIZE(C0, ldc, 1); EXTRA_DY $1, yvec8, xvec4; #ifndef TRMMKERNEL @@ -758,10 +761,10 @@ LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec8, 4*SIZE(C1, ldc, 1); -STH_DX xvec8, 5*SIZE(C1, ldc, 1); -STL_DX xvec4, 6*SIZE(C0, ldc, 1); -STH_DX xvec4, 7*SIZE(C0, ldc, 1); +STL_DY xvec8, 4*SIZE(C1, ldc, 1); +STH_DY xvec8, 5*SIZE(C1, ldc, 1); +STL_DY xvec4, 6*SIZE(C0, ldc, 1); +STH_DY xvec4, 7*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 23eda3af8..59458effe 100644 --- 
a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -251,6 +251,7 @@ movq %r15, 40(%rsp); #ifdef TRMMKERNEL movq old_offset, %r11 #endif + movaps %xmm3, %xmm0 #else movq old_ldc, ldc From fda5e0da8a0a43234ef1f70e719f4a5dd60fad0d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 21 Jun 2012 08:25:52 +0800 Subject: [PATCH 7/7] Refs #83. Clang 3.1 works fine on Sandy Bridge Mac OSX. Edit the document. --- README | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README b/README index 6372e96bd..b3f1baa79 100644 --- a/README +++ b/README @@ -34,8 +34,10 @@ Please read GotoBLAS_01Readme.txt Additional support CPU: x86_64: Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. + Intel Sandy Bridge MIPS64: - ICT Loongson 3A //Level 3 BLAS subroutines are optimized. + ICT Loongson 3A + ICT Loongson 3B (Experimental) 4.Usages Link with libopenblas.a or -lopenblas for shared library. @@ -70,10 +72,10 @@ OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas 8.ChangeLog Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. -9.Known Issues -* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit - is 64. On 32 bits, it is 32. -* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS. +9.Troubleshooting +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/Cores should less than or equal to 256. +* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. 10. 
Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).