From f1db3862111dfe0f6cd757f74803008dd535a90d Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Wed, 13 Nov 2013 17:59:11 +0100
Subject: [PATCH] changes for compatibility with Pathscale compiler

---
 common_x86.h                              |  20 +++-
 common_x86_64.h                           |  16 ++-
 kernel/x86_64/cgemm_kernel_8x2_haswell.S  | 138 +++++++++-------
 kernel/x86_64/sgemm_kernel_16x4_haswell.S | 116 +++++++++---------
 kernel/x86_64/zgemm_kernel_4x2_haswell.S  | 126 ++++++++------------
 5 files changed, 178 insertions(+), 238 deletions(-)

diff --git a/common_x86.h b/common_x86.h
index 49e6be29e..8245f7078 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -171,11 +171,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
 #define MMXSTORE movd
 #endif
 
-#if defined(SANDYBRIDGE) || defined(HASWELL)
-//Enable some optimazation for nehalem.
-#define NEHALEM_OPTIMIZATION
-#endif
-
 #if defined(PILEDRIVER) || defined(BULLDOZER)
 //Enable some optimazation for barcelona.
 #define BARCELONA_OPTIMIZATION
@@ -306,12 +301,25 @@ REALNAME:
 #define PROFCODE
 #endif
 
+
+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+
 #define EPILOGUE \
-    .size REALNAME, .-REALNAME; \
+    .size REALNAME, .-REALNAME; \
+    .section .note.GNU-stack,"",@progbits
+
+#else
+
+#define EPILOGUE \
+    .size REALNAME, .-REALNAME; \
     .section .note.GNU-stack,"",%progbits
 
 #endif
 
+
+
+#endif
+
 #ifdef XDOUBLE
 #define FLD fldt
 #define FST fstpt

diff --git a/common_x86_64.h b/common_x86_64.h
index 8e9d79443..4fe23448f 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -218,12 +218,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
 #ifdef ASSEMBLER
 
-#if defined(SANDYBRIDGE) || defined(HASWELL)
-//Enable some optimazation for nehalem.
-#define NEHALEM_OPTIMIZATION
-#endif
-
-
 #if defined(PILEDRIVER) || defined(BULLDOZER)
 //Enable some optimazation for barcelona.
 #define BARCELONA_OPTIMIZATION
@@ -378,10 +372,20 @@ REALNAME:
 #define PROFCODE
 #endif
 
+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+
+#define EPILOGUE \
+    .size REALNAME, .-REALNAME; \
+    .section .note.GNU-stack,"",@progbits
+
+#else
+
 #define EPILOGUE \
     .size REALNAME, .-REALNAME; \
     .section .note.GNU-stack,"",%progbits
 
+#endif
+
 #endif

diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S
index 9729e6d70..e4aba23e4 100644
--- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S
+++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 
 /*********************************************************************
-* 2013/10/28 Saar
+* 2013/11/13 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -138,43 +138,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
 
-.macro VFMADDPS_R y0,y1,y2
-    vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
 
-.macro VFMADDPS_I y0,y1,y2
-    vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
 
 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
 
-.macro VFMADDPS_R y0,y1,y2
-    vfnmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
 
-.macro VFMADDPS_I y0,y1,y2
-    vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
 
 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
 
-.macro VFMADDPS_R y0,y1,y2
-    vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
 
-.macro VFMADDPS_I y0,y1,y2
-    vfnmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
 
 #else
 
-.macro VFMADDPS_R y0,y1,y2
-    vfnmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
 
-.macro VFMADDPS_I y0,y1,y2
-    vfnmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
 
 #endif
 
@@ -182,43 +166,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
 
-.macro VFMADDPS_R y0,y1,y2
-    vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
 
-.macro VFMADDPS_I y0,y1,y2
-    vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
 
 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
 
-.macro VFMADDPS_R y0,y1,y2
-    vfnmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
 
-.macro VFMADDPS_I y0,y1,y2
-    vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
 
 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
 
-.macro VFMADDPS_R y0,y1,y2
-    vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
 
-.macro VFMADDPS_I y0,y1,y2
-    vfnmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
 
 #else
 
-.macro VFMADDPS_R y0,y1,y2
-    vfnmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
 
-.macro VFMADDPS_I y0,y1,y2
-    vfnmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
 
 #endif
 
@@ -234,18 +202,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
     vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
-    VFMADDPS_R %ymm8,%ymm4,%ymm0
+    VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
     vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
-    VFMADDPS_R %ymm12,%ymm4,%ymm1
+    VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
     vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
-    VFMADDPS_I %ymm9,%ymm5,%ymm0
-    VFMADDPS_I %ymm13,%ymm5,%ymm1
+    VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
+    VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
     vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
-    VFMADDPS_R %ymm10,%ymm6,%ymm0
-    VFMADDPS_R %ymm14,%ymm6,%ymm1
+    VFMADDPS_R( %ymm10,%ymm6,%ymm0 )
+    VFMADDPS_R( %ymm14,%ymm6,%ymm1 )
     vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
-    VFMADDPS_I %ymm11,%ymm7,%ymm0
-    VFMADDPS_I %ymm15,%ymm7,%ymm1
+    VFMADDPS_I( %ymm11,%ymm7,%ymm0 )
+    VFMADDPS_I( %ymm15,%ymm7,%ymm1 )
     addq $4 , BI
     addq $16, %rax
 .endm
@@ -338,18 +306,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x2_SUB
     vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
-    VFMADDPS_R %xmm8,%xmm4,%xmm0
+    VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
     vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
-    VFMADDPS_R %xmm12,%xmm4,%xmm1
+    VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
     vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPS_I %xmm9,%xmm5,%xmm0
-    VFMADDPS_I %xmm13,%xmm5,%xmm1
+    VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
+    VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
     vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
-    VFMADDPS_R %xmm10,%xmm6,%xmm0
-    VFMADDPS_R %xmm14,%xmm6,%xmm1
+    VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
+    VFMADDPS_R( %xmm14,%xmm6,%xmm1 )
     vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
-    VFMADDPS_I %xmm11,%xmm7,%xmm0
-    VFMADDPS_I %xmm15,%xmm7,%xmm1
+    VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
+    VFMADDPS_I( %xmm15,%xmm7,%xmm1 )
     addq $4, BI
     addq $8, %rax
 .endm
@@ -437,13 +405,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x2_SUB
     vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
-    VFMADDPS_R %xmm8,%xmm4,%xmm0
+    VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
     vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPS_I %xmm9,%xmm5,%xmm0
+    VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
     vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
-    VFMADDPS_R %xmm10,%xmm6,%xmm0
+    VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
     vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
-    VFMADDPS_I %xmm11,%xmm7,%xmm0
+    VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
     addq $4, BI
     addq $4, %rax
 .endm
@@ -509,13 +477,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x2_SUB
     vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
-    VFMADDPS_R %xmm8,%xmm4,%xmm0
+    VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
     vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPS_I %xmm9,%xmm5,%xmm0
+    VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
     vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
-    VFMADDPS_R %xmm10,%xmm6,%xmm0
+    VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
     vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
-    VFMADDPS_I %xmm11,%xmm7,%xmm0
+    VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
     addq $4, BI
     addq $2, %rax
 .endm
@@ -583,11 +551,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
     vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
-    VFMADDPS_R %ymm8,%ymm4,%ymm0
-    VFMADDPS_R %ymm12,%ymm4,%ymm1
+    VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
+    VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
-    VFMADDPS_I %ymm9,%ymm5,%ymm0
-    VFMADDPS_I %ymm13,%ymm5,%ymm1
+    VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
+    VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
     addq $2 , BI
     addq $16, %rax
 .endm
@@ -654,12 +622,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x1_SUB
     vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
-    VFMADDPS_R %xmm8,%xmm4,%xmm0
+    VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
     vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
-    VFMADDPS_R %xmm12,%xmm4,%xmm1
+    VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPS_I %xmm9,%xmm5,%xmm0
-    VFMADDPS_I %xmm13,%xmm5,%xmm1
+    VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
+    VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
     addq $2, BI
     addq $8, %rax
 .endm
@@ -723,9 +691,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x1_SUB
     vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
-    VFMADDPS_R %xmm8,%xmm4,%xmm0
+    VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
    vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPS_I %xmm9,%xmm5,%xmm0
+    VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
     addq $2, BI
     addq $4, %rax
 .endm
@@ -778,9 +746,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x1_SUB
     vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
-    VFMADDPS_R %xmm8,%xmm4,%xmm0
+    VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPS_I %xmm9,%xmm5,%xmm0
+    VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
     addq $2, BI
     addq $2, %rax
 .endm

diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S
index 78adbafbb..2f1434ffa 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S
+++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 
 /*********************************************************************
-* 2013/10/28 Saar
+* 2013/11/13 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -131,23 +131,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(BULLDOZER)
 
-.macro VFMADD231PS_ y0,y1,y2
-    vfmaddps \y0,\y1,\y2,\y0
-.endm
+#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
 
-.macro VFMADD231SS_ x0,x1,x2
-    vfmaddss \x0,\x1,\x2,\x0
-.endm
+#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0
 
 #else
 
-.macro VFMADD231PS_ y0,y1,y2
-    vfmadd231ps \y1,\y2,\y0
-.endm
+#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0
 
-.macro VFMADD231SS_ x0,x1,x2
-    vfmadd231ss \x1,\x2,\x0
-.endm
+#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0
 
 #endif
 
@@ -164,16 +156,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
-    VFMADD231PS_ %ymm4,%ymm2,%ymm0
-    VFMADD231PS_ %ymm5,%ymm2,%ymm1
-    VFMADD231PS_ %ymm6,%ymm3,%ymm0
-    VFMADD231PS_ %ymm7,%ymm3,%ymm1
+    VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+    VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
+    VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+    VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
     vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
     vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
-    VFMADD231PS_ %ymm8,%ymm2,%ymm0
-    VFMADD231PS_ %ymm9,%ymm2,%ymm1
-    VFMADD231PS_ %ymm10,%ymm3,%ymm0
-    VFMADD231PS_ %ymm11,%ymm3,%ymm1
+    VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
+    VFMADD231PS_( %ymm9,%ymm2,%ymm1 )
+    VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
+    VFMADD231PS_( %ymm11,%ymm3,%ymm1 )
     addq $4 , BI
     addq $16, %rax
 .endm
@@ -235,12 +227,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
-    VFMADD231PS_ %ymm4,%ymm2,%ymm0
-    VFMADD231PS_ %ymm6,%ymm3,%ymm0
+    VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+    VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
     vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
     vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
-    VFMADD231PS_ %ymm8,%ymm2,%ymm0
-    VFMADD231PS_ %ymm10,%ymm3,%ymm0
+    VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
+    VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
     addq $4 , BI
     addq $8 , %rax
 .endm
@@ -279,12 +271,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231PS_ %xmm4,%xmm2,%xmm0
-    VFMADD231PS_ %xmm6,%xmm3,%xmm0
+    VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+    VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
     vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2
     vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231PS_ %xmm8,%xmm2,%xmm0
-    VFMADD231PS_ %xmm10,%xmm3,%xmm0
+    VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
+    VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
     addq $4 , BI
     addq $4 , %rax
 .endm
@@ -323,16 +315,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
     vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
     vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231SS_ %xmm4,%xmm2,%xmm0
-    VFMADD231SS_ %xmm5,%xmm2,%xmm1
-    VFMADD231SS_ %xmm6,%xmm3,%xmm0
-    VFMADD231SS_ %xmm7,%xmm3,%xmm1
+    VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+    VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+    VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+    VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
     vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
     vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231SS_ %xmm8,%xmm2,%xmm0
-    VFMADD231SS_ %xmm9,%xmm2,%xmm1
-    VFMADD231SS_ %xmm10,%xmm3,%xmm0
-    VFMADD231SS_ %xmm11,%xmm3,%xmm1
+    VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
+    VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
+    VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
+    VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
     addq $4 , BI
     addq $2, %rax
 .endm
@@ -388,12 +380,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
     vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
     vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231SS_ %xmm4,%xmm2,%xmm0
-    VFMADD231SS_ %xmm6,%xmm3,%xmm0
+    VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+    VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
     vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
     vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231SS_ %xmm8,%xmm2,%xmm0
-    VFMADD231SS_ %xmm10,%xmm3,%xmm0
+    VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
+    VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
     addq $4 , BI
     addq $1, %rax
 .endm
@@ -436,10 +428,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
-    VFMADD231PS_ %ymm4,%ymm2,%ymm0
-    VFMADD231PS_ %ymm5,%ymm2,%ymm1
-    VFMADD231PS_ %ymm6,%ymm3,%ymm0
-    VFMADD231PS_ %ymm7,%ymm3,%ymm1
+    VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+    VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
+    VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+    VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
     addq $2 , BI
     addq $16, %rax
 .endm
@@ -480,8 +472,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
-    VFMADD231PS_ %ymm4,%ymm2,%ymm0
-    VFMADD231PS_ %ymm6,%ymm3,%ymm0
+    VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+    VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
     addq $2 , BI
     addq $8 , %rax
 .endm
@@ -513,8 +505,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
     vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231PS_ %xmm4,%xmm2,%xmm0
-    VFMADD231PS_ %xmm6,%xmm3,%xmm0
+    VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+    VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
     addq $2 , BI
     addq $4 , %rax
 .endm
@@ -546,10 +538,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
     vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
     vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231SS_ %xmm4,%xmm2,%xmm0
-    VFMADD231SS_ %xmm5,%xmm2,%xmm1
-    VFMADD231SS_ %xmm6,%xmm3,%xmm0
-    VFMADD231SS_ %xmm7,%xmm3,%xmm1
+    VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+    VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+    VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+    VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
     addq $2 , BI
     addq $2, %rax
 .endm
@@ -589,8 +581,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
     vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
     vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
-    VFMADD231SS_ %xmm4,%xmm2,%xmm0
-    VFMADD231SS_ %xmm6,%xmm3,%xmm0
+    VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+    VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
     addq $2 , BI
     addq $1, %rax
 .endm
@@ -625,8 +617,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
     vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
-    VFMADD231PS_ %ymm4,%ymm2,%ymm0
-    VFMADD231PS_ %ymm5,%ymm2,%ymm1
+    VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+    VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
     addq $1 , BI
     addq $16, %rax
 .endm
@@ -656,7 +648,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL8x1_SUB
     vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
-    VFMADD231PS_ %ymm4,%ymm2,%ymm0
+    VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
     addq $1 , BI
     addq $8 , %rax
 .endm
@@ -684,7 +676,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x1_SUB
     vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
     vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
-    VFMADD231PS_ %xmm4,%xmm2,%xmm0
+    VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
     addq $1 , BI
     addq $4 , %rax
 .endm
@@ -712,8 +704,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
     vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
     vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
-    VFMADD231SS_ %xmm4,%xmm2,%xmm0
-    VFMADD231SS_ %xmm5,%xmm2,%xmm1
+    VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+    VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
     addq $1 , BI
     addq $2, %rax
 .endm
@@ -743,7 +735,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x1_SUB
     vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
     vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
-    VFMADD231SS_ %xmm4,%xmm2,%xmm0
+    VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
     addq $1 , BI
     addq $1, %rax
 .endm

diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S
index 949f90bea..1e6278466 100644
--- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S
+++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 
 /********************************************************************************
-* 2013/10/28 Saar
+* 2013/11/13 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -137,43 +137,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
 
-.macro VFMADDPD_R y0,y1,y2
-    vfmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
 
-.macro VFMADDPD_I y0,y1,y2
-    vfmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
 
 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
 
-.macro VFMADDPD_R y0,y1,y2
-    vfnmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
 
-.macro VFMADDPD_I y0,y1,y2
-    vfmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
 
 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
 
-.macro VFMADDPD_R y0,y1,y2
-    vfmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
 
-.macro VFMADDPD_I y0,y1,y2
-    vfnmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
 
 #else
 
-.macro VFMADDPD_R y0,y1,y2
-    vfnmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
 
-.macro VFMADDPD_I y0,y1,y2
-    vfnmaddpd \y0,\y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
 
 #endif
 
@@ -181,43 +165,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
 
-.macro VFMADDPD_R y0,y1,y2
-    vfmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0
 
-.macro VFMADDPD_I y0,y1,y2
-    vfmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0
 
 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
 
-.macro VFMADDPD_R y0,y1,y2
-    vfnmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
 
-.macro VFMADDPD_I y0,y1,y2
-    vfmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0
 
 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
 
-.macro VFMADDPD_R y0,y1,y2
-    vfmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0
 
-.macro VFMADDPD_I y0,y1,y2
-    vfnmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
 
 #else
 
-.macro VFMADDPD_R y0,y1,y2
-    vfnmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
 
-.macro VFMADDPD_I y0,y1,y2
-    vfnmadd231pd \y1,\y2,\y0
-.endm
+#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
 
 #endif
 
@@ -233,16 +201,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4
     vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5
-    VFMADDPD_R %ymm8 ,%ymm4,%ymm0
-    VFMADDPD_R %ymm12,%ymm4,%ymm1
+    VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )
+    VFMADDPD_R( %ymm12,%ymm4,%ymm1 )
     vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6
-    VFMADDPD_I %ymm9 ,%ymm5,%ymm0
-    VFMADDPD_I %ymm13,%ymm5,%ymm1
+    VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )
+    VFMADDPD_I( %ymm13,%ymm5,%ymm1 )
     vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7
-    VFMADDPD_R %ymm10,%ymm6,%ymm0
-    VFMADDPD_R %ymm14,%ymm6,%ymm1
-    VFMADDPD_I %ymm11,%ymm7,%ymm0
-    VFMADDPD_I %ymm15,%ymm7,%ymm1
+    VFMADDPD_R( %ymm10,%ymm6,%ymm0 )
+    VFMADDPD_R( %ymm14,%ymm6,%ymm1 )
+    VFMADDPD_I( %ymm11,%ymm7,%ymm0 )
+    VFMADDPD_I( %ymm15,%ymm7,%ymm1 )
 
     addq $4, BI
     addq $8, %rax
 
@@ -337,17 +305,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
     vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4
     vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1
-    VFMADDPD_R %xmm8,%xmm4,%xmm0
-    VFMADDPD_R %xmm12,%xmm4,%xmm1
+    VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
+    VFMADDPD_R( %xmm12,%xmm4,%xmm1 )
     vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPD_I %xmm9,%xmm5,%xmm0
-    VFMADDPD_I %xmm13,%xmm5,%xmm1
+    VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
+    VFMADDPD_I( %xmm13,%xmm5,%xmm1 )
     vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6
-    VFMADDPD_R %xmm10,%xmm6,%xmm0
-    VFMADDPD_R %xmm14,%xmm6,%xmm1
+    VFMADDPD_R( %xmm10,%xmm6,%xmm0 )
+    VFMADDPD_R( %xmm14,%xmm6,%xmm1 )
     vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7
-    VFMADDPD_I %xmm11,%xmm7,%xmm0
-    VFMADDPD_I %xmm15,%xmm7,%xmm1
+    VFMADDPD_I( %xmm11,%xmm7,%xmm0 )
+    VFMADDPD_I( %xmm15,%xmm7,%xmm1 )
     addq $4, BI
     addq $4, %rax
 .endm
@@ -441,12 +409,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
     vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4
     vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPD_R %xmm8,%xmm4,%xmm0
-    VFMADDPD_I %xmm9,%xmm5,%xmm0
+    VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
+    VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
     vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6
     vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7
-    VFMADDPD_R %xmm10,%xmm6,%xmm0
-    VFMADDPD_I %xmm11,%xmm7,%xmm0
+    VFMADDPD_R( %xmm10,%xmm6,%xmm0 )
+    VFMADDPD_I( %xmm11,%xmm7,%xmm0 )
     addq $4, BI
     addq $2, %rax
 .endm
@@ -513,10 +481,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1
     vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4
     vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5
-    VFMADDPD_R %ymm8 ,%ymm4,%ymm0
-    VFMADDPD_R %ymm12,%ymm4,%ymm1
-    VFMADDPD_I %ymm9 ,%ymm5,%ymm0
-    VFMADDPD_I %ymm13,%ymm5,%ymm1
+    VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )
+    VFMADDPD_R( %ymm12,%ymm4,%ymm1 )
+    VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )
+    VFMADDPD_I( %ymm13,%ymm5,%ymm1 )
 
     addq $2, BI
     addq $8, %rax
 
@@ -585,12 +553,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x1_SUB
     vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
     vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
-    VFMADDPD_R %xmm8,%xmm4,%xmm0
+    VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
     vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1
-    VFMADDPD_R %xmm12,%xmm4,%xmm1
+    VFMADDPD_R( %xmm12,%xmm4,%xmm1 )
     vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPD_I %xmm9,%xmm5,%xmm0
-    VFMADDPD_I %xmm13,%xmm5,%xmm1
+    VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
+    VFMADDPD_I( %xmm13,%xmm5,%xmm1 )
     addq $2, BI
     addq $4, %rax
 .endm
@@ -655,9 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x1_SUB
     vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
     vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
-    VFMADDPD_R %xmm8,%xmm4,%xmm0
+    VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
     vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
-    VFMADDPD_I %xmm9,%xmm5,%xmm0
+    VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
     addq $2, BI
     addq $2, %rax
 .endm
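
Background on the conversion (not part of the patch): every kernel hunk above applies the same rewrite, replacing a GNU-as ".macro" wrapper with a function-like C-preprocessor "#define", so a single sketch covers all of them. The patch subject only says the changes are for PathScale compatibility; presumably the ".macro"/backslash-argument form is what failed to build there. The EPILOGUE hunks in common_x86.h and common_x86_64.h additionally select "@progbits" instead of "%progbits" when C_PATHSCALE or OS_DARWIN is defined. The sketch below assumes a .S file that the compiler driver (gcc or pathcc) runs through cpp before assembling; the file name fma_demo.S and the label demo are invented for the example, and only the FMA3 variant is shown (the FMA4 branches follow the same pattern).

    /* fma_demo.S -- illustrative sketch only */

    /* Old style: GNU assembler macro; arguments are substituted with a
     * backslash prefix, and expansion happens inside the assembler. */
    .macro OLD_VFMADDPS_R y0,y1,y2
        vfmadd231ps \y1,\y2,\y0        /* y0 += y1 * y2 */
    .endm

    /* New style: C preprocessor macro; cpp expands it before the assembler
     * runs, so only plain instructions reach the assembler. Call sites
     * change from "VFMADDPS_R %ymm8,%ymm4,%ymm0"
     * to          "VFMADDPS_R( %ymm8,%ymm4,%ymm0 )". */
    #define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0

        .text
        .globl demo
    demo:
        VFMADDPS_R( %ymm8,%ymm4,%ymm0 )   /* expands to: vfmadd231ps %ymm4,%ymm0,%ymm8 */
        ret

The practical difference is only where the expansion happens: the #define form relies on the preprocessing pass that the build already performs on .S files, while the .macro form depends on the assembler's own macro support, which is what this patch avoids.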