From 9299d8cfd608ee4ec7f07583d4e4cb1c67f359a6 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 8 Apr 2015 16:29:55 +0200 Subject: [PATCH 1/6] added optimized cdot- and zdot-kernels for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 7 +- kernel/x86_64/KERNEL.PILEDRIVER | 3 + kernel/x86_64/KERNEL.STEAMROLLER | 5 +- kernel/x86_64/cdot.c | 174 +++++++++++++++++++++ kernel/x86_64/cdot_microk_bulldozer-2.c | 196 ++++++++++++++++++++++++ kernel/x86_64/zdot.c | 165 ++++++++++++++++++++ kernel/x86_64/zdot_microk_bulldozer-2.c | 115 ++++++++++++++ 7 files changed, 662 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/cdot.c create mode 100644 kernel/x86_64/cdot_microk_bulldozer-2.c create mode 100644 kernel/x86_64/zdot.c create mode 100644 kernel/x86_64/zdot_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 791c18146..ef1108646 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -3,7 +3,8 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c -#DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c @@ -26,11 +27,11 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S - SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S @@ -40,6 +41,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c @@ -49,6 +51,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S ZGEMMINCOPY = ZGEMMITCOPY = diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index ec70253b8..6eddebdad 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -12,6 +12,9 @@ DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S SDOTKERNEL = sdot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index f5b5cb942..8926010d3 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -3,7 +3,10 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c +DDOTKERNEL = ddot_bullozer.S +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c new file mode 100644 index 000000000..1e9e3204b --- /dev/null +++ b/kernel/x86_64/cdot.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include + + +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "cdot_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "cdot_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "cdot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "cdot_microk_sandy-2.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[8] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[4] += x[j] * y[j+1] ; + dot[5] += x[j+1] * y[j] ; + + dot[2] += x[j+2] * y[j+2] ; + dot[3] += x[j+3] * y[j+3] ; + dot[6] += x[j+2] * y[j+3] ; + dot[7] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[4] += x[j+4] * y[j+5] ; + dot[5] += x[j+5] * y[j+4] ; + + dot[2] += x[j+6] * y[j+6] ; + dot[3] += x[j+7] * y[j+7] ; + dot[6] += x[j+6] * y[j+7] ; + dot[7] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + d[4] = dot[4]; + d[5] = dot[5]; + d[6] = dot[6]; + d[7] = dot[7]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -16; + + if ( n1 ) + { + cdot_kernel_16(n1, x, y , dot ); + dot[0] += dot[2]; + dot[1] += dot[3]; + dot[4] += dot[6]; + dot[5] += dot[7]; + } + i = n1; + int j = i * 2; + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[4] += x[j] * y[j+1] ; + dot[5] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } 
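+	/*
+	 * Strided path below: inc_x and inc_y are shifted left by one because
+	 * each complex element occupies two FLOAT slots (real, imag), so a
+	 * caller stride of one complex element advances two FLOATs.
+	 */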
+ else + { + i=0; + ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + + dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[4] += x[ix] * y[iy+1] ; + dot[5] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[4] + dot[5]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[4] - dot[5]; + +#endif + + return(result); + +} + + diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c new file mode 100644 index 000000000..f587aa036 --- /dev/null +++ b/kernel/x86_64/cdot_microk_bulldozer-2.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n <=1024 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm1, %%xmm9 , 
%%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c new file mode 100644 index 000000000..e13858e06 --- /dev/null +++ b/kernel/x86_64/zdot.c @@ -0,0 +1,165 @@ +/*************************************************************************** +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include + + +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "zdot_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "zdot_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "zdot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "zdot_microk_sandy-2.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + dot[0] += x[j+2] * y[j+2] ; + dot[1] += x[j+3] * y[j+3] ; + dot[2] += x[j+2] * y[j+3] ; + dot[3] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[2] += x[j+4] * y[j+5] ; + dot[3] += x[j+5] * y[j+4] ; + + dot[0] += x[j+6] * y[j+6] ; + dot[1] += x[j+7] * y[j+7] ; + dot[2] += x[j+6] * y[j+7] ; + dot[3] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -8; + + if ( n1 ) + zdot_kernel_8(n1, x, y , dot ); + + i = n1; + int j = i * 2; + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } + else + { + i=0; + ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + + dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[2] += x[ix] * y[iy+1] ; + dot[3] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[2] + dot[3]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[2] - dot[3]; + +#endif + + return(result); + +} + + diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c new file mode 100644 index 000000000..d45c4ad38 --- /dev/null +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 5c511639720d428904cf2c010f550fce090acc12 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 9 Apr 2015 09:45:23 +0200 Subject: [PATCH 2/6] added optimized cdot- and zdot-kernel 
for steamroller --- kernel/x86_64/KERNEL.STEAMROLLER | 2 +- kernel/x86_64/cdot.c | 4 +- kernel/x86_64/cdot_microk_steamroller-2.c | 196 ++++++++++++++++++++++ kernel/x86_64/zdot.c | 4 +- kernel/x86_64/zdot_microk_steamroller-2.c | 193 +++++++++++++++++++++ 5 files changed, 396 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/cdot_microk_steamroller-2.c create mode 100644 kernel/x86_64/zdot_microk_steamroller-2.c diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index 8926010d3..fbe04ca70 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -3,7 +3,7 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c -DDOTKERNEL = ddot_bullozer.S +DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 1e9e3204b..bfe707310 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #include "cdot_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "cdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "cdot_microk_nehalem-2.c" #elif defined(HASWELL) diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c new file mode 100644 index 000000000..76a3aa0eb --- /dev/null +++ b/kernel/x86_64/cdot_microk_steamroller-2.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 1280 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + 
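+	/*
+	 * imm8 0xb1 gives lane order 1,0,3,2, i.e. it swaps the real and
+	 * imaginary float of every complex element in place, so the FMAs
+	 * that follow accumulate the cross products x_r*y_i and x_i*y_r.
+	 */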
"vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index e13858e06..e11b62ccd 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #include "zdot_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "zdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "zdot_microk_nehalem-2.c" #elif defined(HASWELL) diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c new file mode 100644 index 000000000..325f74ae3 --- /dev/null +++ b/kernel/x86_64/zdot_microk_steamroller-2.c @@ -0,0 +1,193 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 640 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + //"prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + // "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vfmadd231pd %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmadd231pd %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + return; + } + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups 
(%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vfmadd231pd %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmadd231pd %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + From b57a60dac86703d82960e03e63065b435ea06f6c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 9 Apr 2015 10:33:46 +0200 Subject: [PATCH 3/6] updated cdot and zdot for piledriver --- kernel/x86_64/cdot.c | 4 ++-- kernel/x86_64/zdot.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index bfe707310..e0ba31ae7 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -30,9 +30,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #include "cdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "cdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "cdot_microk_nehalem-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index e11b62ccd..ee220c70e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -30,9 +30,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "zdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "zdot_microk_nehalem-2.c" From fd838c75bc309b02c3a3abfdc3dc857e735f2a37 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 9 Apr 2015 15:13:52 +0200 Subject: [PATCH 4/6] add optimized cdot- and zdot-kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 3 + kernel/x86_64/cdot_microk_haswell-2.c | 119 ++++++++++++++++++++++++++ kernel/x86_64/zdot_microk_haswell-2.c | 119 ++++++++++++++++++++++++++ 3 files changed, 241 insertions(+) create mode 100644 kernel/x86_64/cdot_microk_haswell-2.c create mode 100644 kernel/x86_64/zdot_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ea3d95872..a6e085d18 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -12,6 +12,9 @@ CGEMVTKERNEL = cgemv_t_4.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + DAXPYKERNEL = daxpy.c SAXPYKERNEL = saxpy.c diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c new file mode 100644 index 000000000..52cedd556 --- /dev/null +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%2,%0,4), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 2 * x + + "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 2 * y + + "vfmadd231ps %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + + "vfmadd231ps %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231ps %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $32 , %0 \n\t" + "vfmadd231ps %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $16 , %1 \n\t" + "vfmadd231ps %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddps %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddps %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddps %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c new file mode 100644 index 000000000..3785713de --- /dev/null +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" + + "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + 
"%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + From 33b332372ac7c21c1334d5f27013fce0144ac374 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 10 Apr 2015 09:37:26 +0200 Subject: [PATCH 5/6] add optimized cdot- and zdot-kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 3 + kernel/x86_64/cdot_microk_haswell-2.c | 2 +- kernel/x86_64/cdot_microk_sandy-2.c | 127 +++++++++++++++ kernel/x86_64/zdot_microk_haswell-2.c | 2 +- kernel/x86_64/zdot_microk_sandy-2.c | 222 ++++++++++++++++++++++++++ 5 files changed, 354 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/cdot_microk_sandy-2.c create mode 100644 kernel/x86_64/zdot_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b0b6c6c84..a60f4a17a 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -5,6 +5,9 @@ ZGEMVNKERNEL = zgemv_n_4.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c index 52cedd556..fc76b138a 100644 --- a/kernel/x86_64/cdot_microk_haswell-2.c +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -62,8 +62,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231ps %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231ps %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c new file mode 100644 index 000000000..22cd79e2e --- /dev/null +++ b/kernel/x86_64/cdot_microk_sandy-2.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 2 * y + + "vmulps %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddps %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddps %%ymm1 , %%ymm11, %%ymm1 \n\t" + "vmulps %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm13, %%ymm11 \n\t" + + "vmovups 64(%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vaddps %%ymm4 , %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5 , %%ymm11, %%ymm5 \n\t" + + "vmulps %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddps %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddps %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulps %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm15, %%ymm11 \n\t" + "addq $32 , %0 \n\t" + "subq $16 , %1 \n\t" + "vaddps %%ymm6 , %%ymm10, %%ymm6 \n\t" + "vaddps %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddps %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddps %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddps %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddps %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 3785713de..04a6b971f 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -62,8 +62,8 @@ static void zdot_kernel_8( 
BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c new file mode 100644 index 000000000..fd06612e6 --- /dev/null +++ b/kernel/x86_64/zdot_microk_sandy-2.c @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + +if ( n < 1280 ) +{ + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddpd %%ymm1 , %%ymm11, %%ymm1 \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vmovups 64(%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm9 \n\t" // 2 * x + "vaddpd %%ymm4 , %%ymm10, %%ymm4 \n\t" + "vaddpd %%ymm5 , %%ymm11, %%ymm5 \n\t" + + + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddpd %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddpd %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "addq $16 , %0 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm6 , %%ymm10, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddpd %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "prefetcht0 
576(%3,%0,8) \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "prefetcht0 576(%2,%0,8) \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddpd %%ymm1 , %%ymm11, %%ymm1 \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vmovups 64(%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm9 \n\t" // 2 * x + "vaddpd %%ymm4 , %%ymm10, %%ymm4 \n\t" + "vaddpd %%ymm5 , %%ymm11, %%ymm5 \n\t" + + + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddpd %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddpd %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "addq $16 , %0 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm6 , %%ymm10, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddpd %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + + +} + + From 3119def9a7cdcbd1b030cd70054dc68a65ead41a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 10 Apr 2015 11:10:31 +0200 Subject: [PATCH 6/6] updated cdot and zdot --- kernel/x86_64/cdot.c | 2 - kernel/x86_64/zdot.c | 2 - kernel/x86_64/zdot_microk_bulldozer-2.c | 85 ++++++++++++++++++++++- kernel/x86_64/zdot_microk_haswell-2.c | 91 +++++++++++++++++++++++++ 4 files changed, 174 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index e0ba31ae7..266ab4fb9 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,8 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "cdot_microk_steamroller-2.c" -#elif defined(NEHALEM) -#include "cdot_microk_nehalem-2.c" #elif defined(HASWELL) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index ee220c70e..c0cca521b 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -34,8 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "zdot_microk_steamroller-2.c" -#elif defined(NEHALEM) -#include "zdot_microk_nehalem-2.c" #elif defined(HASWELL) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c index d45c4ad38..30a9552d6 100644 --- a/kernel/x86_64/zdot_microk_bulldozer-2.c +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -34,6 +34,9 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) BLASLONG register i = 0; + if ( n < 768 ) + { + __asm__ __volatile__ ( "vzeroupper \n\t" @@ -48,11 +51,88 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) ".align 16 \n\t" "1: \n\t" - "prefetcht0 512(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x - "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "prefetcht0 384(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y @@ -110,6 +190,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "memory" ); + } diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 04a6b971f..810cb4439 100644 
--- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -34,6 +34,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) BLASLONG register i = 0; + if ( n <=1280 ) + { + + __asm__ __volatile__ ( "vzeroupper \n\t" @@ -111,6 +115,93 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); + return; + } + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "prefetcht0 576(%2,%0,8) \n\t" + "vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x + + "prefetcht0 576(%3,%0,8) \n\t" + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + + "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +
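For reference, a minimal scalar sketch of what each zdot_kernel_8 variant above accumulates; this is illustrative only, not part of the submitted kernels. Per the register comments, the accumulators hold four separate partial sums (x_r*y_r, x_i*y_i, x_r*y_i, x_i*y_r), and the n < 768 / n <= 1280 checks added in PATCH 6/6 appear to select a plain loop for vectors small enough to stay cache-resident and a prefetcht0 variant for larger ones. The combination of the partial sums into the final complex result, including the CONJ sign handling, is done in the zdot.c wrapper and is not shown here.

/* Scalar sketch of the partial sums produced by one zdot_kernel_8 call.
 * FLOAT and BLASLONG come from common.h, as in the kernels above.
 * Assumes unit stride and n a multiple of the unroll factor, as the
 * vectorized kernels do. Hypothetical helper name; not in the patch. */
static void zdot_kernel_8_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
	BLASLONG i;
	FLOAT rr = 0.0, ii = 0.0, ri = 0.0, ir = 0.0;

	for (i = 0; i < n; i++)              /* n complex elements */
	{
		rr += x[2*i]   * y[2*i];     /* x_r * y_r */
		ii += x[2*i+1] * y[2*i+1];   /* x_i * y_i */
		ri += x[2*i]   * y[2*i+1];   /* x_r * y_i */
		ir += x[2*i+1] * y[2*i];     /* x_i * y_r */
	}
	dot[0] = rr;                         /* stored from xmm0/ymm0 group */
	dot[1] = ii;
	dot[2] = ri;                         /* stored from xmm4/ymm4 group */
	dot[3] = ir;
}

Keeping the four products in separate accumulators is what lets the vector loops run without any in-loop negation: the single add/subtract that distinguishes the conjugated from the unconjugated dot product can then be applied once, after the loop, by the wrapper.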