From 95a8caa2f340b5936ef3a8106f04df07a07e4d93 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 12:12:17 +0200 Subject: [PATCH 01/27] added optimized sgemv_t kernel --- kernel/x86_64/KERNEL | 2 +- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_t.c | 200 +++++++++++++++++++++ kernel/x86_64/sgemv_t_microk_bulldozer-2.c | 109 +++++++++++ 8 files changed, 315 insertions(+), 6 deletions(-) create mode 100644 kernel/x86_64/sgemv_t.c create mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 9d0080ae7..3508753ee 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -377,7 +377,7 @@ SGEMVNKERNEL = sgemv_n.c endif ifndef SGEMVTKERNEL -SGEMVTKERNEL = ../arm/gemv_t.c +SGEMVTKERNEL = sgemv_t.c endif ifndef DGEMVNKERNEL diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 21fc94701..accdddf0e 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t_avx.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 3e20fcfc7..878a56b04 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t_avx.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 04efa391a..8276150c6 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c +SGEMVTKERNEL = sgemv_t.c endif 
diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index b7565edeb..7b3c9a7b8 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t_avx.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 9dae3a41d..26706b61d 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -3,7 +3,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c else SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t_avx.c +SGEMVTKERNEL = sgemv_t.c endif diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c new file mode 100644 index 000000000..89254c256 --- /dev/null +++ b/kernel/x86_64/sgemv_t.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_t_microk_bulldozer-2.c" +#endif + +/* +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_n_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "sgemv_n_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_n_microk_sandy-2.c" +#elif defined(NEHALEM) +#include "sgemv_n_microk_nehalem-2.c" +#endif +*/ + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_16x4 + +/* Portable fallback, compiled only when no included micro-kernel defined HAVE_KERNEL_16x4. Computes four dot products at once: y[j] = sum of ap[j][k]*x[k] for k in [0,n), j = 0..3. y[0..3] are overwritten, not accumulated. The loop steps four elements at a time and reads k..k+3, so n is expected to be a multiple of 4. */ static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +/* Single-row variant: stores the dot product of ap[0..n) and x[0..n) into *y, overwriting it. Uses the same four-element step, so n is expected to be a multiple of 4. */ static void sgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + *y = temp;
+} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Wed, 6 Aug 2014 13:42:41 +0200 Subject: [PATCH 02/27] added optimized sgemv_t kernel for haswell --- kernel/x86_64/sgemv_t.c | 2 + kernel/x86_64/sgemv_t_microk_haswell-2.c | 112 +++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 kernel/x86_64/sgemv_t_microk_haswell-2.c diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c index 89254c256..566311a69 100644 --- a/kernel/x86_64/sgemv_t.c +++ b/kernel/x86_64/sgemv_t.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "sgemv_t_microk_haswell-2.c" #endif /* diff --git a/kernel/x86_64/sgemv_t_microk_haswell-2.c b/kernel/x86_64/sgemv_t_microk_haswell-2.c new file mode 100644 index 000000000..cef703483 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_haswell-2.c @@ -0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +/* Four simultaneous dot products: y[j] = sum of ap[j][k]*x[k] for k in [0,n), j = 0..3. y[0..3] are overwritten, not accumulated. Each loop pass consumes 16 floats per row (two 8-float ymm loads of x feeding vfmadd231ps); after the loop the lanes of each accumulator are summed horizontally. The loop subtracts 16 from n and exits only when it reaches exactly zero, so n must be a positive multiple of 16. */ static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t" + +
"addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + "vzeroupper \n\t" + + : /* outputs: i and n are read-write because the loop modifies both registers (addq on %0, subq on %1); GCC assumes input-only operands are left unchanged */ + "+r" (i), // 0 + "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 7aa43c8928bb1c69373ed426aa565c207a99470f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 14:06:30 +0200 Subject: [PATCH 03/27] enabled optimized sgemv kernels for windows --- kernel/x86_64/KERNEL.HASWELL | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 878a56b04..ff22954c6 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c From ca6c8d06cea5bb2fbe1b544f0da1dabefd256422 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 14:24:36 +0200 Subject: [PATCH 04/27] enabled optimized sgemv kernels
for windows --- kernel/x86_64/KERNEL.BULLDOZER | 6 ------ kernel/x86_64/KERNEL.PILEDRIVER | 6 ------ 2 files changed, 12 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index accdddf0e..893f13064 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 7b3c9a7b8..146a8768b 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S From d945a2b06d131995d4184311d397b17a6f1c7afb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 16:21:48 +0200 Subject: [PATCH 05/27] added optimized sgemv_t kernel for nehalem --- kernel/x86_64/sgemv_t.c | 2 + kernel/x86_64/sgemv_t_microk_nehalem-2.c | 159 +++++++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 kernel/x86_64/sgemv_t_microk_nehalem-2.c diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c index 566311a69..47e749e58 100644 --- a/kernel/x86_64/sgemv_t.c +++ b/kernel/x86_64/sgemv_t.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_t_microk_bulldozer-2.c" #elif defined(HASWELL) #include "sgemv_t_microk_haswell-2.c" +#elif defined(NEHALEM) +#include "sgemv_t_microk_nehalem-2.c" #endif /* diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-2.c b/kernel/x86_64/sgemv_t_microk_nehalem-2.c new file mode 100644 index 000000000..e1f2b81bd --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_nehalem-2.c @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +/* Four simultaneous dot products: y[j] = sum of ap[j][k]*x[k] for k in [0,n), j = 0..3. y[0..3] are overwritten, not accumulated. Each loop pass consumes 16 floats per row as four 4-float xmm loads (movups, mulps, addps) into two partial accumulators per row (xmm0-3 and xmm4-7); after the loop the pairs are added together and reduced with haddps. The loop subtracts 16 from n and exits only when it reaches exactly zero, so n must be a positive multiple of 16. */ static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorps %%xmm0 , %%xmm0 \n\t" + "xorps %%xmm1 , %%xmm1 \n\t" + "xorps %%xmm2 , %%xmm2 \n\t" + "xorps %%xmm3 , %%xmm3 \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" + "xorps %%xmm6 , %%xmm6 \n\t" + "xorps %%xmm7 , %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "movups (%4,%0,4), %%xmm8 \n\t" + "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + + "prefetcht0 384(%4,%0,4) \n\t" + + "movups 16(%4,%0,4), %%xmm9 \n\t" + "movups 32(%4,%0,4), %%xmm10 \n\t" + "movups 48(%4,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm0 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm4 \n\t" + "movups (%5,%0,4), %%xmm8 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm0 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm4 \n\t" + + "prefetcht0 384(%5,%0,4) \n\t" + + "movups 16(%5,%0,4), %%xmm9 \n\t" + "movups 32(%5,%0,4), %%xmm10 \n\t" + "movups 48(%5,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm1 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "movups (%6,%0,4), %%xmm8 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm1 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + + "prefetcht0 384(%6,%0,4) \n\t" + + "movups 16(%6,%0,4), %%xmm9 \n\t" + "movups 32(%6,%0,4), %%xmm10 \n\t" + "movups 48(%6,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm2 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" +
"addps %%xmm9 , %%xmm6 \n\t" + "movups (%7,%0,4), %%xmm8 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm2 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm6 \n\t" + + "prefetcht0 384(%7,%0,4) \n\t" + + "movups 16(%7,%0,4), %%xmm9 \n\t" + "movups 32(%7,%0,4), %%xmm10 \n\t" + "movups 48(%7,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm3 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm9 , %%xmm7 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "addps %%xmm10, %%xmm3 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "addps %%xmm0, %%xmm4 \n\t" + "addps %%xmm1, %%xmm5 \n\t" + "addps %%xmm2, %%xmm6 \n\t" + "addps %%xmm3, %%xmm7 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "movss %%xmm4, (%3) \n\t" + "movss %%xmm5, 4(%3) \n\t" + "movss %%xmm6, 8(%3) \n\t" + "movss %%xmm7, 12(%3) \n\t" + + : /* outputs: i and n are read-write because the loop modifies both registers (addq on %0, subq on %1); GCC assumes input-only operands are left unchanged */ + "+r" (i), // 0 + "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 2f8927376f50445f800b995ba6010e7ed571ecba Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Aug 2014 16:58:21 +0200 Subject: [PATCH 06/27] enabled optimized nehalem sgemv_t kernel for windows --- kernel/x86_64/KERNEL.NEHALEM | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 8276150c6..ca9ff252d 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL =
sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S From c9bad1403ad4ff8b170bab16affcc1de1a6b66ce Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 7 Aug 2014 07:49:33 +0200 Subject: [PATCH 07/27] added optimized sgemv_t kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 6 -- kernel/x86_64/sgemv_t.c | 14 +-- kernel/x86_64/sgemv_t_microk_sandy-2.c | 132 +++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 18 deletions(-) create mode 100644 kernel/x86_64/sgemv_t_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 26706b61d..d4fbca7f2 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,11 +1,5 @@ -ifdef OS_WINDOWS -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -else SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c -endif - SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c index 47e749e58..adfaa9925 100644 --- a/kernel/x86_64/sgemv_t.c +++ b/kernel/x86_64/sgemv_t.c @@ -32,22 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_t_microk_bulldozer-2.c" #elif defined(HASWELL) #include "sgemv_t_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_t_microk_sandy-2.c" #elif defined(NEHALEM) #include "sgemv_t_microk_nehalem-2.c" #endif -/* -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_n_microk_bulldozer-2.c" -#elif defined(HASWELL) -#include "sgemv_n_microk_haswell-2.c" -#elif defined(SANDYBRIDGE) -#include "sgemv_n_microk_sandy-2.c" -#elif defined(NEHALEM) -#include "sgemv_n_microk_nehalem-2.c" -#endif -*/ - #define NBMAX 4096 #ifndef HAVE_KERNEL_16x4 diff --git a/kernel/x86_64/sgemv_t_microk_sandy-2.c b/kernel/x86_64/sgemv_t_microk_sandy-2.c new file mode 100644 index 000000000..6a3748238 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_sandy-2.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +/* Four simultaneous dot products: y[j] = sum of ap[j][k]*x[k] for k in [0,n), j = 0..3. y[0..3] are overwritten, not accumulated. Each loop pass consumes 16 floats per row (two 8-float ymm loads of x, separate vmulps and vaddps) into two partial accumulators per row (ymm0-3 and ymm4-7); after the loop the pairs are added together and the lanes summed horizontally. The loop subtracts 16 from n and exits only when it reaches exactly zero, so n must be a positive multiple of 16. */ static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t" + "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t" + "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t" + "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t" +
"prefetcht0 384(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t" + "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t" + "vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm2, %%ymm6 \n\t" + "vaddps %%ymm7, %%ymm3, %%ymm7 \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + + "vzeroupper \n\t" + + : /* outputs: i and n are read-write because the loop modifies both registers (addq on %0, subq on %1); GCC assumes input-only operands are left unchanged */ + "+r" (i), // 0 + "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 60f17628cc65387f474160462511f91f93d50c3e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 7 Aug 2014 09:18:02 +0200 Subject: [PATCH 08/27] added optimized dgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemv_n.c | 203 +++++++++++++++++++++++ kernel/x86_64/dgemv_n_microk_haswell-2.c | 89 ++++++++++ 3 files changed, 293 insertions(+) create mode 100644 kernel/x86_64/dgemv_n.c
create mode 100644 kernel/x86_64/dgemv_n_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ff22954c6..bcff2e224 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,5 +1,6 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c +DGEMVNKERNEL = dgemv_n.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/dgemv_n.c b/kernel/x86_64/dgemv_n.c new file mode 100644 index 000000000..5192ba193 --- /dev/null +++ b/kernel/x86_64/dgemv_n.c @@ -0,0 +1,203 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(HASWELL) +#include "dgemv_n_microk_haswell-2.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_16x4 + +/* Portable fallback, compiled only when no included micro-kernel defined HAVE_KERNEL_16x4. Accumulates into y a combination of four columns: y[k] += ap[0][k]*x[0] + ap[1][k]*x[1] + ap[2][k]*x[2] + ap[3][k]*x[3] for k in [0,n). y is updated in place, not overwritten. The loop is unrolled by four and touches k..k+3, so n is expected to be a multiple of 4. */ static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +/* Single-column variant: y[k] += ap[k]*x[0] for k in [0,n), updating y in place. Unrolled by four, so n is expected to be a multiple of 4. */ static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +static void zero_y(BLASLONG n, FLOAT *dest) +{ + BLASLONG i; + for ( i=0; i Date: Thu, 7 Aug 2014 10:08:54 +0200 Subject: [PATCH 09/27] added optimized dgemv_t kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemv_t.c | 188 +++++++++++++++++++++++ kernel/x86_64/dgemv_t_microk_haswell-2.c | 107 +++++++++++++ 3 files changed, 296 insertions(+) create mode 100644 kernel/x86_64/dgemv_t.c create mode 100644 kernel/x86_64/dgemv_t_microk_haswell-2.c diff --git
a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index bcff2e224..cd280e4b6 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,6 +1,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c +DGEMVTKERNEL = dgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/dgemv_t.c b/kernel/x86_64/dgemv_t.c new file mode 100644 index 000000000..76aacd349 --- /dev/null +++ b/kernel/x86_64/dgemv_t.c @@ -0,0 +1,188 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(HASWELL) +#include "dgemv_t_microk_haswell-2.c" +#endif + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_16x4 + +/* Portable fallback, compiled only when no included micro-kernel defined HAVE_KERNEL_16x4. Computes four dot products at once: y[j] = sum of ap[j][k]*x[k] for k in [0,n), j = 0..3. y[0..3] are overwritten, not accumulated. The loop steps four elements at a time and reads k..k+3, so n is expected to be a multiple of 4. */ static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +/* Single-row variant: stores the dot product of ap[0..n) and x[0..n) into *y, overwriting it. Uses the same four-element step, so n is expected to be a multiple of 4. */ static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + *y = temp; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Thu, 7 Aug 2014 22:30:20 +0200 Subject: [PATCH 10/27] added zgemv_n c-function --- kernel/x86_64/zgemv_n.c | 302 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644
kernel/x86_64/zgemv_n.c diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c new file mode 100644 index 000000000..be5b08dcd --- /dev/null +++ b/kernel/x86_64/zgemv_n.c @@ -0,0 +1,302 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#define NBMAX 1024 + +#ifndef HAVE_KERNEL_16x4 + +static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if !defined(CONJ) +#if !defined(XCONJ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; +#endif +#else +#if !defined(XCONJ) + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] -= a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] -= a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] -= a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] -= a3[i]*x[7] - a3[i+1] * x[6]; + +#else + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] -= a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] -= a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] -= a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] -= a3[i]*x[7] + a3[i+1] * x[6]; + +#endif +#endif + } +} + +#endif + +static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< 2*n; i+=2 ) + { +#if !defined(CONJ) +#if !defined(XCONJ) + y[i] += a0[i]*x[0] - a0[i+1] * 
x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif +#else +#if !defined(XCONJ) + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] -= a0[i]*x[1] - a0[i+1] * x[0]; + +#else + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] -= a0[i]*x[1] + a0[i+1] * x[0]; +#endif +#endif + + } +} + + +static void zero_y(BLASLONG n, FLOAT *dest) +{ + BLASLONG i; + for ( i=0; i<2*n; i++ ) + { + *dest = 0.0; + dest++; + } +} + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + for ( i=0; i Date: Sun, 10 Aug 2014 08:39:17 +0200 Subject: [PATCH 11/27] added optimized zgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemv_n_microk_haswell-2.c | 2 +- kernel/x86_64/zgemv_n.c | 4 + kernel/x86_64/zgemv_n_microk_haswell-2.c | 149 +++++++++++++++++++++++ 4 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/zgemv_n_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index cd280e4b6..d126eb6f4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -2,6 +2,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c +ZGEMVNKERNEL = zgemv_n.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/dgemv_n_microk_haswell-2.c b/kernel/x86_64/dgemv_n_microk_haswell-2.c index 88c7d4163..b9f462cb2 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-2.c @@ -43,7 +43,7 @@ static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 192(%4,%0,8) \n\t" + "prefetcht0 192(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm4 \n\t" // 4 * y "vmovups 32(%3,%0,8), %%ymm5 \n\t" // 4 * y diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index 
be5b08dcd..141cb35df 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -28,6 +28,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#if defined(HASWELL) +#include "zgemv_n_microk_haswell-2.c" +#endif + #define NBMAX 1024 diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c new file mode 100644 index 000000000..8583f96b3 --- /dev/null +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void zgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); + +static void zgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + "prefetcht0 192(%5,%0,8) \n\t" + "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 + "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 + + "vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231pd %%ymm8 , %%ymm1, 
%%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 + "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 + + "vfmadd231pd %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231pd %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231pd %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231pd %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 + "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 + + "vfmadd231pd %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231pd %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231pd %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231pd %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmadd231pd %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231pd %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 
\n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "prefetcht0 192(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm12 \n\t" + "vmovups 32(%3,%0,8), %%ymm13 \n\t" + +#if !defined(XCONJ) + "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t" +#else + "vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t" + "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t" +#endif + + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From dbc2eff029b298e324c175b698ec132436e6df43 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 10 Aug 2014 11:57:24 +0200 Subject: [PATCH 12/27] disabled optimized haswell zgemv_n kernel for windows ( bad rounding ) --- kernel/x86_64/KERNEL.HASWELL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d126eb6f4..7d4cddbcc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -2,7 +2,10 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c + +ifndef OS_WINDOWS ZGEMVNKERNEL = zgemv_n.c +endif SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c From 
6fe416976d1ad2ff8f60829cb01a63a11d876429 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Aug 2014 09:13:18 +0200 Subject: [PATCH 13/27] added optimized zgemv_t c-kernel --- kernel/x86_64/zgemv_t.c | 267 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 kernel/x86_64/zgemv_t.c diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c new file mode 100644 index 000000000..a2dc45c45 --- /dev/null +++ b/kernel/x86_64/zgemv_t.c @@ -0,0 +1,267 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" +#endif +*/ + +#define NBMAX 1028 + +#ifndef HAVE_KERNEL_16x4 + +static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; +#endif + } + y[0] = temp_r0; + y[1] = temp_i0; + y[2] = temp_r1; + y[3] = temp_i1; + 
y[4] = temp_r2; + y[5] = temp_i2; + y[6] = temp_r3; + y[7] = temp_i3; +} + +#endif + +static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i += a0[i]*x[i+1] + a0[i+1]*x[i]; +#else + temp_r += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i += a0[i]*x[i+1] - a0[i+1]*x[i]; +#endif + } + *y = temp_r; + *(y+1) = temp_i; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Mon, 11 Aug 2014 13:10:12 +0200 Subject: [PATCH 14/27] added optimized zgemv_t for haswell --- kernel/x86_64/dgemv_n_microk_haswell-2.c | 4 +- kernel/x86_64/dgemv_t_microk_haswell-2.c | 4 +- kernel/x86_64/sgemv_n_microk_bulldozer-2.c | 4 +- kernel/x86_64/sgemv_n_microk_haswell-2.c | 4 +- kernel/x86_64/sgemv_n_microk_nehalem-2.c | 4 +- kernel/x86_64/sgemv_n_microk_sandy-2.c | 4 +- kernel/x86_64/sgemv_t_microk_bulldozer-2.c | 4 +- kernel/x86_64/sgemv_t_microk_haswell-2.c | 4 +- kernel/x86_64/sgemv_t_microk_nehalem-2.c | 4 +- kernel/x86_64/sgemv_t_microk_sandy-2.c | 4 +- kernel/x86_64/zgemv_n_microk_haswell-2.c | 4 +- kernel/x86_64/zgemv_t_microk_bulldozer-2.c | 139 +++++++++++++++++++++ kernel/x86_64/zgemv_t_microk_haswell-2.c | 139 +++++++++++++++++++++ 13 files changed, 300 insertions(+), 22 deletions(-) create mode 100644 kernel/x86_64/zgemv_t_microk_bulldozer-2.c create mode 100644 kernel/x86_64/zgemv_t_microk_haswell-2.c diff --git a/kernel/x86_64/dgemv_n_microk_haswell-2.c b/kernel/x86_64/dgemv_n_microk_haswell-2.c index b9f462cb2..28e2fe4f6 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/dgemv_t_microk_haswell-2.c b/kernel/x86_64/dgemv_t_microk_haswell-2.c index 94d4c319e..1a4ba37d7 100644 --- a/kernel/x86_64/dgemv_t_microk_haswell-2.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void dgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-2.c b/kernel/x86_64/sgemv_n_microk_bulldozer-2.c index d50fa4268..c4a490587 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-2.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_n_microk_haswell-2.c b/kernel/x86_64/sgemv_n_microk_haswell-2.c index d3fee67c3..19888d150 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-2.c b/kernel/x86_64/sgemv_n_microk_nehalem-2.c index 3cfb82a45..40ccbb78f 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-2.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_n_microk_sandy-2.c b/kernel/x86_64/sgemv_n_microk_sandy-2.c index 21eff1c5e..b255ddbcb 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-2.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-2.c b/kernel/x86_64/sgemv_t_microk_bulldozer-2.c index 54bdca63a..e4498afa3 100644 --- a/kernel/x86_64/sgemv_t_microk_bulldozer-2.c +++ b/kernel/x86_64/sgemv_t_microk_bulldozer-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_t_microk_haswell-2.c b/kernel/x86_64/sgemv_t_microk_haswell-2.c index cef703483..e6d47270d 100644 --- a/kernel/x86_64/sgemv_t_microk_haswell-2.c +++ b/kernel/x86_64/sgemv_t_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-2.c b/kernel/x86_64/sgemv_t_microk_nehalem-2.c index e1f2b81bd..db5a1448b 100644 --- a/kernel/x86_64/sgemv_t_microk_nehalem-2.c +++ b/kernel/x86_64/sgemv_t_microk_nehalem-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/sgemv_t_microk_sandy-2.c b/kernel/x86_64/sgemv_t_microk_sandy-2.c index 6a3748238..841522302 100644 --- a/kernel/x86_64/sgemv_t_microk_sandy-2.c +++ b/kernel/x86_64/sgemv_t_microk_sandy-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c index 8583f96b3..833983fe0 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -26,9 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16x4 1 -static void zgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline)); +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); -static void zgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c new file mode 100644 index 000000000..efb6d784e --- /dev/null +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + + "vfmaddpd %%ymm8 , %%ymm4 , 
%%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmaddpd %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmaddpd %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmaddpd %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmaddpd %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmaddpd %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmaddpd %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmaddpd %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + + "addq $4 , %0 \n\t" + "subq $2 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddpd %%xmm12, %%xmm13, %%xmm12 
\n\t" + "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + "vmovups %%xmm12, 32(%3) \n\t" + "vmovups %%xmm14, 48(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/zgemv_t_microk_haswell-2.c b/kernel/x86_64/zgemv_t_microk_haswell-2.c new file mode 100644 index 000000000..2dddef27d --- /dev/null +++ b/kernel/x86_64/zgemv_t_microk_haswell-2.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + + "vfmadd231pd %%ymm4 , %%ymm0, 
%%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + + "addq $4 , %0 \n\t" + "subq $2 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" + 
+ "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + "vmovups %%xmm12, 32(%3) \n\t" + "vmovups %%xmm14, 48(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 09fcd3a34135ad86f4b17f8e15893fe99b9f0171 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Aug 2014 14:19:25 +0200 Subject: [PATCH 15/27] add optimized zgemv_t kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/zgemv_t.c | 8 +- kernel/x86_64/zgemv_t_microk_bulldozer-2.c | 151 +++++++++++++-------- 3 files changed, 101 insertions(+), 60 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 893f13064..19bf7fd32 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c index a2dc45c45..b54d5f4e2 100644 --- a/kernel/x86_64/zgemv_t.c +++ b/kernel/x86_64/zgemv_t.c @@ -28,11 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -/* -#if defined(HASWELL) -#include "zgemv_t_microk_haswell-2.c" + +#if defined(BULLDOZER) +#include "zgemv_t_microk_bulldozer-2.c" #endif -*/ + #define NBMAX 1028 diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c index efb6d784e..65d5a10a2 100644 --- a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c @@ -37,77 +37,118 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ( "vzeroupper \n\t" - "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp - "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp - "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp - "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp + "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp + "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp + "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp + "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" // temp + "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" ".align 16 \n\t" ".L01LOOP%=: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 - "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 - "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 - "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 - "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 - "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 - "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 - "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + "prefetcht0 192(%4,%0,8) \n\t" + 
"vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 - "vfmaddpd %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmaddpd %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmaddpd %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmaddpd %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmaddpd %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmaddpd %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmaddpd %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmaddpd %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - "addq $4 , %0 \n\t" - "subq $2 , %1 \n\t" + "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 16(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 16(%7,%0,8), %%xmm7 
\n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 32(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 32(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 48(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 48(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // 
ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" "jnz .L01LOOP%= \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" - "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" - "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" - "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" - "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" - "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" - "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" #else - "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" - "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" - "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" - "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" - "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" - "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" - "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" - "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" - "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" - "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" - "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, 
%%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" #endif - "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" - "vextractf128 $1, %%ymm10, %%xmm11 \n\t" - "vextractf128 $1, %%ymm12, %%xmm13 \n\t" - "vextractf128 $1, %%ymm14, %%xmm15 \n\t" - - "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" - "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" - "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" - "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" "vmovups %%xmm8 , (%3) \n\t" "vmovups %%xmm10, 16(%3) \n\t" From 58b075daef4c65b02951ed8a8fd78dc53cab0893 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Aug 2014 16:57:52 +0200 Subject: [PATCH 16/27] added optimized zgemv_t kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/zgemv_t.c | 2 ++ kernel/x86_64/zgemv_t_microk_haswell-2.c | 29 ++++++++++++++++++++++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 7d4cddbcc..9a48289c5 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -6,6 +6,7 @@ DGEMVTKERNEL = dgemv_t.c ifndef OS_WINDOWS ZGEMVNKERNEL = zgemv_n.c endif +ZGEMVTKERNEL = zgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c index b54d5f4e2..df75afeff 100644 --- a/kernel/x86_64/zgemv_t.c +++ b/kernel/x86_64/zgemv_t.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) #include "zgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/zgemv_t_microk_haswell-2.c b/kernel/x86_64/zgemv_t_microk_haswell-2.c index 2dddef27d..99a620e44 100644 --- a/kernel/x86_64/zgemv_t_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_t_microk_haswell-2.c @@ -49,6 +49,7 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 @@ -56,9 +57,13 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + "prefetcht0 384(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "prefetcht0 384(%5,%0,8) \n\t" "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "prefetcht0 384(%6,%0,8) \n\t" "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "prefetcht0 384(%7,%0,8) \n\t" "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 @@ -70,9 +75,29 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , 
%%ymm1 \n\t" // imag values from x0 and x1 - "addq $4 , %0 \n\t" - "subq $2 , %1 \n\t" + "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" "jnz .L01LOOP%= \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) From 07c66b196093b3bc124f674a341ce304939eccde Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 12 Aug 2014 08:35:42 +0200 Subject: [PATCH 17/27] modified algorithm for better numerical stability --- kernel/x86_64/zgemv_n.c | 114 ++++++++++------------------------------ 1 file changed, 27 insertions(+), 87 deletions(-) diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index 141cb35df..75e40eccb 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -48,8 +48,7 @@ static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) for ( i=0; i< 2*n; i+=2 ) { -#if !defined(CONJ) -#if !defined(XCONJ) +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; y[i] += a1[i]*x[2] - a1[i+1] * x[3]; @@ -67,29 +66,6 @@ 
static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; y[i] += a3[i]*x[6] + a3[i+1] * x[7]; y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; -#endif -#else -#if !defined(XCONJ) - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] -= a0[i]*x[1] - a0[i+1] * x[0]; - y[i] += a1[i]*x[2] + a1[i+1] * x[3]; - y[i+1] -= a1[i]*x[3] - a1[i+1] * x[2]; - y[i] += a2[i]*x[4] + a2[i+1] * x[5]; - y[i+1] -= a2[i]*x[5] - a2[i+1] * x[4]; - y[i] += a3[i]*x[6] + a3[i+1] * x[7]; - y[i+1] -= a3[i]*x[7] - a3[i+1] * x[6]; - -#else - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] -= a0[i]*x[1] + a0[i+1] * x[0]; - y[i] += a1[i]*x[2] - a1[i+1] * x[3]; - y[i+1] -= a1[i]*x[3] + a1[i+1] * x[2]; - y[i] += a2[i]*x[4] - a2[i+1] * x[5]; - y[i+1] -= a2[i]*x[5] + a2[i+1] * x[4]; - y[i] += a3[i]*x[6] - a3[i+1] * x[7]; - y[i+1] -= a3[i]*x[7] + a3[i+1] * x[6]; - -#endif #endif } } @@ -104,23 +80,12 @@ static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) for ( i=0; i< 2*n; i+=2 ) { -#if !defined(CONJ) -#if !defined(XCONJ) +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; #else y[i] += a0[i]*x[0] + a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; -#endif -#else -#if !defined(XCONJ) - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] -= a0[i]*x[1] - a0[i+1] * x[0]; - -#else - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] -= a0[i]*x[1] + a0[i+1] * x[0]; -#endif #endif } @@ -139,17 +104,24 @@ static void zero_y(BLASLONG n, FLOAT *dest) -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; + FLOAT temp_r; + FLOAT temp_i; for ( i=0; i Date: Tue, 12 Aug 2014 10:02:25 +0200 Subject: [PATCH 18/27] bugfix in zgemv_n_microk_haswell-2.c --- kernel/x86_64/zgemv_n_microk_haswell-2.c | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c index 833983fe0..bb40ec3ac 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -99,8 +99,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #else "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" - "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif From b06550519eb6a02d87f73a48b73fcef2fdedb9c9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 12 Aug 2014 12:15:41 +0200 Subject: [PATCH 19/27] added optimized cgemv_t c-kernel --- kernel/x86_64/cgemv_t.c | 269 +++++++++++++++++++++++++ kernel/x86_64/zgemv_n_microk_sandy-2.c | 161 +++++++++++++++ 2 files changed, 430 insertions(+) create mode 100644 kernel/x86_64/cgemv_t.c create mode 100644 kernel/x86_64/zgemv_n_microk_sandy-2.c diff --git a/kernel/x86_64/cgemv_t.c b/kernel/x86_64/cgemv_t.c new file mode 100644 index 000000000..ccdf13a57 --- /dev/null +++ b/kernel/x86_64/cgemv_t.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(BULLDOZER) +#include "zgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" +#endif +*/ + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_16x4 + +static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; + temp_r3 += 
a3[i]*x[i] - a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; +#endif + } + y[0] = temp_r0; + y[1] = temp_i0; + y[2] = temp_r1; + y[3] = temp_i1; + y[4] = temp_r2; + y[5] = temp_i2; + y[6] = temp_r3; + y[7] = temp_i3; +} + +#endif + +static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i += a0[i]*x[i+1] + a0[i+1]*x[i]; +#else + temp_r += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i += a0[i]*x[i+1] - a0[i+1]*x[i]; +#endif + } + *y = temp_r; + *(y+1) = temp_i; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Wed, 13 Aug 2014 12:18:03 +0200 Subject: [PATCH 20/27] bugfix in zgemv_n_microk_sandy-2.c --- kernel/x86_64/zgemv_n_microk_sandy-2.c | 35 ++++++++------------------ 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/kernel/x86_64/zgemv_n_microk_sandy-2.c b/kernel/x86_64/zgemv_n_microk_sandy-2.c index 8061ed4fa..f90e2210a 100644 --- a/kernel/x86_64/zgemv_n_microk_sandy-2.c +++ b/kernel/x86_64/zgemv_n_microk_sandy-2.c @@ -50,22 +50,13 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 
32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 - "vmulpd %%ymm8 , %%ymm0 , %%ymm10 \n\t" - "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" - "vmulpd %%ymm8 , %%ymm1 , %%ymm11 \n\t" - "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" - "vmulpd %%ymm9 , %%ymm0 , %%ymm10 \n\t" - "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" - "vmulpd %%ymm9 , %%ymm1 , %%ymm11 \n\t" - "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" + "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" + "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -103,6 +94,10 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + "prefetcht0 192(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" @@ -117,18 +112,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif - "prefetcht0 192(%3,%0,8) \n\t" - "vmovups (%3,%0,8), %%ymm12 \n\t" - "vmovups 32(%3,%0,8), %%ymm13 \n\t" - -#if !defined(XCONJ) - "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t" -#else - "vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t" - "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t" -#endif - + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t" From 11e34ddd1b45832606d5ef000d07519410f30676 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 12:54:18 +0200 Subject: [PATCH 21/27] bugfix for zgemv_n_microk_haswell-2.c --- kernel/x86_64/KERNEL.HASWELL | 4 +--- 
kernel/x86_64/zgemv_n.c | 11 ++++++++- kernel/x86_64/zgemv_n_microk_haswell-2.c | 30 +++++++----------------- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9a48289c5..2d54920cc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -3,10 +3,8 @@ SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c -ifndef OS_WINDOWS ZGEMVNKERNEL = zgemv_n.c -endif -ZGEMVTKERNEL = zgemv_t.c +#ZGEMVTKERNEL = zgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index 75e40eccb..7b8907044 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -25,7 +25,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - +#include +#include #include "common.h" #if defined(HASWELL) @@ -141,6 +142,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, BLASLONG n2; FLOAT xbuffer[8],*ybuffer; + +#if 0 +printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y); +#endif + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + ybuffer = buffer; inc_x *= 2; diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c index bb40ec3ac..e1c5838f7 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-2.c @@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd 
%%ymm15, %%ymm15, %%ymm15 \n\t" - "prefetcht0 192(%5,%0,8) \n\t" "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 - "vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r - "vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i - "vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r - "vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "prefetcht0 192(%6,%0,8) \n\t" "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 @@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + "prefetcht0 192(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" @@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif - "prefetcht0 192(%3,%0,8) \n\t" - "vmovups (%3,%0,8), %%ymm12 \n\t" - "vmovups 32(%3,%0,8), %%ymm13 \n\t" - -#if !defined(XCONJ) - "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t" -#else - 
"vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t" - "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t" -#endif - + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t" From 8c582d362d0f8a53c222dc4c9cbb7919cdb32116 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 13:42:22 +0200 Subject: [PATCH 22/27] optimized zgemv_t_microk_haswell-2.c --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/zgemv_t.c | 3 +++ kernel/x86_64/zgemv_t_microk_haswell-2.c | 28 +++++++++++------------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 2d54920cc..6d0792f16 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -4,7 +4,7 @@ DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c ZGEMVNKERNEL = zgemv_n.c -#ZGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c index df75afeff..bb3f90420 100644 --- a/kernel/x86_64/zgemv_t.c +++ b/kernel/x86_64/zgemv_t.c @@ -141,6 +141,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG n2; FLOAT ybuffer[8],*xbuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + inc_x *= 2; inc_y *= 2; lda *= 2; diff --git a/kernel/x86_64/zgemv_t_microk_haswell-2.c b/kernel/x86_64/zgemv_t_microk_haswell-2.c index 99a620e44..8325db5cf 100644 --- a/kernel/x86_64/zgemv_t_microk_haswell-2.c +++ b/kernel/x86_64/zgemv_t_microk_haswell-2.c @@ -49,23 +49,22 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 384(%2,%0,8) \n\t" + "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex 
values from a1 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 - "prefetcht0 384(%4,%0,8) \n\t" - "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 - "prefetcht0 384(%5,%0,8) \n\t" - "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 - "prefetcht0 384(%6,%0,8) \n\t" - "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 - "prefetcht0 384(%7,%0,8) \n\t" - "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 - "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 @@ -75,17 +74,16 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 - "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 - "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // 
imag values from x0 and x1 - - "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 - "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 From 2470129132df121aa922e4abe955c64a5d1385cb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 13:54:19 +0200 Subject: [PATCH 23/27] added fast return, if m or n < 1 --- kernel/x86_64/dgemv_n.c | 3 +++ kernel/x86_64/dgemv_t.c | 3 +++ kernel/x86_64/sgemv_n.c | 3 +++ kernel/x86_64/sgemv_t.c | 3 +++ kernel/x86_64/zgemv_t.c | 2 +- 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemv_n.c b/kernel/x86_64/dgemv_n.c index 5192ba193..5d826dc63 100644 --- a/kernel/x86_64/dgemv_n.c +++ b/kernel/x86_64/dgemv_n.c @@ -125,6 +125,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; FLOAT xbuffer[4],*ybuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + ybuffer = buffer; n1 = n / 4 ; diff --git a/kernel/x86_64/dgemv_t.c b/kernel/x86_64/dgemv_t.c index 76aacd349..0fa8378fe 100644 --- a/kernel/x86_64/dgemv_t.c +++ b/kernel/x86_64/dgemv_t.c @@ -104,6 +104,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; FLOAT ybuffer[4],*xbuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + xbuffer = buffer; n1 = n / 4 ; diff --git a/kernel/x86_64/sgemv_n.c b/kernel/x86_64/sgemv_n.c index f2de1b76a..faa8e1f8c 100644 --- a/kernel/x86_64/sgemv_n.c +++ b/kernel/x86_64/sgemv_n.c @@ -131,6 +131,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, 
FLOAT *a, BLASLO BLASLONG n2; FLOAT xbuffer[4],*ybuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + ybuffer = buffer; n1 = n / 4 ; diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c index adfaa9925..532afee5d 100644 --- a/kernel/x86_64/sgemv_t.c +++ b/kernel/x86_64/sgemv_t.c @@ -110,6 +110,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; FLOAT ybuffer[4],*xbuffer; + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + xbuffer = buffer; n1 = n / 4 ; diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c index bb3f90420..9f5444a72 100644 --- a/kernel/x86_64/zgemv_t.c +++ b/kernel/x86_64/zgemv_t.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #include "zgemv_t_microk_bulldozer-2.c" #elif defined(HASWELL) #include "zgemv_t_microk_haswell-2.c" From dc0593731365c17a694d297afd5ba8bf1bfaf0cd Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 14:54:50 +0200 Subject: [PATCH 24/27] added additional test values --- lapack-netlib/TESTING/dstest.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/dstest.in b/lapack-netlib/TESTING/dstest.in index 6ec68d13a..4a31076a6 100644 --- a/lapack-netlib/TESTING/dstest.in +++ b/lapack-netlib/TESTING/dstest.in @@ -1,8 +1,8 @@ Data file for testing DSGESV/DSPOSV LAPACK routines 12 Number of values of M 0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension) -4 Number of values of NRHS -1 2 14 16 Values of NRHS (number of right hand sides) +6 Number of values of NRHS +1 2 14 15 16 13 Values of NRHS (number of right hand sides) 30.0 Threshold value of test ratio T Put T to test the driver routine T Put T to test the error exits From c1a6374c6fe7df294aeca2c550bc58d61acfa654 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Aug 2014 16:10:03 +0200 Subject: [PATCH 
25/27] optimized zgemv_n kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 3 +++ kernel/x86_64/zgemv_n.c | 3 +++ kernel/x86_64/zgemv_n_microk_sandy-2.c | 13 ++++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index d4fbca7f2..b654d3564 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,6 +1,9 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c +ZGEMVNKERNEL = zgemv_n.c + + SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c index 7b8907044..9098368a5 100644 --- a/kernel/x86_64/zgemv_n.c +++ b/kernel/x86_64/zgemv_n.c @@ -31,9 +31,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "zgemv_n_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "zgemv_n_microk_sandy-2.c" #endif + #define NBMAX 1024 #ifndef HAVE_KERNEL_16x4 diff --git a/kernel/x86_64/zgemv_n_microk_sandy-2.c b/kernel/x86_64/zgemv_n_microk_sandy-2.c index f90e2210a..352c60f87 100644 --- a/kernel/x86_64/zgemv_n_microk_sandy-2.c +++ b/kernel/x86_64/zgemv_n_microk_sandy-2.c @@ -50,39 +50,42 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" + "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" + "prefetcht0 256(%5,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" - "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" - "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t" "vaddpd %%ymm12, 
%%ymm10, %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t" "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" + "prefetcht0 256(%6,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t" "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" - "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm4 , %%ymm10 \n\t" "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm5 , %%ymm11 \n\t" "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" + "prefetcht0 256(%7,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm4 , %%ymm10 \n\t" "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmulpd %%ymm9 , %%ymm5 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" - "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%7,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t" @@ -94,7 +97,7 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" - "prefetcht0 192(%3,%0,8) \n\t" + "prefetcht0 256(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" From 4568d32b6bb1ad27882268b8866ef35def75605e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 14 Aug 2014 14:10:29 +0200 Subject: [PATCH 26/27] added optimized cgemv_t kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/cgemv_t.c | 18 +-- kernel/x86_64/cgemv_t_microk_haswell-2.c | 171 +++++++++++++++++++++++ 3 files changed, 180 insertions(+), 11 deletions(-) create mode 100644 kernel/x86_64/cgemv_t_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 6d0792f16..e07448abb 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL 
@@ -6,6 +6,8 @@ DGEMVTKERNEL = dgemv_t.c ZGEMVNKERNEL = zgemv_n.c ZGEMVTKERNEL = zgemv_t.c +CGEMVTKERNEL = cgemv_t.c + SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c diff --git a/kernel/x86_64/cgemv_t.c b/kernel/x86_64/cgemv_t.c index ccdf13a57..e40fd349e 100644 --- a/kernel/x86_64/cgemv_t.c +++ b/kernel/x86_64/cgemv_t.c @@ -28,19 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -/* -#if defined(BULLDOZER) -#include "zgemv_t_microk_bulldozer-2.c" -#elif defined(HASWELL) -#include "zgemv_t_microk_haswell-2.c" +#if defined(HASWELL) +#include "cgemv_t_microk_haswell-2.c" #endif -*/ #define NBMAX 2048 #ifndef HAVE_KERNEL_16x4 -static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; @@ -92,7 +88,7 @@ static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif -static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; @@ -113,7 +109,7 @@ static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) *y = temp_r; *(y+1) = temp_i; } - + static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; @@ -176,7 +172,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; - zgemv_kernel_16x4(NB,ap,xbuffer,ybuffer); + cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer); a_ptr += 4 * lda; #if !defined(XCONJ) @@ -210,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for( i = 0; i < n2 ; i++) { - zgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer); + cgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer); a_ptr += 1 * lda; #if !defined(XCONJ) diff --git 
a/kernel/x86_64/cgemv_t_microk_haswell-2.c b/kernel/x86_64/cgemv_t_microk_haswell-2.c new file mode 100644 index 000000000..0d79714af --- /dev/null +++ b/kernel/x86_64/cgemv_t_microk_haswell-2.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); +/* AVX/FMA3 kernel for the transposed single-precision complex GEMV: for each of the four columns ap[0..3] it sums the complex products of n column elements with x (conjugation variant selected by CONJ/XCONJ) and stores the four complex results to y[0..7]; y is overwritten, not accumulated. The loop consumes 8 complex elements per pass and exits only when the counter reaches exactly zero, so n must be a positive multiple of 8. i and n are read-write asm operands because the loop updates both registers. */ +static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "prefetcht0 192(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "prefetcht0 192(%2,%0,4) \n\t" + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imag parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "prefetcht0 192(%6,%0,4) \n\t" + "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 + "prefetcht0 192(%7,%0,4) \n\t" + "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 ,
%%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imag parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 + "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $16 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 ,
%%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" + "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" + "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t" + "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vmovsd %%xmm8 , (%3) \n\t" + "vmovsd %%xmm10, 8(%3) \n\t" + "vmovsd %%xmm12, 16(%3) \n\t" + "vmovsd %%xmm14, 24(%3) \n\t" + + "vzeroupper \n\t" + + : + "+r" (i), // 0 read-write: advanced by 16 floats per pass + "+r" (n) // 1 read-write: counted down by 8 per pass + : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 11eab4c0199f85ece37453f351ffee6450bf8c7c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 14 Aug 2014 19:00:30 +0200 Subject: [PATCH 27/27] added optimized cgemv_n for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/cgemv_n.c | 255 +++++++++++++++++++++++ kernel/x86_64/cgemv_n_microk_haswell-2.c | 137 ++++++++++++ 3 files changed, 394 insertions(+) create mode 100644
kernel/x86_64/cgemv_n.c create mode 100644 kernel/x86_64/cgemv_n_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index e07448abb..d0ac9c72f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,11 +1,13 @@ SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c + DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t.c ZGEMVNKERNEL = zgemv_n.c ZGEMVTKERNEL = zgemv_t.c +CGEMVNKERNEL = cgemv_n.c CGEMVTKERNEL = cgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S diff --git a/kernel/x86_64/cgemv_n.c b/kernel/x86_64/cgemv_n.c new file mode 100644 index 000000000..47ef0d447 --- /dev/null +++ b/kernel/x86_64/cgemv_n.c @@ -0,0 +1,255 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(HASWELL) +#include "cgemv_n_microk_haswell-2.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_16x4 + +static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; +#endif + } +} + +#endif + +static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * 
x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif + + } +} + + +static void zero_y(BLASLONG n, FLOAT *dest) +{ + BLASLONG i; + for ( i=0; i<2*n; i++ ) + { + *dest = 0.0; + dest++; + } +} + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i