From 493d4fe7e5bfdc5a237df2c6c1fe489904d5bf7d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 16 Aug 2014 11:36:48 +0200 Subject: [PATCH 01/32] added reference in C for symv_L --- kernel/arm/symv_L.c | 70 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 kernel/arm/symv_L.c diff --git a/kernel/arm/symv_L.c b/kernel/arm/symv_L.c new file mode 100644 index 000000000..8f48d03f5 --- /dev/null +++ b/kernel/arm/symv_L.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if ( m != offset ) + printf("Symv_L: m=%d offset=%d\n",m,offset); +#endif + + jx = 0; + jy = 0; + + for (j=0; j Date: Sat, 16 Aug 2014 13:52:50 +0200 Subject: [PATCH 02/32] add reference in C for symv_U --- kernel/arm/symv_U.c | 71 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 kernel/arm/symv_U.c diff --git a/kernel/arm/symv_U.c b/kernel/arm/symv_U.c new file mode 100644 index 000000000..b5a0c96e9 --- /dev/null +++ b/kernel/arm/symv_U.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if( m != offset ) + printf("Symv_U: m=%d offset=%d\n",m,offset); +#endif + + BLASLONG m1 = m - offset; + + jx = m1 * inc_x; + jy = m1 * inc_y; + + for (j=m1; j Date: Mon, 18 Aug 2014 12:18:10 +0200 Subject: [PATCH 03/32] added optimized dsymv_U kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/dsymv_U.c | 208 +++++++++++++++++++++ kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 117 ++++++++++++ 3 files changed, 327 insertions(+) create mode 100644 kernel/x86_64/dsymv_U.c create mode 100644 kernel/x86_64/dsymv_U_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 19bf7fd32..03925cc19 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,3 +1,5 @@ +DSYMV_U_KERNEL = dsymv_U.c + SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c new file mode 100644 index 000000000..1f22abe8d --- /dev/null +++ b/kernel/x86_64/dsymv_U.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) +#include "dsymv_U_microk_bulldozer-2.c" +#endif + + +#ifndef HAVE_KERNEL_8x2 + +static void dsymv_kernel_8x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) +{ + FLOAT at0,at1,at2,at3; + FLOAT tmp2[2] = { 0.0, 0.0 }; + FLOAT tp0; + FLOAT tp1; + BLASLONG i; + + tp0 = temp1[0]; + tp1 = temp1[1]; + + for (i=0; i Date: Mon, 18 Aug 2014 13:52:24 +0200 Subject: [PATCH 04/32] added optimized ssymv_U kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/ssymv_U.c | 209 +++++++++++++++++++++ kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 119 ++++++++++++ 3 files changed, 329 insertions(+) create mode 100644 kernel/x86_64/ssymv_U.c create mode 100644 kernel/x86_64/ssymv_U_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 03925cc19..a078528cd 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,4 +1,5 @@ DSYMV_U_KERNEL = dsymv_U.c +SSYMV_U_KERNEL = ssymv_U.c SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c new file mode 100644 index 000000000..75b8e2c3e --- /dev/null +++ b/kernel/x86_64/ssymv_U.c @@ -0,0 +1,209 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) +#include "ssymv_U_microk_bulldozer-2.c" +#endif + + +#ifndef HAVE_KERNEL_16x2 + +static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) +{ + FLOAT at0,at1,at2,at3; + FLOAT tmp2[2] = { 0.0, 0.0 }; + FLOAT tp0; + FLOAT tp1; + BLASLONG i; + + tp0 = temp1[0]; + tp1 = temp1[1]; + + for (i=0; i Date: Tue, 19 Aug 2014 17:09:45 +0200 Subject: [PATCH 05/32] added optimized ssymv_U kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 3 + kernel/x86_64/ssymv_U.c | 134 +++++++++++++++++------ kernel/x86_64/ssymv_U_microk_nehalem-2.c | 130 ++++++++++++++++++++++ 3 files changed, 232 insertions(+), 35 deletions(-) create mode 100644 kernel/x86_64/ssymv_U_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index ca9ff252d..353514449 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ 
-1,3 +1,6 @@ +#DSYMV_U_KERNEL = dsymv_U.c +SSYMV_U_KERNEL = ssymv_U.c + SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 75b8e2c3e..61127aa3d 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -31,41 +31,94 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "ssymv_U_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "ssymv_U_microk_nehalem-2.c" #endif +#ifndef HAVE_KERNEL_4x4 -#ifndef HAVE_KERNEL_16x2 - -static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) { FLOAT at0,at1,at2,at3; - FLOAT tmp2[2] = { 0.0, 0.0 }; + FLOAT x; + FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; FLOAT tp0; FLOAT tp1; + FLOAT tp2; + FLOAT tp3; BLASLONG i; tp0 = temp1[0]; tp1 = temp1[1]; + tp2 = temp1[2]; + tp3 = temp1[3]; - for (i=0; i Date: Tue, 19 Aug 2014 19:25:03 +0200 Subject: [PATCH 06/32] updated optimized ssymv_U for bulldozer --- kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 97 ++++++++++------------ 1 file changed, 46 insertions(+), 51 deletions(-) diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index b0b0bed65..b8b3b73e9 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ -25,10 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#define HAVE_KERNEL_16x2 1 -static void ssymv_kernel_16x2( BLASLONG n, FLOAT *a1, FLOAT *a2, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); -static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) { BLASLONG register i = 0; @@ -37,64 +37,57 @@ static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *x, FLOAT ( "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] - "vbroadcastss (%6), %%xmm2 \n\t" // temp1[0] - "vbroadcastss 4(%6), %%xmm3 \n\t" // temp1[1] + "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] + "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[2] + "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3] "xorq %0,%0 \n\t" - ".align 16 \n\t" - ".L01LOOP%=: \n\t" + ".align 16 \n\t" + ".L01LOOP%=: \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vmovups (%4,%0,4), %%xmm4 \n\t" // 2 * a0 - "vmovups 16(%4,%0,4), %%xmm5 \n\t" // 2 * a0 - "prefetcht0 192(%2,%0,4) \n\t" - "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x - "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x - "prefetcht0 192(%3,%0,4) \n\t" - "vmovups 32(%4,%0,4), %%xmm6 \n\t" // 2 * a0 - "vmovups 48(%4,%0,4), %%xmm7 \n\t" // 2 * a0 - "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x - "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x + "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y - "prefetcht0 192(%5,%0,4) \n\t" - 
"vfmaddps (%3,%0,4), %%xmm2 , %%xmm4 , %%xmm12 \n\t" // y += temp1 * a0 - "vfmaddps %%xmm0 , %%xmm8 , %%xmm4 , %%xmm0 \n\t" // temp2 += a0 * x - "vfmaddps 16(%3,%0,4), %%xmm2 , %%xmm5 , %%xmm13 \n\t" // y += temp1 * a0 - "vmovups (%5,%0,4), %%xmm4 \n\t" // 2 * a1 - "vfmaddps %%xmm0 , %%xmm9 , %%xmm5 , %%xmm0 \n\t" // temp2 += a0 * x - "vfmaddps 32(%3,%0,4), %%xmm2 , %%xmm6 , %%xmm14 \n\t" // y += temp1 * a0 - "vmovups 16(%5,%0,4), %%xmm5 \n\t" // 2 * a1 - "vfmaddps %%xmm0 , %%xmm10, %%xmm6 , %%xmm0 \n\t" // temp2 += a0 * x - "vfmaddps 48(%3,%0,4), %%xmm2 , %%xmm7 , %%xmm15 \n\t" // y += temp1 * a0 - "vmovups 32(%5,%0,4), %%xmm6 \n\t" // 2 * a1 - "vfmaddps %%xmm0 , %%xmm11, %%xmm7 , %%xmm0 \n\t" // temp2 += a0 * x - "vmovups 48(%5,%0,4), %%xmm7 \n\t" // 2 * a1 + "vmovups (%4,%0,4), %%xmm12 \n\t" // 4 * a + "vmovups (%5,%0,4), %%xmm13 \n\t" // 4 * a - "vfmaddps %%xmm12, %%xmm3 , %%xmm4 , %%xmm12 \n\t" // y += temp1 * a1 - "vfmaddps %%xmm13, %%xmm3 , %%xmm5 , %%xmm13 \n\t" // y += temp1 * a1 - "vmovups %%xmm12, (%3,%0,4) \n\t" // 2 * y - "vfmaddps %%xmm14, %%xmm3 , %%xmm6 , %%xmm14 \n\t" // y += temp1 * a1 - "vmovups %%xmm13, 16(%3,%0,4) \n\t" // 2 * y - "vfmaddps %%xmm15, %%xmm3 , %%xmm7 , %%xmm15 \n\t" // y += temp1 * a1 - "vmovups %%xmm14, 32(%3,%0,4) \n\t" // 2 * y + "vfmaddps %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + "vfmaddps %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a - "vfmaddps %%xmm1 , %%xmm8 , %%xmm4 , %%xmm1 \n\t" // temp2 += a1 * x - "vfmaddps %%xmm1 , %%xmm9 , %%xmm5 , %%xmm1 \n\t" // temp2 += a1 * x - "vmovups %%xmm15, 48(%3,%0,4) \n\t" // 2 * y - "vfmaddps %%xmm1 , %%xmm10, %%xmm6 , %%xmm1 \n\t" // temp2 += a1 * x - "vfmaddps %%xmm1 , %%xmm11, %%xmm7 , %%xmm1 \n\t" // temp2 += a1 * x + "vfmaddps %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + "vmovups (%6,%0,4), %%xmm14 \n\t" // 4 * a + "vfmaddps %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" + "vfmaddps 
%%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + "vmovups (%7,%0,4), %%xmm15 \n\t" // 4 * a + "vfmaddps %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a + + "vfmaddps %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + "vfmaddps %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a + + "vmovups %%xmm9 , (%3,%0,4) \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" "jnz .L01LOOP%= \n\t" "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" - "vmovss %%xmm0 , (%7) \n\t" // save temp2 - "vmovss %%xmm1 ,4(%7) \n\t" // save temp2 + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : : @@ -102,10 +95,12 @@ static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *x, FLOAT "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 - "r" (a0), // 4 - "r" (a1), // 5 - "r" (temp1), // 6 - "r" (temp2) // 7 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From ef6374196d0fbb69d0720c973abad9ef39a89253 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 20 Aug 2014 09:00:56 +0200 Subject: [PATCH 07/32] updated optimized dsymv_U kernel for bulldozer --- kernel/x86_64/dsymv_U.c | 135 +++++++++++++++------ kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 115 ++++++++++-------- 2 files changed, 164 insertions(+), 86 deletions(-) diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 1f22abe8d..267755c2f 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -28,43 +28,97 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF 
THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" + #if defined(BULLDOZER) #include "dsymv_U_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "dsymv_U_microk_nehalem-2.c" #endif +#ifndef HAVE_KERNEL_4x4 -#ifndef HAVE_KERNEL_8x2 - -static void dsymv_kernel_8x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) { FLOAT at0,at1,at2,at3; - FLOAT tmp2[2] = { 0.0, 0.0 }; + FLOAT x; + FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; FLOAT tp0; FLOAT tp1; + FLOAT tp2; + FLOAT tp3; BLASLONG i; tp0 = temp1[0]; tp1 = temp1[1]; + tp2 = temp1[2]; + tp3 = temp1[3]; - for (i=0; i Date: Wed, 20 Aug 2014 09:58:04 +0200 Subject: [PATCH 08/32] added optimized dsymv_U kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/dsymv_U_microk_nehalem-2.c | 125 +++++++++++++++++++++++ 2 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dsymv_U_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 353514449..35be648a4 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,4 +1,4 @@ -#DSYMV_U_KERNEL = dsymv_U.c +DSYMV_U_KERNEL = dsymv_U.c SSYMV_U_KERNEL = ssymv_U.c SGEMVNKERNEL = sgemv_n.c diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c new file mode 100644 index 000000000..6aab57500 --- /dev/null +++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -0,0 +1,125 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); +/* Nehalem micro-kernel for four columns a0..a3, two doubles per loop pass: y[i] is incremented by temp1[k]*ak[i] summed over k = 0..3, and temp2[k] is overwritten with the sum over i of ak[i]*x[i]. The loop exits only when n reaches exactly 0, so n must be a positive multiple of 2. */ +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ +/* i is the element index used to address x, y and a0..a3 inside the asm; the asm itself resets it to 0. */ + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0] + "xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1] + "xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2] + "xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3] + "movsd (%8), %%xmm4 \n\t" // temp1[0] + "movsd 8(%8), %%xmm5 \n\t" // temp1[1] + "movsd 16(%8), %%xmm6 \n\t" // temp1[2] + "movsd 24(%8), %%xmm7 \n\t" // temp1[3] + "shufpd $0, %%xmm4, %%xmm4 \n\t" + "shufpd $0, %%xmm5, %%xmm5 \n\t" + "shufpd $0, %%xmm6, %%xmm6 \n\t" + "shufpd $0, %%xmm7, %%xmm7 \n\t" + + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a + "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x + "movups %%xmm12 , %%xmm11 \n\t" + "movups (%3,%0,8), %%xmm9 \n\t" // 2 * y + "movups (%5,%0,8), %%xmm13 \n\t" // 2 * a + + "mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm12 \n\t" // a * x + "addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + + "movups (%6,%0,8), %%xmm14 \n\t" // 2 * a + "movups (%7,%0,8), %%xmm15 \n\t" // 2 * a + + "movups %%xmm13 , %%xmm11 \n\t" + "mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm13 \n\t" // a * x + "addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + + "movups %%xmm14 , %%xmm11 \n\t" + "mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm14 \n\t" // a * x + "addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + + "addq $2 , %0 \n\t" + "movups %%xmm15 , %%xmm11 \n\t" + 
"mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm15 \n\t" // a * x + "addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + + "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y + + "subq $2 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "haddpd %%xmm0, %%xmm0 \n\t" + "haddpd %%xmm1, %%xmm1 \n\t" + "haddpd %%xmm2, %%xmm2 \n\t" + "haddpd %%xmm3, %%xmm3 \n\t" + + "movsd %%xmm0 , (%9) \n\t" // save temp2 + "movsd %%xmm1 , 8(%9) \n\t" // save temp2 + "movsd %%xmm2 , 16(%9) \n\t" // save temp2 + "movsd %%xmm3 , 24(%9) \n\t" // save temp2 + /* i (xorq/addq) and n (subq) are written by the asm, so they must be read-write operands; declaring them input-only lets the compiler assume their registers are unchanged. */ + : + "+r" (i), // 0 + "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From f6f9122660ee8be175b61dd91abc1a86e9bb4a7e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Aug 2014 13:02:53 +0200 Subject: [PATCH 09/32] added optimized dsymv_L kernel for bulldozer --- kernel/x86_64/dsymv_L.c | 299 +++++++++++++++++++++ kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 137 ++++++++++ 2 files changed, 436 insertions(+) create mode 100644 kernel/x86_64/dsymv_L.c create mode 100644 kernel/x86_64/dsymv_L_microk_bulldozer-2.c diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c new file mode 100644 index 000000000..29e3aa2f8 --- /dev/null +++ b/kernel/x86_64/dsymv_L.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) +#include "dsymv_L_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "dsymv_U_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2) +{ + FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG i; + + for (i=from; i=12 ) + { + BLASLONG m2 = (m/4)*4; + for (i=j+1; i j+4 ) + dsymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2); + + + for (i=m2; i=8 ) + { + BLASLONG j1 = ((from + 4)/4)*4; + BLASLONG j2 = (m/4)*4; + for (i=from; i Date: Thu, 21 Aug 2014 13:32:06 +0200 Subject: [PATCH 10/32] added optimized ssymv_L kernel for bulldozer --- kernel/x86_64/ssymv_L.c | 299 +++++++++++++++++++++ kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 122 +++++++++ 2 files changed, 421 insertions(+) create mode 100644 kernel/x86_64/ssymv_L.c create mode 100644 kernel/x86_64/ssymv_L_microk_bulldozer-2.c diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c new file mode 100644 index 000000000..352f1c862 --- /dev/null +++ b/kernel/x86_64/ssymv_L.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) +#include "ssymv_L_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "ssymv_U_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2) +{ + FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG i; + + for (i=from; i=12 ) + { + BLASLONG m2 = (m/4)*4; + for (i=j+1; i j+4 ) + ssymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2); + + + for (i=m2; i=8 ) + { + BLASLONG j1 = ((from + 4)/4)*4; + BLASLONG j2 = (m/4)*4; + for (i=from; i Date: Thu, 21 Aug 2014 14:27:00 +0200 Subject: [PATCH 11/32] added optimized symv_L kernels for nehalem --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/KERNEL.NEHALEM | 2 + kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_L_microk_nehalem-2.c | 132 ++++++++++++++++++++++ kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_L_microk_nehalem-2.c | 137 
+++++++++++++++++++++++ 6 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/dsymv_L_microk_nehalem-2.c create mode 100644 kernel/x86_64/ssymv_L_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index a078528cd..3ee1978b8 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,5 +1,7 @@ DSYMV_U_KERNEL = dsymv_U.c +DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c +SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 35be648a4..b16fd9c49 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,5 +1,7 @@ DSYMV_U_KERNEL = dsymv_U.c +DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c +SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 29e3aa2f8..8d1337746 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "dsymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) -#include "dsymv_U_microk_nehalem-2.c" +#include "dsymv_L_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c new file mode 100644 index 000000000..3ba596c5e --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + // For i in [from,to), two doubles per pass: y[i] += sum_k temp1[k]*a[k][i] and temp2[k] += sum_i a[k][i]*x[i] (k = 0..3). + // The loop exits only when the index equals 'to', so (to - from) must be a positive multiple of 2; 'from' is advanced in place, hence the "+r" constraint. + __asm__ __volatile__ + ( + "xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0] + "xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1] + "xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2] + "xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3] + "movsd (%8), %%xmm4 \n\t" // temp1[0] + "movsd 8(%8), %%xmm5 \n\t" // temp1[1] + "movsd 16(%8), %%xmm6 \n\t" // temp1[2] + "movsd 24(%8), %%xmm7 \n\t" // temp1[3] + "shufpd $0, %%xmm4, %%xmm4 \n\t" + "shufpd $0, %%xmm5, %%xmm5 \n\t" + "shufpd $0, %%xmm6, %%xmm6 \n\t" + "shufpd $0, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a + "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x + "movups %%xmm12 , %%xmm11 \n\t" + "movups (%3,%0,8), %%xmm9 \n\t" // 2 * y + "movups (%5,%0,8), %%xmm13 \n\t" // 2 * a + + "mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm12 \n\t" // a * x + "addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + + "movups (%6,%0,8), %%xmm14 \n\t" // 2 * a + "movups (%7,%0,8), %%xmm15 \n\t" // 2 * a + + "movups %%xmm13 , %%xmm11 \n\t" + "mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm13 \n\t" // a * x + "addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + + "movups %%xmm14 , %%xmm11 \n\t" + "mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm14 \n\t" // a * x + "addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + + "addq $2 , %0 \n\t" + "movups %%xmm15 , %%xmm11 \n\t" + "mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += 
temp1 * a + "mulpd %%xmm8 , %%xmm15 \n\t" // a * x + "addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + + "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y + + "cmpq %0 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "movsd (%9), %%xmm4 \n\t" // temp2[0] + "movsd 8(%9), %%xmm5 \n\t" // temp2[1] + "movsd 16(%9), %%xmm6 \n\t" // temp2[2] + "movsd 24(%9), %%xmm7 \n\t" // temp2[3] + + "haddpd %%xmm0, %%xmm0 \n\t" + "haddpd %%xmm1, %%xmm1 \n\t" + "haddpd %%xmm2, %%xmm2 \n\t" + "haddpd %%xmm3, %%xmm3 \n\t" + + "addsd %%xmm4, %%xmm0 \n\t" + "addsd %%xmm5, %%xmm1 \n\t" + "addsd %%xmm6, %%xmm2 \n\t" + "addsd %%xmm7, %%xmm3 \n\t" + + "movsd %%xmm0 , (%9) \n\t" // save temp2 + "movsd %%xmm1 , 8(%9) \n\t" // save temp2 + "movsd %%xmm2 , 16(%9) \n\t" // save temp2 + "movsd %%xmm3 , 24(%9) \n\t" // save temp2 + + : + "+r" (from) // 0 (read-write: loop index) + : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 352f1c862..096adc6ca 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) -#include "ssymv_U_microk_nehalem-2.c" +#include "ssymv_L_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c new file mode 100644 index 000000000..a1c62caf6 --- /dev/null +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + // For i in [from,to), four floats per pass: y[i] += sum_k temp1[k]*a[k][i] and temp2[k] += sum_i a[k][i]*x[i] (k = 0..3). + // The loop exits only when the index equals 'to', so (to - from) must be a positive multiple of 4; 'from' is advanced in place, hence the "+r" constraint. + __asm__ __volatile__ + ( + "xorps %%xmm0 , %%xmm0 \n\t" // temp2[0] + "xorps %%xmm1 , %%xmm1 \n\t" // temp2[1] + "xorps %%xmm2 , %%xmm2 \n\t" // temp2[2] + "xorps %%xmm3 , %%xmm3 \n\t" // temp2[3] + "movss (%8), %%xmm4 \n\t" // temp1[0] + "movss 4(%8), %%xmm5 \n\t" // temp1[1] + "movss 8(%8), %%xmm6 \n\t" // temp1[2] + "movss 12(%8), %%xmm7 \n\t" // temp1[3] + "shufps $0, %%xmm4, %%xmm4 \n\t" + "shufps $0, %%xmm5, %%xmm5 \n\t" + "shufps $0, %%xmm6, %%xmm6 \n\t" + "shufps $0, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x + "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y + + "movups (%4,%0,4), %%xmm12 \n\t" // 4 * a + "movups (%5,%0,4), %%xmm13 \n\t" // 4 * a + + "movups %%xmm12 , %%xmm11 \n\t" + "mulps %%xmm4 , %%xmm11 \n\t" // temp1 * a + "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulps %%xmm8 , %%xmm12 \n\t" // a * x + "addps %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + + "movups (%6,%0,4), %%xmm14 \n\t" // 4 * a + "movups (%7,%0,4), %%xmm15 \n\t" // 4 * a + + "movups %%xmm13 , %%xmm11 \n\t" + "mulps %%xmm5 , %%xmm11 \n\t" // temp1 * a + "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulps %%xmm8 , %%xmm13 \n\t" // a * x + "addps %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + + "movups %%xmm14 , %%xmm11 \n\t" + "mulps %%xmm6 , %%xmm11 \n\t" // temp1 * a + "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulps %%xmm8 , %%xmm14 \n\t" // a * x + "addps %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + + "movups %%xmm15 , %%xmm11 \n\t" + "mulps %%xmm7 , %%xmm11 \n\t" // temp1 * a + "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulps 
%%xmm8 , %%xmm15 \n\t" // a * x + "addps %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + + "movups %%xmm9, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "cmpq %0 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "movss (%9), %%xmm4 \n\t" // temp2[0] + "movss 4(%9), %%xmm5 \n\t" // temp2[1] + "movss 8(%9), %%xmm6 \n\t" // temp2[2] + "movss 12(%9), %%xmm7 \n\t" // temp2[3] + + "haddps %%xmm0, %%xmm0 \n\t" + "haddps %%xmm1, %%xmm1 \n\t" + "haddps %%xmm2, %%xmm2 \n\t" + "haddps %%xmm3, %%xmm3 \n\t" + "haddps %%xmm0, %%xmm0 \n\t" + "haddps %%xmm1, %%xmm1 \n\t" + "haddps %%xmm2, %%xmm2 \n\t" + "haddps %%xmm3, %%xmm3 \n\t" + + "addss %%xmm4, %%xmm0 \n\t" + "addss %%xmm5, %%xmm1 \n\t" + "addss %%xmm6, %%xmm2 \n\t" + "addss %%xmm7, %%xmm3 \n\t" + + "movss %%xmm0 , (%9) \n\t" // save temp2 + "movss %%xmm1 , 4(%9) \n\t" // save temp2 + "movss %%xmm2 , 8(%9) \n\t" // save temp2 + "movss %%xmm3 , 12(%9) \n\t" // save temp2 + + : + "+r" (from) // 0 (read-write: loop index) + : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 6f73ffc114a6d56b85c3023f6e8f7856e4198aec Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Aug 2014 19:33:57 +0200 Subject: [PATCH 12/32] added benchmarks for csymv and zsymv --- benchmark/Makefile | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index de94dcc59..3b2d263c8 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -35,7 +35,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ - ssymv.goto dsymv.goto \ + ssymv.goto dsymv.goto 
csymv.goto zsymv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -53,7 +53,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ - ssymv.acml dsymv.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ @@ -71,7 +71,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas \ - ssymv.atlas dsymv.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -90,7 +90,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ - ssymv.mkl dsymv.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -732,6 +732,32 @@ dsymv.atlas : dsymv.$(SUFFIX) dsymv.mkl : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Csymv #################################################### +csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +csymv.acml : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.atlas : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.mkl : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsymv #################################################### +zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) 
-o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zsymv.acml : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.atlas : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.mkl : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1037,6 +1063,12 @@ ssymv.$(SUFFIX) : symv.c dsymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +csymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sgeev.$(SUFFIX) : geev.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ From fb0b4552a5dd5e83434f159aaf4d44b8a0377b49 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 10:00:09 +0200 Subject: [PATCH 13/32] added hemv benchmark --- benchmark/Makefile | 38 +++++++++ benchmark/hemv.c | 208 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 benchmark/hemv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 3b2d263c8..6c364ddfe 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -36,6 +36,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ + chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -54,6 +55,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ 
cher2k.acml zher2k.acml \ @@ -72,6 +74,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -91,6 +94,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -922,6 +926,33 @@ zpotrf.atlas : zpotrf.$(SUFFIX) zpotrf.mkl : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chemv #################################################### + +chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +chemv.acml : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.atlas : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.mkl : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemv #################################################### + +zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zhemv.acml : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.atlas : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.mkl : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ################################################################################################### @@ -1105,6 +1136,13 @@ 
cpotrf.$(SUFFIX) : potrf.c zpotrf.$(SUFFIX) : potrf.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +chemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + diff --git a/benchmark/hemv.c b/benchmark/hemv.c new file mode 100644 index 000000000..79b7679cc --- /dev/null +++ b/benchmark/hemv.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef HEMV + + +#ifdef DOUBLE +#define HEMV BLASFUNC(zhemv) +#else +#define HEMV BLASFUNC(chemv) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + 
shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l Date: Fri, 22 Aug 2014 11:42:07 +0200 Subject: [PATCH 14/32] added sdot and ddot benchmarks --- benchmark/Makefile | 37 +++++++++ benchmark/dot.c | 195 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 benchmark/dot.c diff --git 
a/benchmark/Makefile b/benchmark/Makefile index 6c364ddfe..1bc631aa4 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -35,6 +35,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ + sdot.goto ddot.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ @@ -54,6 +55,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ + sdot.acml ddot.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ @@ -73,6 +75,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas \ + sdot.atlas ddot.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ @@ -93,6 +96,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ + sdot.mkl ddot.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ @@ -954,6 +958,33 @@ zhemv.atlas : zhemv.$(SUFFIX) zhemv.mkl : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sdot.acml : sdot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.atlas : sdot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.mkl : 
sdot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +ddot.acml : ddot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.atlas : ddot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.mkl : ddot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### slinpack.$(SUFFIX) : linpack.c @@ -1142,6 +1173,12 @@ chemv.$(SUFFIX) : hemv.c zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + diff --git a/benchmark/dot.c b/benchmark/dot.c new file mode 100644 index 000000000..6132ed324 --- /dev/null +++ b/benchmark/dot.c @@ -0,0 +1,195 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define DOT BLASFUNC(ddot) +#else +#define DOT BLASFUNC(sdot) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ 
+ int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Fri, 22 Aug 2014 11:51:30 +0200 Subject: [PATCH 15/32] bugfix in Makefile --- benchmark/Makefile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 1bc631aa4..679a98c32 100644 --- 
a/benchmark/Makefile +++ b/benchmark/Makefile @@ -960,29 +960,29 @@ zhemv.mkl : zhemv.$(SUFFIX) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm sdot.acml : sdot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sdot.atlas : sdot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sdot.mkl : sdot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ddot #################################################### ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm ddot.acml : ddot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ddot.atlas : ddot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ddot.mkl : ddot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ################################################################################################### @@ -1174,10 +1174,10 @@ zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + 
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ From 8a9e868919c50c407057f1e79b5a4c94a24e54e8 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 14:29:17 +0200 Subject: [PATCH 16/32] added optimized sdot for bulldozer --- kernel/x86_64/sdot.c | 107 ++++++++++++++++++++++++ kernel/x86_64/sdot_microk_bulldozer-2.c | 85 +++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 kernel/x86_64/sdot.c create mode 100644 kernel/x86_64/sdot_microk_bulldozer-2.c diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c new file mode 100644 index 000000000..8c60b954a --- /dev/null +++ b/kernel/x86_64/sdot.c @@ -0,0 +1,107 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(BULLDOZER) || defined(PILEDRIVER)
+#include "sdot_microk_bulldozer-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+/* Portable fallback (no microkernel defined HAVE_KERNEL_16): adds dot(x[0..n), y[0..n)) to *d; unrolled 8x, so n must be a multiple of 8. */
+static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+{
+	BLASLONG register i = 0;
+	FLOAT dot = 0.0;
+
+	while(i < n)
+	{
+		dot += y[i]   * x[i]
+		     + y[i+1] * x[i+1]
+		     + y[i+2] * x[i+2]
+		     + y[i+3] * x[i+3]
+		     + y[i+4] * x[i+4]
+		     + y[i+5] * x[i+5]
+		     + y[i+6] * x[i+6]
+		     + y[i+7] * x[i+7] ;
+
+		i+=8 ;
+
+	}
+	*d += dot;
+
+}
+
+#endif
+/* Dot product: returns the sum of x[i*inc_x] * y[i*inc_y] for i in [0,n), indexing from element 0; returns 0 when n <= 0. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	FLOAT  dot = 0.0 ;
+
+	if ( n <= 0 )  return(dot);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+		/* unit strides: the kernel takes the largest multiple of 16, the loop below finishes the tail */
+		BLASLONG n1 = n & -16;	/* BLASLONG, not int: an int would truncate large n */
+
+		if ( n1 )
+			sdot_kernel_16(n1, x, y , &dot );
+
+
+		i = n1;
+		while(i < n)
+		{
+
+			dot += y[i] * x[i] ;
+			i++ ;
+
+		}
+		return(dot);
+
+
+	}
+
+	while(i < n)
+	{
+
+		dot += y[iy] * x[ix] ;
+		ix  += inc_x ;
+		iy  += inc_y ;
+		i++ ;
+
+	}
+	return(dot);
+
+}
+
+
diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c
new file mode 100644
index 000000000..024b2ce6d
--- /dev/null
+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c
@@ -0,0 +1,85 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define HAVE_KERNEL_16 1
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+/* FMA4 kernel: stores dot(x[0..n), y[0..n)) to *dot (overwriting it); n must be a positive multiple of 16. */
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+	/* i (element index) and n (elements left) are both updated by the asm, hence "+r" operands */
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"vxorps		%%xmm4, %%xmm4, %%xmm4	             \n\t"
+	"vxorps		%%xmm5, %%xmm5, %%xmm5	             \n\t"
+	"vxorps		%%xmm6, %%xmm6, %%xmm6	             \n\t"
+	"vxorps		%%xmm7, %%xmm7, %%xmm7	             \n\t"
+
+	".align 16				             \n\t"
+	".L01LOOP%=:				             \n\t"
+	"vmovups                  (%2,%0,4), %%xmm12         \n\t"	// 4 * x
+	"vmovups                16(%2,%0,4), %%xmm13         \n\t"	// 4 * x
+	"vmovups                32(%2,%0,4), %%xmm14         \n\t"	// 4 * x
+	"vmovups                48(%2,%0,4), %%xmm15         \n\t"	// 4 * x
+
+	"vfmaddps      %%xmm4,   (%3,%0,4), %%xmm12, %%xmm4 \n\t"	// 4 * y
+	"vfmaddps      %%xmm5, 16(%3,%0,4), %%xmm13, %%xmm5 \n\t"	// 4 * y
+	"vfmaddps      %%xmm6, 32(%3,%0,4), %%xmm14, %%xmm6 \n\t"	// 4 * y
+	"vfmaddps      %%xmm7, 48(%3,%0,4), %%xmm15, %%xmm7 \n\t"	// 4 * y
+
+	"addq		$16, %0	  	 	      \n\t"
+	"subq	        $16, %1		      \n\t"
+	"jnz		.L01LOOP%=		      \n\t"
+
+	"vaddps        %%xmm4, %%xmm5, %%xmm4	\n\t"
+	"vaddps        %%xmm6, %%xmm7, %%xmm6	\n\t"
+	"vaddps        %%xmm4, %%xmm6, %%xmm4	\n\t"
+
+	"vhaddps        %%xmm4, %%xmm4, %%xmm4	\n\t"
+	"vhaddps        %%xmm4, %%xmm4, %%xmm4	\n\t"
+
+	"vmovss		%%xmm4,    (%4)		\n\t"
+
+	:
+	  "+r" (i),	// 0
+	  "+r" (n)	// 1
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (dot)	// 4
+	: "cc",
+	  "%xmm4", "%xmm5",
+	  "%xmm6", "%xmm7",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+

From 5d97b0754c3c1ef975365410b37d9cda360b73eb Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Fri, 22 Aug 2014 17:00:26 +0200
Subject: [PATCH 17/32] added optimized sdot kernel for nehalem

---
 kernel/x86_64/KERNEL.NEHALEM          |  2 +
 kernel/x86_64/sdot.c                  |  2 +
 kernel/x86_64/sdot_microk_nehalem-2.c | 94 +++++++++++++++++++++++++++
 3 files changed, 98 insertions(+)
 create mode 100644 kernel/x86_64/sdot_microk_nehalem-2.c

diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM
index
b16fd9c49..aee622d0d 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,3 +1,5 @@ +SDOTKERNEL = sdot.c + DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 8c60b954a..a13d65d25 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) #include "sdot_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "sdot_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c new file mode 100644 index 000000000..3548ace88 --- /dev/null +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_16 1
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+/* SSE kernel: stores dot(x[0..n), y[0..n)) to *dot (overwriting it); n must be a positive multiple of 16. */
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+	/* i (element index) and n (elements left) are both updated by the asm, hence "+r" operands */
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"xorps		%%xmm4, %%xmm4	             \n\t"
+	"xorps		%%xmm5, %%xmm5	             \n\t"
+	"xorps		%%xmm6, %%xmm6	             \n\t"
+	"xorps		%%xmm7, %%xmm7	             \n\t"
+
+	".align 16				     \n\t"
+	".L01LOOP%=:				     \n\t"
+	"movups                  (%2,%0,4), %%xmm12         \n\t"	// 4 * x
+	"movups                  (%3,%0,4), %%xmm8          \n\t"	// 4 * y
+	"movups                16(%2,%0,4), %%xmm13         \n\t"	// 4 * x
+	"movups                16(%3,%0,4), %%xmm9          \n\t"	// 4 * y
+	"movups                32(%2,%0,4), %%xmm14         \n\t"	// 4 * x
+	"movups                32(%3,%0,4), %%xmm10         \n\t"	// 4 * y
+	"movups                48(%2,%0,4), %%xmm15         \n\t"	// 4 * x
+	"movups                48(%3,%0,4), %%xmm11         \n\t"	// 4 * y
+
+	"mulps		%%xmm8 , %%xmm12	     \n\t"
+	"mulps		%%xmm9 , %%xmm13	     \n\t"
+	"mulps		%%xmm10, %%xmm14	     \n\t"
+	"mulps		%%xmm11, %%xmm15	     \n\t"
+
+	"addps		%%xmm12, %%xmm4	             \n\t"
+	"addps		%%xmm13, %%xmm5	             \n\t"
+	"addps		%%xmm14, %%xmm6	             \n\t"
+	"addps		%%xmm15, %%xmm7	             \n\t"
+
+	"addq		$16, %0	  	 	      \n\t"
+	"subq	        $16, %1		      \n\t"
+	"jnz		.L01LOOP%=		      \n\t"
+
+	"addps        %%xmm5, %%xmm4	\n\t"
+	"addps        %%xmm7, %%xmm6	\n\t"
+	"addps        %%xmm6, %%xmm4	\n\t"
+
+	"haddps        %%xmm4, %%xmm4	\n\t"
+	"haddps        %%xmm4, %%xmm4	\n\t"
+
+	"movss		%%xmm4,    (%4)		\n\t"
+
+	:
+	  "+r" (i),	// 0
+	  "+r" (n)	// 1
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (dot)	// 4
+	: "cc",
+	  "%xmm4", "%xmm5",
+	  "%xmm6", "%xmm7",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+

From 95a707ced3a332221d7539e51e0051a9d07edd91 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Fri, 22 Aug 2014 17:01:27 +0200
Subject: [PATCH 18/32] update of KERNEL.BULLDOZER

---
 kernel/x86_64/KERNEL.BULLDOZER | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index 3ee1978b8..7aa597ea0 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -1,3 +1,5 @@
+SDOTKERNEL = sdot.c
+
 DSYMV_U_KERNEL = dsymv_U.c
 DSYMV_L_KERNEL = dsymv_L.c
 SSYMV_U_KERNEL = ssymv_U.c

From 53ec5789e2aeeb4bc5eee2de65047ec3a8169c8a Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Fri, 22 Aug 2014 17:02:55 +0200
Subject: [PATCH 19/32] bugfix for Makefile

---
 benchmark/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/Makefile b/benchmark/Makefile
index 679a98c32..933b4bd44 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -108,7 +108,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
 	spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \
 	ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl
 
-all :: goto atlas acml mkl
+all :: goto mkl atlas acml
 
 ##################################### Slinpack ####################################################
 slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME)

From 16d6be852dbcb09365fd61e1e11b3cac66901b76 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Fri, 22 Aug 2014 20:34:41 +0200
Subject: [PATCH 20/32] added optimized ddot kernel for nehalem

---
 kernel/x86_64/KERNEL.NEHALEM          |   1 +
 kernel/x86_64/ddot.c                  | 110 ++++++++++++++++++++++++++
 kernel/x86_64/ddot_microk_nehalem-2.c |  94 ++++++++++++++++++++++
 kernel/x86_64/sdot_microk_nehalem-2.c |   4 +-
 4 files changed, 207 insertions(+), 2 deletions(-)
 create mode 100644 kernel/x86_64/ddot.c
 create mode 100644 kernel/x86_64/ddot_microk_nehalem-2.c

diff --git a/kernel/x86_64/KERNEL.NEHALEM
b/kernel/x86_64/KERNEL.NEHALEM index aee622d0d..e0f3c3336 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,4 +1,5 @@ SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c new file mode 100644 index 000000000..ee6785f6a --- /dev/null +++ b/kernel/x86_64/ddot.c @@ -0,0 +1,110 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER) || defined(PILEDRIVER)
+#include "ddot_microk_bulldozer-2.c"
+#elif defined(NEHALEM)
+#include "ddot_microk_nehalem-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+/* Portable fallback (no microkernel defined HAVE_KERNEL_8): adds dot(x[0..n), y[0..n)) to *d; unrolled 8x, so n must be a multiple of 8. */
+static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+{
+	BLASLONG register i = 0;
+	FLOAT dot = 0.0;
+
+	while(i < n)
+	{
+		dot += y[i]   * x[i]
+		     + y[i+1] * x[i+1]
+		     + y[i+2] * x[i+2]
+		     + y[i+3] * x[i+3]
+		     + y[i+4] * x[i+4]
+		     + y[i+5] * x[i+5]
+		     + y[i+6] * x[i+6]
+		     + y[i+7] * x[i+7] ;
+
+		i+=8 ;
+
+	}
+	*d += dot;
+
+}
+
+#endif
+/* Dot product: returns the sum of x[i*inc_x] * y[i*inc_y] for i in [0,n), indexing from element 0; returns 0 when n <= 0. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	FLOAT  dot = 0.0 ;
+
+	if ( n <= 0 )  return(dot);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+		/* unit strides: the kernel takes the largest multiple of 8, the loop below finishes the tail */
+		BLASLONG n1 = n & -8;	/* BLASLONG, not int: an int would truncate large n */
+
+		if ( n1 )
+			ddot_kernel_8(n1, x, y , &dot );
+
+
+		i = n1;
+		while(i < n)
+		{
+
+			dot += y[i] * x[i] ;
+			i++ ;
+
+		}
+		return(dot);
+
+
+	}
+
+	while(i < n)
+	{
+
+		dot += y[iy] * x[ix] ;
+		ix  += inc_x ;
+		iy  += inc_y ;
+		i++ ;
+
+	}
+	return(dot);
+
+}
+
+
diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c
new file mode 100644
index 000000000..dd05053f7
--- /dev/null
+++ b/kernel/x86_64/ddot_microk_nehalem-2.c
@@ -0,0 +1,94 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+/* SSE kernel: stores dot(x[0..n), y[0..n)) to *dot (overwriting it); n must be a positive multiple of 8. */
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+	/* i (element index) and n (elements left) are both updated by the asm, hence "+r" operands */
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"xorpd		%%xmm4, %%xmm4	             \n\t"
+	"xorpd		%%xmm5, %%xmm5	             \n\t"
+	"xorpd		%%xmm6, %%xmm6	             \n\t"
+	"xorpd		%%xmm7, %%xmm7	             \n\t"
+
+	".align 16				     \n\t"
+	".L01LOOP%=:				     \n\t"
+
+	"movups                  (%2,%0,8), %%xmm12         \n\t"	// 2 * x
+	"movups                  (%3,%0,8), %%xmm8          \n\t"	// 2 * y
+	"movups                16(%2,%0,8), %%xmm13         \n\t"	// 2 * x
+	"movups                16(%3,%0,8), %%xmm9          \n\t"	// 2 * y
+	"movups                32(%2,%0,8), %%xmm14         \n\t"	// 2 * x
+	"movups                32(%3,%0,8), %%xmm10         \n\t"	// 2 * y
+	"movups                48(%2,%0,8), %%xmm15         \n\t"	// 2 * x
+	"movups                48(%3,%0,8), %%xmm11         \n\t"	// 2 * y
+
+	"mulpd		%%xmm8 , %%xmm12	     \n\t"
+	"mulpd		%%xmm9 , %%xmm13	     \n\t"
+	"mulpd		%%xmm10, %%xmm14	     \n\t"
+	"mulpd		%%xmm11, %%xmm15	     \n\t"
+
+	"addpd		%%xmm12, %%xmm4	             \n\t"
+	"addpd		%%xmm13, %%xmm5	             \n\t"
+	"addpd		%%xmm14, %%xmm6	             \n\t"
+	"addpd		%%xmm15, %%xmm7	             \n\t"
+
+	"addq		$8 , %0	  	 	      \n\t"
+	"subq	        $8 , %1		      \n\t"
+	"jnz		.L01LOOP%=		      \n\t"
+
+	"addpd        %%xmm5, %%xmm4	\n\t"
+	"addpd        %%xmm7, %%xmm6	\n\t"
+	"addpd        %%xmm6, %%xmm4	\n\t"
+
+	"haddpd        %%xmm4, %%xmm4	\n\t"
+
+	"movsd		%%xmm4,    (%4)		\n\t"
+
+	:
+	  "+r" (i),	// 0
+	  "+r" (n)	// 1
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (dot)	// 4
+	: "cc",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+
diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c
index 3548ace88..2a918b5ea 100644
--- a/kernel/x86_64/sdot_microk_nehalem-2.c
+++ b/kernel/x86_64/sdot_microk_nehalem-2.c
@@ -83,8 +83,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 	  "r" (y),	// 3
 	  "r" (dot)	// 4
 	: "cc",
-	  "%xmm4", "%xmm5",
-	  "%xmm6", "%xmm7",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
 	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
 	  "memory"
 	);

From f1b96c4846f9fd110b65335600445fae50fb1d26 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Fri, 22 Aug 2014 21:19:29 +0200
Subject: [PATCH 21/32] added optimized ddot kernel for bulldozer

---
 kernel/x86_64/KERNEL.BULLDOZER          |  1 +
 kernel/x86_64/ddot_microk_bulldozer-2.c | 84 +++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 kernel/x86_64/ddot_microk_bulldozer-2.c

diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index 7aa597ea0..9e6fc2e28 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -1,4 +1,5 @@
 SDOTKERNEL = sdot.c
+DDOTKERNEL = ddot.c
 
 DSYMV_U_KERNEL = dsymv_U.c
 DSYMV_L_KERNEL = dsymv_L.c
diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c
new file mode 100644
index 000000000..0c77b6349
--- /dev/null
+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c
@@ -0,0 +1,84 @@
+/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+/* FMA4 kernel: stores dot(x[0..n), y[0..n)) to *dot (overwriting it); n must be a positive multiple of 8. */
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+	/* i (element index) and n (elements left) are both updated by the asm, hence "+r" operands */
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"vxorpd		%%xmm4, %%xmm4, %%xmm4	             \n\t"
+	"vxorpd		%%xmm5, %%xmm5, %%xmm5	             \n\t"
+	"vxorpd		%%xmm6, %%xmm6, %%xmm6	             \n\t"
+	"vxorpd		%%xmm7, %%xmm7, %%xmm7	             \n\t"
+
+	".align 16				             \n\t"
+	".L01LOOP%=:				             \n\t"
+	"vmovups                  (%2,%0,8), %%xmm12         \n\t"	// 2 * x
+	"vmovups                16(%2,%0,8), %%xmm13         \n\t"	// 2 * x
+	"vmovups                32(%2,%0,8), %%xmm14         \n\t"	// 2 * x
+	"vmovups                48(%2,%0,8), %%xmm15         \n\t"	// 2 * x
+
+	"vfmaddpd      %%xmm4,   (%3,%0,8), %%xmm12, %%xmm4 \n\t"	// 2 * y
+	"vfmaddpd      %%xmm5, 16(%3,%0,8), %%xmm13, %%xmm5 \n\t"	// 2 * y
+	"vfmaddpd      %%xmm6, 32(%3,%0,8), %%xmm14, %%xmm6 \n\t"	// 2 * y
+	"vfmaddpd      %%xmm7, 48(%3,%0,8), %%xmm15, %%xmm7 \n\t"	// 2 * y
+
+	"addq		$8 , %0	  	 	      \n\t"
+	"subq	        $8 , %1		      \n\t"
+	"jnz		.L01LOOP%=		      \n\t"
+
+	"vaddpd        %%xmm4, %%xmm5, %%xmm4	\n\t"
+	"vaddpd        %%xmm6, %%xmm7, %%xmm6	\n\t"
+	"vaddpd        %%xmm4, %%xmm6, %%xmm4	\n\t"
+
+	"vhaddpd        %%xmm4, %%xmm4, %%xmm4	\n\t"
+
+	"vmovsd		%%xmm4,    (%4)		\n\t"
+
+	:
+	  "+r" (i),	// 0
+	  "+r" (n)	// 1
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (dot)	// 4
+	: "cc",
+	  "%xmm4", "%xmm5",
+	  "%xmm6", "%xmm7",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+

From ac76b6267f1d7de2cbbc0b7fb22c825281c36b44 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sat, 23 Aug 2014 10:40:57 +0200
Subject: [PATCH 22/32] added optimized dgemv_n kernel for nehalem

---
 kernel/x86_64/KERNEL.NEHALEM             |   1 +
 kernel/x86_64/dgemv_n.c                  |   2 +
 kernel/x86_64/dgemv_n_microk_nehalem-2.c | 137 +++++++++++++++++++++++
 3 files changed, 140 insertions(+)
 create mode 100644 kernel/x86_64/dgemv_n_microk_nehalem-2.c

diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM
index e0f3c3336..5523b4571 100644
---
a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -8,6 +8,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c +DGEMVNKERNEL = dgemv_n.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S diff --git a/kernel/x86_64/dgemv_n.c b/kernel/x86_64/dgemv_n.c index 5d826dc63..cecb8d3fc 100644 --- a/kernel/x86_64/dgemv_n.c +++ b/kernel/x86_64/dgemv_n.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "dgemv_n_microk_haswell-2.c" +#elif defined(NEHALEM) +#include "dgemv_n_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-2.c b/kernel/x86_64/dgemv_n_microk_nehalem-2.c new file mode 100644 index 000000000..84b82f805 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_nehalem-2.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_16x4 1
+static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+/* y[i] += ap[0][i]*x[0] + ap[1][i]*x[1] + ap[2][i]*x[2] + ap[3][i]*x[3] for i in [0,n); 8 doubles per pass, so n must be a positive multiple of 8. */
+static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+	/* i (element index) and n (elements left) are both updated by the asm, hence "+r" operands */
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"movddup    (%2), %%xmm12	 \n\t"	// x0
+	"movddup   8(%2), %%xmm13	 \n\t"	// x1
+	"movddup  16(%2), %%xmm14	 \n\t"	// x2
+	"movddup  24(%2), %%xmm15	 \n\t"	// x3
+
+	".align 16				 \n\t"
+	".L01LOOP%=:				 \n\t"
+	"prefetcht0	 192(%3,%0,8)		       \n\t"
+	"movups	   (%3,%0,8), %%xmm4	       \n\t"	// 2 * y
+	"movups	 16(%3,%0,8), %%xmm5	       \n\t"	// 2 * y
+	"movups	 32(%3,%0,8), %%xmm6	       \n\t"	// 2 * y
+	"movups	 48(%3,%0,8), %%xmm7	       \n\t"	// 2 * y
+	"movups	   (%4,%0,8), %%xmm8          \n\t"	// 2 * a
+	"movups	 16(%4,%0,8), %%xmm9          \n\t"	// 2 * a
+	"movups	 32(%4,%0,8), %%xmm10         \n\t"	// 2 * a
+	"movups	 48(%4,%0,8), %%xmm11         \n\t"	// 2 * a
+
+	"prefetcht0	 192(%4,%0,8)		       \n\t"
+	"mulpd		%%xmm12    , %%xmm8	       \n\t"	// a * x
+	"mulpd		%%xmm12    , %%xmm9	       \n\t"	// a * x
+	"mulpd		%%xmm12    , %%xmm10	       \n\t"	// a * x
+	"mulpd		%%xmm12    , %%xmm11	       \n\t"	// a * x
+	"addpd		%%xmm8     , %%xmm4	       \n\t"	// y += a * x
+	"addpd		%%xmm9     , %%xmm5	       \n\t"	// y += a * x
+	"addpd		%%xmm10    , %%xmm6	       \n\t"	// y += a * x
+	"addpd		%%xmm11    , %%xmm7	       \n\t"	// y += a * x
+
+	"prefetcht0	 192(%5,%0,8)		       \n\t"
+	"movups	   (%5,%0,8), %%xmm8          \n\t"	// 2 * a
+	"movups	 16(%5,%0,8), %%xmm9          \n\t"	// 2 * a
+	"movups	 32(%5,%0,8), %%xmm10         \n\t"	// 2 * a
+	"movups	 48(%5,%0,8), %%xmm11         \n\t"	// 2 * a
+	"mulpd		%%xmm13    , %%xmm8	       \n\t"	// a * x
+	"mulpd		%%xmm13    , %%xmm9	       \n\t"	// a * x
+	"mulpd		%%xmm13    , %%xmm10	       \n\t"	// a * x
+	"mulpd		%%xmm13    , %%xmm11	       \n\t"	// a * x
+	"addpd		%%xmm8     , %%xmm4	       \n\t"	// y += a * x
+	"addpd		%%xmm9     , %%xmm5	       \n\t"	// y += a * x
+	"addpd		%%xmm10    , %%xmm6	       \n\t"	// y += a * x
+	"addpd		%%xmm11    , %%xmm7	       \n\t"	// y += a * x
+
+	"prefetcht0	 192(%6,%0,8)		       \n\t"
+	"movups	   (%6,%0,8), %%xmm8          \n\t"	// 2 * a
+	"movups	 16(%6,%0,8), %%xmm9          \n\t"	// 2 * a
+	"movups	 32(%6,%0,8), %%xmm10         \n\t"	// 2 * a
+	"movups	 48(%6,%0,8), %%xmm11         \n\t"	// 2 * a
+	"mulpd		%%xmm14    , %%xmm8	       \n\t"	// a * x
+	"mulpd		%%xmm14    , %%xmm9	       \n\t"	// a * x
+	"mulpd		%%xmm14    , %%xmm10	       \n\t"	// a * x
+	"mulpd		%%xmm14    , %%xmm11	       \n\t"	// a * x
+	"addpd		%%xmm8     , %%xmm4	       \n\t"	// y += a * x
+	"addpd		%%xmm9     , %%xmm5	       \n\t"	// y += a * x
+	"addpd		%%xmm10    , %%xmm6	       \n\t"	// y += a * x
+	"addpd		%%xmm11    , %%xmm7	       \n\t"	// y += a * x
+
+	"prefetcht0	 192(%7,%0,8)		       \n\t"
+	"movups	   (%7,%0,8), %%xmm8          \n\t"	// 2 * a
+	"movups	 16(%7,%0,8), %%xmm9          \n\t"	// 2 * a
+	"movups	 32(%7,%0,8), %%xmm10         \n\t"	// 2 * a
+	"movups	 48(%7,%0,8), %%xmm11         \n\t"	// 2 * a
+	"mulpd		%%xmm15    , %%xmm8	       \n\t"	// a * x
+	"mulpd		%%xmm15    , %%xmm9	       \n\t"	// a * x
+	"mulpd		%%xmm15    , %%xmm10	       \n\t"	// a * x
+	"mulpd		%%xmm15    , %%xmm11	       \n\t"	// a * x
+	"addpd		%%xmm8     , %%xmm4	       \n\t"	// y += a * x
+	"addpd		%%xmm9     , %%xmm5	       \n\t"	// y += a * x
+	"addpd		%%xmm10    , %%xmm6	       \n\t"	// y += a * x
+	"addpd		%%xmm11    , %%xmm7	       \n\t"	// y += a * x
+
+	"movups  %%xmm4,   (%3,%0,8)		      \n\t"	// 2 * y
+	"movups  %%xmm5, 16(%3,%0,8)		      \n\t"	// 2 * y
+	"movups  %%xmm6, 32(%3,%0,8)		      \n\t"	// 2 * y
+	"movups  %%xmm7, 48(%3,%0,8)		      \n\t"	// 2 * y
+
+	"addq		$8 , %0	  	 	      \n\t"
+	"subq	        $8 , %1		      \n\t"
+	"jnz		.L01LOOP%=		      \n\t"
+
+	:
+	  "+r" (i),	// 0
+	  "+r" (n)	// 1
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (ap[0]),	// 4
+	  "r" (ap[1]),	// 5
+	  "r" (ap[2]),	// 6
+	  "r" (ap[3])	// 7
+	: "cc",
+	  "%xmm4", "%xmm5",
+	  "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9",
+	  "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+

From 55e81da379ebf839b28a100f194e03589959ab91 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sat, 23 Aug 2014 13:12:44 +0200
Subject: [PATCH 23/32] added axpy benchmark-test

---
 benchmark/Makefile |  68 +++++++++++++++
 benchmark/axpy.c   | 201 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 269 insertions(+)
 create mode 100644 benchmark/axpy.c

diff --git a/benchmark/Makefile b/benchmark/Makefile
index 933b4bd44..07bf5a792 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -36,6 +36,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
 	sger.goto dger.goto \
 	sdot.goto ddot.goto \
+	saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
 	ssymv.goto dsymv.goto csymv.goto zsymv.goto \
 	chemv.goto zhemv.goto \
 	chemm.goto zhemm.goto \
@@ -56,6 +57,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
 	sger.acml dger.acml \
 	sdot.acml ddot.acml \
+	saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \
 	ssymv.acml dsymv.acml csymv.acml zsymv.acml \
 	chemv.acml zhemv.acml \
 	chemm.acml zhemm.acml \
@@ -76,6 +78,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
 	ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
 	sger.atlas dger.atlas \
 	sdot.atlas ddot.atlas \
+	saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \
 	ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \
 	chemv.atlas zhemv.atlas \
 	chemm.acml zhemm.acml \
@@ -97,6 +100,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
 	ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
 	sger.mkl dger.mkl \
 	sdot.mkl ddot.mkl \
+	saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \
 	ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \
 	chemv.mkl zhemv.mkl \
 	chemm.mkl zhemm.mkl \
@@ -984,6 +988,61 @@ ddot.atlas : ddot.$(SUFFIX)
 
 ddot.mkl : ddot.$(SUFFIX)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB)
$(EXTRALIB) $(FEXTRALIB) +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +saxpy.acml : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.atlas : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.mkl : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +daxpy.acml : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.atlas : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.mkl : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +caxpy.acml : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.atlas : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.mkl : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zaxpy.acml : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.atlas : zaxpy.$(SUFFIX) + 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.mkl : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### @@ -1179,8 +1238,17 @@ sdot.$(SUFFIX) : dot.c ddot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ diff --git a/benchmark/axpy.c b/benchmark/axpy.c new file mode 100644 index 000000000..ef3b5ae4f --- /dev/null +++ b/benchmark/axpy.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef AXPY + +#ifdef COMPLEX +#ifdef DOUBLE +#define AXPY BLASFUNC(zaxpy) +#else +#define AXPY BLASFUNC(caxpy) +#endif +#else +#ifdef DOUBLE +#define AXPY BLASFUNC(daxpy) +#else +#define AXPY BLASFUNC(saxpy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, 
SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Sat, 23 Aug 2014 17:15:21 +0200 Subject: [PATCH 24/32] added optimized saxpy kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 2 + kernel/x86_64/saxpy.c | 103 +++++++++++++++++++++++++ kernel/x86_64/saxpy_microk_nehalem-2.c | 91 ++++++++++++++++++++++ 3 files changed, 196 insertions(+) create mode 100644 kernel/x86_64/saxpy.c create mode 100644 kernel/x86_64/saxpy_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 5523b4571..3cbaa471f 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ 
b/kernel/x86_64/KERNEL.NEHALEM @@ -1,3 +1,5 @@ +SAXPYKERNEL = saxpy.c + SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c new file mode 100644 index 000000000..e6c016ee3 --- /dev/null +++ b/kernel/x86_64/saxpy.c @@ -0,0 +1,103 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(NEHALEM) +#include "saxpy_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + FLOAT a = *alpha; + + while(i < n) + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; /* multiple-of-16 part for the kernel; BLASLONG so a large n is not truncated */ + + if ( n1 ) + saxpy_kernel_16(n1, x, y , &da ); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c new file mode 100644 index 000000000..14ff51a0d --- /dev/null +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -0,0 +1,91 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movss (%4), %%xmm0 \n\t" // alpha + "shufps $0, %%xmm0, %%xmm0 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + // "prefetcht0 192(%2,%0,4) \n\t" + // "prefetcht0 192(%3,%0,4) \n\t" + + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + "movups (%3,%0,4), %%xmm8 \n\t" // 4 * y + "movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y + "movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y + "movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y + + "mulps %%xmm0 , %%xmm12 \n\t" // alpha * x + "mulps %%xmm0 , %%xmm13 \n\t" + "mulps %%xmm0 , %%xmm14 \n\t" + "mulps %%xmm0 , %%xmm15 
\n\t" + + "addps %%xmm12, %%xmm8 \n\t" // y += alpha *x + "addps %%xmm13, %%xmm9 \n\t" + "addps %%xmm14, %%xmm10 \n\t" + "addps %%xmm15, %%xmm11 \n\t" + + "movups %%xmm8 , (%3,%0,4) \n\t" + "movups %%xmm9 , 16(%3,%0,4) \n\t" + "movups %%xmm10, 32(%3,%0,4) \n\t" + "movups %%xmm11, 48(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + "+r" (i), // 0: element offset, advanced by the loop (read-write) + "+r" (n) // 1: elements left, counted down by the loop (read-write) + : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 29125864b30e730f7c81abf98818fb3f5734a9c4 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Aug 2014 17:28:01 +0200 Subject: [PATCH 25/32] updated gemm.c --- benchmark/gemm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmark/gemm.c b/benchmark/gemm.c index fc482c075..4f9a58825 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -142,7 +142,9 @@ int MAIN__(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} - fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + + fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); From b55f99730231d7d33eb3bf0ab49a55c5f22c92d0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Aug 2014 17:53:07 +0200 Subject: [PATCH 26/32] added optimized daxpy kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 1 + kernel/x86_64/daxpy.c | 103 +++++++++++++++++++++++++ kernel/x86_64/daxpy_microk_nehalem-2.c | 91 ++++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 kernel/x86_64/daxpy.c create mode 100644 kernel/x86_64/daxpy_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM 
b/kernel/x86_64/KERNEL.NEHALEM index 3cbaa471f..8adb579cf 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,4 +1,5 @@ SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c new file mode 100644 index 000000000..feda045d0 --- /dev/null +++ b/kernel/x86_64/daxpy.c @@ -0,0 +1,103 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(NEHALEM) +#include "daxpy_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + FLOAT a = *alpha; + + while(i < n) + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; /* multiple-of-8 part for the kernel; BLASLONG so a large n is not truncated */ + + if ( n1 ) + daxpy_kernel_8(n1, x, y , &da ); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c new file mode 100644 index 000000000..32ed1857c --- /dev/null +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -0,0 +1,91 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%4), %%xmm0 \n\t" // alpha + "shufpd $0, %%xmm0, %%xmm0 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + // "prefetcht0 192(%2,%0,8) \n\t" + // "prefetcht0 192(%3,%0,8) \n\t" + + "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y + "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y + "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y + + "mulpd %%xmm0 , %%xmm12 \n\t" // alpha * x + "mulpd %%xmm0 , %%xmm13 \n\t" + "mulpd %%xmm0 , %%xmm14 \n\t" + "mulpd %%xmm0 , %%xmm15 \n\t" 
+ + "addpd %%xmm12, %%xmm8 \n\t" // y += alpha *x + "addpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm14, %%xmm10 \n\t" + "addpd %%xmm15, %%xmm11 \n\t" + + "movups %%xmm8 , (%3,%0,8) \n\t" + "movups %%xmm9 , 16(%3,%0,8) \n\t" + "movups %%xmm10, 32(%3,%0,8) \n\t" + "movups %%xmm11, 48(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + "+r" (i), // 0: element offset, advanced by the loop (read-write) + "+r" (n) // 1: elements left, counted down by the loop (read-write) + : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 9d2ace8bac646d423528b307528043446a9c98ee Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Aug 2014 10:57:12 +0200 Subject: [PATCH 27/32] added optimized daxpy kernel for bulldozer --- kernel/x86_64/daxpy.c | 2 + kernel/x86_64/daxpy_microk_bulldozer-2.c | 82 ++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 kernel/x86_64/daxpy_microk_bulldozer-2.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index feda045d0..83754cbd3 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "daxpy_microk_nehalem-2.c" +#elif defined(BULLDOZER) +#include "daxpy_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c new file mode 100644 index 000000000..b1ef84a18 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 768(%3,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm12, %%xmm8 \n\t" // y += alpha * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + ".align 2 \n\t" + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm13, %%xmm9 \n\t" // y += alpha * x + ".align 2 \n\t" + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "prefetcht0 768(%2,%0,8) \n\t" + ".align 2 \n\t" + "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm14, %%xmm10 \n\t" // y += alpha * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm15, %%xmm11 \n\t" // y += alpha * x + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + "+r" (i), // 0: element offset, advanced by the loop (read-write) + "+r" (n) // 1: elements left, counted down by the loop (read-write) + : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From ee744451559d7e41c8bab16d2bd82f6f2ab8c103 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 25 Aug 2014 14:53:28 +0200 Subject: [PATCH 28/32] added optimized caxpy kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 3 + kernel/x86_64/caxpy.c | 131 ++++++++++++++++++++++ kernel/x86_64/caxpy_microk_bulldozer-2.c | 135 +++++++++++++++++++++++ kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- 7 files changed, 273 insertions(+), 4 deletions(-) 
create mode 100644 kernel/x86_64/caxpy.c create mode 100644 kernel/x86_64/caxpy_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 9e6fc2e28..701eea310 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,3 +1,6 @@ +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c + SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c new file mode 100644 index 000000000..fa8924ae9 --- /dev/null +++ b/kernel/x86_64/caxpy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) +#include "caxpy_microk_bulldozer-2.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT da[2]; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; /* multiple-of-8 part for the kernel; BLASLONG so a large n is not truncated */ + + if ( n1 ) + { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_8(n1, x, y , da ); /* da decays to FLOAT *, the type the kernel takes */ + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * 
x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c new file mode 100644 index 000000000..86407028c --- /dev/null +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 768(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x + "prefetcht0 768(%3,%0,4) \n\t" + +#if !defined(CONJ) + "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t" + + "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t" + + "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t" + + "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t" + "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange 
real and imag part + "vmulps %%xmm1, %%xmm10, %%xmm10 \n\t" + + "vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t" + "vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t" + +#else + + "vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i + "vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i + "vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i + "vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i + "vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i + "vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i + "vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i + "vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i + + "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + "vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t" + "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + + "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + "vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t" + "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + "vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part + "vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part + + "vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t" + "vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t" + "vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t" + "vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t" + + +#endif + + "vmovups %%xmm12, (%3,%0,4) \n\t" + "vmovups %%xmm13, 16(%3,%0,4) \n\t" + "vmovups %%xmm14, 32(%3,%0,4) \n\t" + "vmovups %%xmm15, 48(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + "+r" (i), // 0: float offset, advanced by 16 per pass (read-write) + "+r" (n) // 1: complex elements left, counted down by 8 (read-write) + : + "r" (x), // 2 + "r" (y), // 3 + "r" 
(alpha) // 4 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 83754cbd3..f1d50c909 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef HAVE_KERNEL_8 -void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index ee6785f6a..b3aad438f 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef HAVE_KERNEL_8 -void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot = 0.0; diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index e6c016ee3..da81f1354 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef HAVE_KERNEL_16 -void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index a13d65d25..632d16810 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef HAVE_KERNEL_16 -void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot = 0.0; From 3885eebdb82787b452b532e6cb7a23d9711a514e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 25 Aug 2014 15:52:35 +0200 Subject: [PATCH 29/32] added optimized zaxpy bulldozer kernel --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/zaxpy.c | 131 ++++++++++++++++++++++ kernel/x86_64/zaxpy_microk_bulldozer-2.c | 135 +++++++++++++++++++++++ 3 files changed, 267 insertions(+) create mode 100644 kernel/x86_64/zaxpy.c create mode 100644 kernel/x86_64/zaxpy_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 701eea310..6318b202c 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,5 +1,6 @@ DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c new file mode 100644 index 000000000..ca2f03dd0 --- /dev/null +++ b/kernel/x86_64/zaxpy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER)
+#include "zaxpy_microk_bulldozer-2.c"
+#endif
+
+/* Portable C fallback, compiled only when no included micro-kernel defined HAVE_KERNEL_4. */
+#ifndef HAVE_KERNEL_4
+/* y += alpha * x (alpha * conj(x) if CONJ) for n complex elements; two are handled per pass, so n must be even. */
+static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+	BLASLONG register i = 0;
+	BLASLONG register ix = 0;
+	FLOAT da_r = alpha[0];
+	FLOAT da_i = alpha[1];
+
+	/* i counts complex elements, ix indexes the interleaved (re, im) FLOAT storage */
+	while(i < n)
+	{
+#if !defined(CONJ)
+		y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
+		y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
+		y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ;
+		y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ;
+#else
+		y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
+		y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
+		y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ;
+		y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ;
+#endif
+
+		ix+=4 ;
+		i+=2 ;
+
+	}
+
+}
+
+#endif
+/* y += (da_r + i*da_i) * x, or * conj(x) when CONJ is defined, for n complex elements; inc_x/inc_y are strides in complex elements; dummy* are unused; always returns 0. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT da[2];
+
+	if ( n <= 0 )  return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+		/* n rounded down to a multiple of 4; kept as BLASLONG so a large n is not truncated to int */
+		BLASLONG n1 = n & -4;
+
+		if ( n1 )
+		{
+			da[0] = da_r;
+			da[1] = da_i;
+			zaxpy_kernel_4(n1, x, y , da );	/* da decays to FLOAT *, matching the kernel prototype */
+			ix = 2 * n1;
+		}
+		i = n1;	/* scalar tail: the remaining n - n1 (< 4) elements */
+		while(i < n)
+		{
+#if !defined(CONJ)
+			y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
+			y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
+#else
+			y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
+			y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
+#endif
+			i++ ;
+			ix += 2;
+
+		}
+		return(0);
+
+
+	}
+	/* strided case: convert strides from complex elements to FLOAT elements */
+	inc_x *=2;
+	inc_y *=2;
+
+	while(i < n)
+	{
+
+#if !defined(CONJ)
+		y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
+		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
+#else
+		y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
+		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
+#endif
+		ix += inc_x ;
+		iy += inc_y ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c
new file mode 100644
index 000000000..780109b69
--- /dev/null
+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c
@@ -0,0 +1,135 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4 1
+static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+/* y += alpha * x (alpha * conj(x) if CONJ) using 4-operand vfmaddpd; 4 complex values per pass, loop exits only when n reaches 0, so n must be a positive multiple of 4. */
+static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	/* i is the running element offset (scaled by 8 bytes in the addressing); the loop advances it by 8 per pass */
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"vmovddup (%4), %%xmm0 \n\t"  // real part of alpha
+	"vmovddup 8(%4), %%xmm1 \n\t"  // imag part of alpha
+
+	".align 16 \n\t"
+	".L01LOOP%=: \n\t"
+
+	"prefetcht0 768(%2,%0,8) \n\t"
+	"vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex value from x
+	"vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex value from x
+	"vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex value from x
+	"vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex value from x
+	"prefetcht0 768(%3,%0,8) \n\t"
+
+#if !defined(CONJ)
+	"vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12 \n\t"
+	"vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t"  // exchange real and imag part
+	"vmulpd %%xmm1, %%xmm4 , %%xmm4 \n\t"
+
+	"vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13 \n\t"
+	"vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t"  // exchange real and imag part
+	"vmulpd %%xmm1, %%xmm6 , %%xmm6 \n\t"
+
+	"vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm14 \n\t"
+	"vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t"  // exchange real and imag part
+	"vmulpd %%xmm1, %%xmm8 , %%xmm8 \n\t"
+
+	"vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15 \n\t"
+	"vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t"  // exchange real and imag part
+	"vmulpd %%xmm1, %%xmm10, %%xmm10 \n\t"
+
+	"vaddsubpd %%xmm4, %%xmm12, %%xmm12 \n\t"
+	"vaddsubpd %%xmm6, %%xmm13, %%xmm13 \n\t"
+	"vaddsubpd %%xmm8, %%xmm14, %%xmm14 \n\t"
+	"vaddsubpd %%xmm10,%%xmm15, %%xmm15 \n\t"
+
+#else
+
+	"vmulpd %%xmm0, %%xmm5, %%xmm4 \n\t"  // a_r*x_r, a_r*x_i
+	"vmulpd %%xmm1, %%xmm5, %%xmm5 \n\t"  // a_i*x_r, a_i*x_i
+	"vmulpd %%xmm0, %%xmm7, %%xmm6 \n\t"  // a_r*x_r, a_r*x_i
+	"vmulpd %%xmm1, %%xmm7, %%xmm7 \n\t"  // a_i*x_r, a_i*x_i
+	"vmulpd %%xmm0, %%xmm9, %%xmm8 \n\t"  // a_r*x_r, a_r*x_i
+	"vmulpd %%xmm1, %%xmm9, %%xmm9 \n\t"  // a_i*x_r, a_i*x_i
+	"vmulpd %%xmm0, %%xmm11, %%xmm10 \n\t"  // a_r*x_r, a_r*x_i
+	"vmulpd %%xmm1, %%xmm11, %%xmm11 \n\t"  // a_i*x_r, a_i*x_i
+
+	"vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t"  // exchange real and imag part
+	"vaddsubpd %%xmm4 ,%%xmm5 , %%xmm4 \n\t"
+	"vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t"  // exchange real and imag part
+
+	"vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t"  // exchange real and imag part
+	"vaddsubpd %%xmm6 ,%%xmm7 , %%xmm6 \n\t"
+	"vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t"  // exchange real and imag part
+
+	"vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"  // exchange real and imag part
+	"vaddsubpd %%xmm8 ,%%xmm9 , %%xmm8 \n\t"
+	"vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"  // exchange real and imag part
+
+	"vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"  // exchange real and imag part
+	"vaddsubpd %%xmm10,%%xmm11, %%xmm10 \n\t"
+	"vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"  // exchange real and imag part
+
+	"vaddpd (%3,%0,8) ,%%xmm4 , %%xmm12 \n\t"
+	"vaddpd 16(%3,%0,8) ,%%xmm6 , %%xmm13 \n\t"
+	"vaddpd 32(%3,%0,8) ,%%xmm8 , %%xmm14 \n\t"
+	"vaddpd 48(%3,%0,8) ,%%xmm10, %%xmm15 \n\t"
+
+
+#endif
+
+	"vmovups %%xmm12, (%3,%0,8) \n\t"
+	"vmovups %%xmm13, 16(%3,%0,8) \n\t"
+	"vmovups %%xmm14, 32(%3,%0,8) \n\t"
+	"vmovups %%xmm15, 48(%3,%0,8) \n\t"
+
+	"addq $8 , %0 \n\t"
+	"subq $4 , %1 \n\t"
+	"jnz .L01LOOP%= \n\t"
+	/* addq/subq above write %0 and %1, so i and n are read-write ("+r") operands, not input-only */
+	:
+	  "+r" (i),	// 0
+	  "+r" (n)	// 1
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (alpha)	// 4
+	: "cc",
+	  "%xmm0", "%xmm1",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+
From 84badf80866d300eecdb62f49689df1b6ca2ccd7 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Tue, 26 Aug 2014 17:36:32 +0200
Subject: [PATCH 30/32] EXPERIMENTAL: added the flag -no-integrated-as for clang compiler in Makefile.system

---
 Makefile.system | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index ccde8e9ce..385ad47c3 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -339,7 +339,7 @@ FCOMMON_OPT += -m128bit-long-double
 endif
 ifeq ($(C_COMPILER), CLANG)
 EXPRECISION = 1
-CCOMMON_OPT += -DEXPRECISION
+CCOMMON_OPT += -DEXPRECISION
 FCOMMON_OPT += -m128bit-long-double
 endif
 endif
@@ -350,6 +350,12 @@ ifeq ($(C_COMPILER), INTEL)
 CCOMMON_OPT += -wd981
 endif
 
+ifeq ($(ARCH), x86_64)
+ifeq ($(C_COMPILER), CLANG)
+CCOMMON_OPT += -no-integrated-as
+endif
+endif
+
 ifeq ($(USE_OPENMP), 1)
 # ifeq logical or. GCC or LSB
 ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
From 5fa61587312bd6221fe4050233b7135f3cc73a3f Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Tue, 26 Aug 2014 18:29:40 +0200
Subject: [PATCH 31/32] renoved flag no-integrated-as, because not working on macosx

---
 Makefile.system | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 385ad47c3..d2ff74146 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -350,11 +350,6 @@ ifeq ($(C_COMPILER), INTEL)
 CCOMMON_OPT += -wd981
 endif
 
-ifeq ($(ARCH), x86_64)
-ifeq ($(C_COMPILER), CLANG)
-CCOMMON_OPT += -no-integrated-as
-endif
-endif
 
 ifeq ($(USE_OPENMP), 1)
 # ifeq logical or.
GCC or LSB From 20cd85012509993f2e1ed9b9ab83ae63297e0e6d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 27 Aug 2014 09:00:20 +0200 Subject: [PATCH 32/32] modification for clang compiler --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 16 ++++++++-------- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 20 ++++++++++---------- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 12 ++++++------ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index 98f40054e..a608071db 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -227,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 ) - addq $6*SIZE, BO - addq $16*SIZE, AO + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO decq %rax .endm @@ -356,8 +356,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) - addq $6*SIZE, BO - addq $8*SIZE, AO + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO decq %rax .endm @@ -447,8 +447,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $4*SIZE, AO + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO decq %rax .endm @@ -540,8 +540,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $2*SIZE, AO + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO decq %rax .endm diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index d88add02b..ef156fd27 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -181,8 +181,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) - addq $6*SIZE, BO - addq $16*SIZE, AO + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO decq %rax .endm @@ -268,8 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - addq $6*SIZE, BO - addq $8*SIZE, AO + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO decq %rax .endm @@ -327,8 +327,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $4*SIZE, AO + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO decq %rax .endm @@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) - addq $6*SIZE, BO - addq $2*SIZE, AO + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO decq %rax .endm @@ -478,8 +478,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $1*SIZE, AO + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO decq %rax .endm diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index e23e09ecc..f91bfa89b 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -222,8 +222,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) - addq $6*SIZE, BO - addq $8*SIZE, AO + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO decq %rax .endm @@ -362,8 +362,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) - addq $6*SIZE, BO - addq $4*SIZE, AO + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO decq %rax .endm @@ -491,8 +491,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $2*SIZE, AO + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO decq %rax .endm