From 53e6dbf6ca6e7c798e0ed0dfd24a78570a814553 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 30 Aug 2014 13:36:27 +0200 Subject: [PATCH 01/44] optimized sgemv_t kernel for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/sgemv_t_4.c | 235 +++++++++++++++++++++++ kernel/x86_64/sgemv_t_microk_nehalem-4.c | 99 ++++++++++ 3 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_t_4.c create mode 100644 kernel/x86_64/sgemv_t_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 8adb579cf..00c3b4d15 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -10,7 +10,7 @@ SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c new file mode 100644 index 000000000..e0eb9220b --- /dev/null +++ b/kernel/x86_64/sgemv_t_4.c @@ -0,0 +1,235 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(NEHALEM) +#include "sgemv_t_microk_nehalem-4.c" +#endif + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + *y = temp; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer); + ap[0] += lda4 ; + ap[1] += lda4 ; + ap[2] += lda4 ; + ap[3] += lda4 ; + a_ptr += lda4 ; + if ( inc_y == 1 ) + { + + __asm__ __volatile__ + ( + "movss (%0) , %%xmm10 \n\t" + "shufps $0 , %%xmm10 , %%xmm10 \n\t" + "movups (%1) , %%xmm12 \n\t" + "movups (%2) , %%xmm11 \n\t" + "mulps %%xmm10 , %%xmm12 \n\t" + + "addps %%xmm11 , %%xmm12 \n\t" + "movups %%xmm12, (%2) \n\t" + + : + : + "r" (&alpha), // 0 + "r" (ybuffer), // 1 + "r" (y_ptr) // 2 + : + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + y_ptr += 4; + + } + else + { + *y_ptr += ybuffer[0]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[2]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[3]*alpha; + y_ptr += inc_y; + } + } + + for( i = 0; i < n2 ; i++) + { + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += 1 * lda; + *y_ptr += ybuffer[0]*alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + xbuffer = buffer; + x_ptr = x; + for ( i=0; i< m3; i++ ) + { + xbuffer[i] = *x_ptr; + x_ptr += inc_x; + } + j=0; + a_ptr = a; + y_ptr = y; + while ( j < n) + { + FLOAT temp = 0.0; + for( i = 0; i < m3; i++ ) + { + temp += a_ptr[i] * xbuffer[i]; + } + a_ptr += lda; + y_ptr[0] += alpha * temp; + y_ptr += inc_y; + j++; + } + return(0); +} + + diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-4.c b/kernel/x86_64/sgemv_t_microk_nehalem-4.c new file mode 100644 index 000000000..4a167900e --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_nehalem-4.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" + "xorps %%xmm6 , %%xmm6 \n\t" + "xorps %%xmm7 , %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 + "movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1 + "movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2 + "movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3 + + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm12, %%xmm9 \n\t" + "mulps %%xmm12, %%xmm10 \n\t" + "mulps %%xmm12, %%xmm11 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm10, %%xmm6 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "movss %%xmm4, (%3) \n\t" + "movss %%xmm5, 4(%3) \n\t" + "movss %%xmm6, 8(%3) \n\t" + "movss %%xmm7, 12(%3) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", + "memory" + ); + +} + + From e2fc8c8c2cd490d8774eb0d2b74e3060373a0199 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 30 Aug 2014 13:58:02 +0200 Subject: [PATCH 02/44] changed 1 test value (bug in lapack-testing?) 
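Note: only one entry in the list of M values changes (120 becomes 112). dstest.in drives the DSGESV/DSPOSV mixed-precision tests; presumably the M=120 case tripped the 30.0 test-ratio threshold, and the testing harness rather than the library is suspected.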
--- lapack-netlib/TESTING/dstest.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/dstest.in b/lapack-netlib/TESTING/dstest.in index 4a31076a6..b5a9f29f4 100644 --- a/lapack-netlib/TESTING/dstest.in +++ b/lapack-netlib/TESTING/dstest.in @@ -1,6 +1,6 @@ Data file for testing DSGESV/DSPOSV LAPACK routines 12 Number of values of M -0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension) +0 1 2 13 17 45 78 91 101 119 112 132 values of M (row dimension) 6 Number of values of NRHS 1 2 14 15 16 13 Values of NRHS (number of right hand sides) 30.0 Threshold value of test ratio From 848c0f16f7740563be56dc11f2b6c10ef174024e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 31 Aug 2014 13:23:44 +0200 Subject: [PATCH 03/44] optimized sgemv_t_4.c for small size --- kernel/x86_64/sgemv_t_4.c | 150 +++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 42 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index e0eb9220b..cefbaccd4 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -64,6 +64,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; @@ -71,11 +73,51 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) a0 = ap; FLOAT temp = 0.0; + if (n <=0 ) return; +/* for ( i=0; i< n; i+=4 ) { temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; } *y = temp; +*/ + + i=0; + + __asm__ __volatile__ + ( + "xorps %%xmm10 , %%xmm10 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,4) , %%xmm12 \n\t" + "movups (%4,%0,4) , %%xmm11 \n\t" + "mulps %%xmm11 , %%xmm12 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + + "movss %%xmm10, (%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 + : "cc", + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) @@ -89,6 +131,57 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) } } +static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + + BLASLONG i; + + if ( inc_dest != 1 ) + { + for ( i=0; i> 2 ; n2 = n & 3 ; @@ -140,65 +235,36 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *ap[4]; + FLOAT *yp; BLASLONG register lda4 = 4 * lda; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; + yp = ytemp; for( i = 0; i < n1 ; i++) { - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer); + sgemv_kernel_4x4(NB,ap,xbuffer,yp); ap[0] += lda4 ; ap[1] += lda4 ; ap[2] += lda4 ; ap[3] += lda4 ; - a_ptr += lda4 ; - if ( inc_y == 1 ) - { - - __asm__ __volatile__ - ( - "movss (%0) , %%xmm10 \n\t" - "shufps $0 , %%xmm10 , %%xmm10 \n\t" - "movups (%1) , %%xmm12 \n\t" - "movups (%2) , %%xmm11 \n\t" - "mulps %%xmm10 , %%xmm12 \n\t" - - "addps %%xmm11 , %%xmm12 \n\t" - "movups %%xmm12, (%2) \n\t" - - : - : - "r" (&alpha), // 0 - "r" (ybuffer), // 1 - "r" (y_ptr) // 2 - : - "%xmm10", "%xmm11", "%xmm12", - "memory" - ); - - y_ptr += 4; - - } - 
else - { - *y_ptr += ybuffer[0]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[2]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[3]*alpha; - y_ptr += inc_y; - } + yp += 4; + } + if ( n1 > 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; } for( i = 0; i < n2 ; i++) { + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += 1 * lda; - *y_ptr += ybuffer[0]*alpha; + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; y_ptr += inc_y; } From bc99faef1bf2e1a98e99dcf6cfba2ea58ae0a56e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 31 Aug 2014 14:33:15 +0200 Subject: [PATCH 04/44] optimized sgemv_t_4.c for uneven sizes --- kernel/x86_64/sgemv_t_4.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index cefbaccd4..76187b57d 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -273,28 +273,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } if ( m3 == 0 ) return(0); - xbuffer = buffer; + x_ptr = x; + a_ptr = a; for ( i=0; i< m3; i++ ) { - xbuffer[i] = *x_ptr; - x_ptr += inc_x; - } - j=0; - a_ptr = a; - y_ptr = y; - while ( j < n) - { - FLOAT temp = 0.0; - for( i = 0; i < m3; i++ ) + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + for ( j=0; j Date: Sun, 31 Aug 2014 15:38:18 +0200 Subject: [PATCH 05/44] modified benchmark/gemv.c --- benchmark/gemv.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index e26a36ac1..c5db09d89 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -151,23 +151,26 @@ int MAIN__(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); - if ((n>0) && (n<=to)) has_param_n = 1; + if ((n>0)) has_param_n = 1; } + int tomax = to; + if ( n > tomax ) tomax = n; + if ( has_param_n == 1 ) fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); else fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } From d7f91f8b4f506b0e6071c61164a8e1c7ac8f32e9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 1 Sep 2014 15:07:36 +0200 Subject: [PATCH 06/44] extended gemv.c benchmark --- benchmark/gemv.c | 103 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index c5db09d89..e21868259 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){ blasint inc_x=1,inc_y=1; blasint n=0; int has_param_n = 0; + int has_param_m = 0; int loops = 1; int l; char *p; @@ 
-145,6 +146,9 @@ int MAIN__(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} + + int tomax = to; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); @@ -152,15 +156,18 @@ int MAIN__(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } - int tomax = to; - if ( n > tomax ) tomax = n; - if ( has_param_n == 1 ) - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); - else - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); @@ -180,50 +187,80 @@ int MAIN__(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + if (has_param_m == 0) { - timeg=0; + for(m = from; m <= to; m += step) + { + timeg=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d : ", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } - if ( has_param_n == 0 ) n = m; + for (l=0; l Date: Mon, 1 Sep 2014 15:11:37 +0200 Subject: [PATCH 07/44] optimized sgemv_t_4.c for small sizes --- kernel/x86_64/sgemv_t_4.c | 123 ++++++++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 18 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 76187b57d..ae1279296 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -64,23 +64,63 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif +static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorps %%xmm10 , %%xmm10 \n\t" + "xorps %%xmm11 , %%xmm11 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,4) , %%xmm14 \n\t" // x + "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm11, %%xmm11 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm11, %%xmm11 \n\t" + + "movss %%xmm10, (%2) \n\t" + "movss %%xmm11,4(%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 + "r" (x) // 5 + : "cc", + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + +} + static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; - FLOAT *a0; - a0 = ap; 
- FLOAT temp = 0.0; - - if (n <=0 ) return; -/* - for ( i=0; i< n; i+=4 ) - { - temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - } - *y = temp; -*/ i=0; @@ -259,7 +299,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += n1 * lda4 ; } - for( i = 0; i < n2 ; i++) + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); @@ -276,20 +328,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO x_ptr = x; a_ptr = a; - for ( i=0; i< m3; i++ ) + if ( m3 == 3 ) { - FLOAT xtemp = *x_ptr * alpha; + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + FLOAT *aj = a_ptr; y_ptr = y; for ( j=0; j Date: Tue, 2 Sep 2014 12:42:36 +0200 Subject: [PATCH 08/44] optimized sgemv_t for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/sgemv_t_4.c | 6 +- kernel/x86_64/sgemv_t_microk_bulldozer-4.c | 147 +++++++++++++++++++++ 3 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer-4.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 6318b202c..346315aba 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -11,7 +11,7 @@ SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.c diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index ae1279296..5568b98cc 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" +#elif defined(BULLDOZER) +#include "sgemv_t_microk_bulldozer-4.c" #endif #define NBMAX 4096 @@ -202,9 +204,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "movups (%4,%0,4) , %%xmm11 \n\t" "mulps %%xmm10 , %%xmm12 \n\t" "addq $4 , %0 \n\t" - "addps %%xmm11 , %%xmm12 \n\t" + "addps %%xmm12 , %%xmm11 \n\t" "subq $4 , %1 \n\t" - "movups %%xmm12, -16(%4,%0,4) \n\t" + "movups %%xmm11, -16(%4,%0,4) \n\t" "jnz .L01LOOP%= \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c new file mode 100644 index 000000000..40e318de3 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t" + "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t" + "addq $16, %0 \n\t" + "vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmaddps 
%%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t" + "subq $16, %1 \n\t" + "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From f3b50dcf5b1f06b8d778544f70d8e85e0f445090 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 13:35:41 +0200 Subject: [PATCH 09/44] removed obsolete instructions from sgemv_t_4.c --- kernel/x86_64/sgemv_t_4.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 5568b98cc..2d0648a6c 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -377,8 +377,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr += inc_y; aj += lda; } - x_ptr += inc_x; - a_ptr++ ; return(0); } From 210bec9111c5252dfe600795c3ac63baaa060a9c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 14:11:42 +0200 Subject: [PATCH 10/44] added plot-header to compare multithreading --- benchmark/tplot-header | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 benchmark/tplot-header diff --git a/benchmark/tplot-header b/benchmark/tplot-header new file mode 100644 index 000000000..b7ce7f225 --- /dev/null +++ b/benchmark/tplot-header @@ -0,0 +1,42 @@ +# ********************************************************************************** +# Copyright (c) 2014, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +set term x11 font sans; +set ylabel "MFlops"; +set xlabel "Size"; +set grid xtics; +set grid ytics; +set key left; +set timestamp "generated on %Y-%m-%d by `whoami`" +set title "Sgemv\nTRANS=T\nBulldozer" +plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier; +set output "print.png"; +show title; +show plot; +show output; + + From f4ff889491de5d95d24d9d4edcbd85b0f83ff380 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 16:30:04 +0200 Subject: [PATCH 11/44] updated interface/gemv.c for multithreading --- interface/gemv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 08553ad21..3bcc099a5 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -215,8 +215,9 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_max = num_cpu_avail(2); int nthreads_avail = nthreads_max; + double MNK = (double) m * (double) n; - if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (128.0 * 32.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From d1800397f592226cd0cb933303c09de325034412 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 17:36:07 +0200 Subject: [PATCH 12/44] optimized interface/gemv.c for multithreading --- interface/gemv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 3bcc099a5..64dc641d0 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -215,9 +215,8 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_max = num_cpu_avail(2); int nthreads_avail = nthreads_max; - double MNK = (double) m * (double) n; - if ( MNK <= (128.0 * 32.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (96.0 * 24.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From 0fc560ba239767098f05f2f13161b036b2eb805d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 3 Sep 2014 10:13:47 +0200 Subject: [PATCH 13/44] bugfix for buffer overflow --- kernel/x86_64/sgemv_t_4.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 2d0648a6c..fb0ba9741 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -232,6 +232,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; + BLASLONG n0; BLASLONG n1; BLASLONG m1; BLASLONG m2; @@ -246,7 +247,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO xbuffer = buffer; ytemp = buffer + NBMAX; - n1 = n >> 2 ; + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; n2 = n & 3 ; m3 = m & 3 ; @@ -283,6 +285,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; + + if ( n0 > 0 ) 
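+ /* process whole NBMAX-column blocks first: each inner pass runs
+    nb1 = NBMAX/4 4x4 kernels, writing at most NBMAX floats into
+    ytemp before add_y flushes them into y; the previous single
+    n1 = n >> 2 loop could overrun ytemp whenever n > NBMAX */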
+ { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j Date: Wed, 3 Sep 2014 14:48:45 +0200 Subject: [PATCH 14/44] optimized sgemv_n for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/sgemv_n_4.c | 319 +++++++++++++++++++++++ kernel/x86_64/sgemv_n_microk_nehalem-4.c | 185 +++++++++++++ 3 files changed, 505 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_n_4.c create mode 100644 kernel/x86_64/sgemv_n_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 00c3b4d15..68c741cea 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -9,7 +9,7 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c new file mode 100644 index 000000000..f84016075 --- /dev/null +++ b/kernel/x86_64/sgemv_n_4.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_n_microk_bulldozer-4.c" +#elif defined(NEHALEM) +#include "sgemv_n_microk_nehalem-4.c" +#endif + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x8 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT *b0,*b1,*b2,*b3; + FLOAT *x4; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x4 = x + 4; + + for ( i=0; i< n; i+=4 ) + { + + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + + y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; + y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; + y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; + y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; + + } +} + +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + FLOAT da = *alpha; + for ( i=0; i Date: Wed, 3 Sep 2014 15:34:30 +0200 Subject: [PATCH 15/44] optimized sgemv_n_4.c --- kernel/x86_64/sgemv_n_4.c | 34 ++++++++++++++---------- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 5 +++- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index f84016075..31d841ddd 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -174,9 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n1; BLASLONG m1; BLASLONG m2; + BLASLONG m3; BLASLONG n2; - BLASLONG lda4 = 4 * lda; - BLASLONG lda8 = 8 * lda; + BLASLONG lda4 = lda << 2; + BLASLONG lda8 = lda << 3; FLOAT xbuffer[8],*ybuffer; if ( m < 1 ) return(0); @@ -186,19 +187,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( inc_x == 1 ) { - n1 = n / 8 ; - n2 = n % 8 ; + n1 = n >> 3 ; + n2 = n & 7 ; } else { - n1 = n / 4 ; - n2 = n % 4 ; + n1 = n >> 2 ; + n2 = n & 3 ; } - m1 = m - ( m % 4 ); - m2 = (m % NBMAX) - (m % 4) ; - + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; BLASLONG NB = NBMAX; @@ -237,8 +240,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT 
alpha, FLOAT *a, BLASLO x_ptr += 8; } -/* - for( i = 0; i < n1 ; i++) + + if ( n2 & 4 ) { sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer); ap[0] += lda4; @@ -248,8 +251,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; x_ptr += 4; } -*/ - for( i = 0; i < n2 ; i++) + + for( i = 0; i < ( n2 & 3 ) ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -296,8 +299,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a += NB; y_ptr += NB * inc_y; } + + if ( m3 == 0 ) return; + j=0; - while ( j < (m % 4)) + while ( j < m3 ) { a_ptr = a; x_ptr = x; diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index accc529b3..f87cfa425 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -58,13 +58,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "xorps %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + ".align 2 \n\t" "movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm9 \n\t" "movups (%6,%0,4), %%xmm10 \n\t" "movups (%7,%0,4), %%xmm11 \n\t" + ".align 2 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" "mulps %%xmm14, %%xmm10 \n\t" @@ -78,6 +80,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "movups (%5,%8,4), %%xmm9 \n\t" "movups (%6,%8,4), %%xmm10 \n\t" "movups (%7,%8,4), %%xmm11 \n\t" + ".align 2 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" "mulps %%xmm2 , %%xmm10 \n\t" From 7f910010a08f84b6ed74149f6cdcaaa71ca7f09b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 4 Sep 2014 13:09:27 +0200 Subject: [PATCH 16/44] optimized sgemv_n kernel for small sizes --- kernel/x86_64/sgemv_n_4.c | 41 +++- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 254 +++++++++++++++++++++ 2 files changed, 283 insertions(+), 12 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer-4.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 31d841ddd..f1573dd30 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -185,17 +185,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - if ( inc_x == 1 ) - { - n1 = n >> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } + n1 = n >> 3 ; + n2 = n & 7 ; m3 = m & 3 ; m1 = m & -4 ; @@ -267,6 +258,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[5] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[7] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + } + + if ( n2 & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -284,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; } - for( i = 0; i < n2 ; i++) + for( i = 0; i < ( n2 & 3) ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c new file mode 100644 index 
000000000..53287df75 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -0,0 +1,254 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 + "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 + "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 + "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 + "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "addq $4 , %0 \n\t" + + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "addq $4 , %8 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm6 \n\t" + "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y + + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vmovups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), 
%%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "addq $8 , %8 \n\t" + "subq $8 , %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + ".align 2 \n\t" + "vmovups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + "vmovups 32(%3,%0,4), %%xmm6 \n\t" // 4 * y + "vmovups 48(%3,%0,4), %%xmm7 \n\t" // 4 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" + + "prefetcht0 192(%4,%8,4) \n\t" + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "prefetcht0 192(%6,%8,4) \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" + "addq $16, %0 \n\t" + "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" + + "addq $16, %8 \n\t" + "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y + "subq $16, %1 \n\t" + "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" 
(ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm6 \n\t" + + "vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 53de943690abc2f500ae131627136c9fbd35e541 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 4 Sep 2014 18:55:52 +0200 Subject: [PATCH 17/44] bugfix for sgemv_n_4.c --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/sgemv_n_4.c | 41 ++++++++++------------------------ 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 346315aba..0fd7ac35f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -10,7 +10,7 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index f1573dd30..31d841ddd 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -185,8 +185,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - n1 = n >> 3 ; - n2 = n & 7 ; + if ( inc_x == 1 ) + { + n1 = n >> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } m3 = m & 3 ; m1 = m & -4 ; @@ -258,32 +267,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[4] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[5] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[6] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[7] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - } - - if ( n2 & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -301,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; } - for( i = 0; i < ( n2 & 3) ; i++) + for( i = 0; i < n2 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; From 6df7a8893078e2f9878efeb7212fb7030185cf37 Mon Sep 17 00:00:00 
2001 From: wernsaar Date: Fri, 5 Sep 2014 10:22:50 +0200 Subject: [PATCH 18/44] optimized sgemv_t for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_t_4.c | 2 + kernel/x86_64/sgemv_t_microk_sandy-4.c | 174 +++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_t_microk_sandy-4.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b654d3564..b70486436 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,5 +1,5 @@ SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index fb0ba9741..b89ec7f7f 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_nehalem-4.c" #elif defined(BULLDOZER) #include "sgemv_t_microk_bulldozer-4.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_t_microk_sandy-4.c" #endif #define NBMAX 4096 diff --git a/kernel/x86_64/sgemv_t_microk_sandy-4.c b/kernel/x86_64/sgemv_t_microk_sandy-4.c new file mode 100644 index 000000000..6550518f7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_sandy-4.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm12, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm12, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm12, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm9 , %%xmm6 \n\t" + "subq $4 , %1 \n\t" + "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm9 , %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t" + "addq $16, %0 \n\t" + "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t" + "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t" + "subq $16, %1 \n\t" + "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm2, %%ymm6 \n\t" + "vaddps %%ymm7, %%ymm3, %%ymm7 \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + 
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 2021d0f9d6e155997450ab199d8af7e0a3a8551a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 5 Sep 2014 15:05:53 +0200 Subject: [PATCH 19/44] experimentally removed expensive function calls --- common_x86_64.h | 9 +++++++++ driver/others/parameter.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 0f842ee94..ae9b88718 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -46,6 +46,7 @@ #define __volatile__ #endif +/* #ifdef HAVE_SSE2 #define MB __asm__ __volatile__ ("mfence"); #define WMB __asm__ __volatile__ ("sfence"); @@ -53,6 +54,10 @@ #define MB #define WMB #endif +*/ + +#define MB +#define WMB static void __inline blas_lock(volatile BLASULONG *address){ @@ -99,6 +104,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ : "0" (op)); } +/* + #define WHEREAMI static inline int WhereAmI(void){ @@ -111,6 +118,8 @@ static inline int WhereAmI(void){ return apicid; } +*/ + #ifdef CORE_BARCELONA #define IFLUSH gotoblas_iflush() #define IFLUSH_HALF gotoblas_iflush_half() diff --git a/driver/others/parameter.c b/driver/others/parameter.c index a0a8b5188..c6c7301e8 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -251,7 +251,9 @@ void blas_set_parameter(void){ env_var_t p; int factor; +#if !defined(BULLDOZER) int size = get_L2_size(); +#endif #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) size >>= 7; From a64fe9bcc95b5378d47c424f615da95d38a9ec43 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 08:41:53 +0200 Subject: [PATCH 20/44] added optimized sgemv_n kernel for sandybridge --- driver/others/parameter.c | 4 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_n_4.c | 2 + kernel/x86_64/sgemv_n_microk_sandy-4.c | 322 +++++++++++++++++++++++++ 4 files changed, 328 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_sandy-4.c diff --git a/driver/others/parameter.c b/driver/others/parameter.c index c6c7301e8..f0f889a15 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -251,7 +251,9 @@ void blas_set_parameter(void){ env_var_t p; int factor; -#if !defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) + int size = 16; +#else int size = get_L2_size(); #endif diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b70486436..dfc2882aa 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 31d841ddd..617b1788f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_n_microk_bulldozer-4.c" #elif defined(NEHALEM) #include "sgemv_n_microk_nehalem-4.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_n_microk_sandy-4.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c new file mode 100644 index 000000000..b4caca630 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -0,0 +1,322 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4, %8 \n\t" + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8, %8 \n\t" + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + 
"prefetcht0 192(%4,%8,4) \n\t" + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%8,4) \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %8 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 
192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 77942374759502f740ccd7ec9130e7e790494d3a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 11:01:42 +0200 Subject: [PATCH 21/44] undef WHEREAMI --- common_x86_64.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index ae9b88718..547614f74 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -105,8 +105,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ } /* - #define WHEREAMI +*/ static inline int WhereAmI(void){ int eax, ebx, ecx, edx; @@ -118,7 +118,6 @@ static inline int WhereAmI(void){ return apicid; } -*/ #ifdef CORE_BARCELONA #define IFLUSH gotoblas_iflush() From d143f84dd26219e4a8d62e545a5449d47fe80583 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 12:08:48 +0200 Subject: [PATCH 22/44] added optimized sgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/sgemv_n_4.c | 2 + kernel/x86_64/sgemv_n_microk_haswell-4.c | 271 +++++++++++++++++++++++ 3 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_n_microk_haswell-4.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d0ac9c72f..c2c64939b 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 617b1788f..943dcdefa 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" +#elif defined(HASWELL) +#include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..ed1792245 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -0,0 +1,271 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %8 \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmadd231ps 
(%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "prefetcht0 192(%4,%8,4) \n\t" + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "prefetcht0 192(%6,%8,4) \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %8 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" 
// 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From cf5544b41750fae89ec2c3e83f6ed70ca2d508dc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 13:17:56 +0200 Subject: [PATCH 23/44] optimization for small size --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index ed1792245..a2470a4b7 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -105,41 +105,42 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" + // "prefetcht0 192(%3,%0,4) \n\t" "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - "prefetcht0 192(%4,%0,4) \n\t" + // "prefetcht0 192(%4,%0,4) \n\t" "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" + // "prefetcht0 192(%5,%0,4) \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" + // "prefetcht0 192(%6,%0,4) \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" + // "prefetcht0 192(%7,%0,4) \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - "prefetcht0 192(%4,%8,4) \n\t" + // "prefetcht0 192(%4,%8,4) \n\t" "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "addq $16, %0 \n\t" "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" + // "prefetcht0 192(%5,%8,4) \n\t" "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" + // "prefetcht0 192(%6,%8,4) \n\t" "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" + // "prefetcht0 192(%7,%8,4) \n\t" "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y - "addq $16, %8 \n\t" - "addq $16, %0 \n\t" + "vmovups %%ymm4,-64(%3,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" + "vmovups %%ymm5,-32(%3,%0,4) \n\t" // 8 * y + "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" From 3a7ab47ee95a34d113e68003a37c81eb70d74a6b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 18:34:25 +0200 Subject: [PATCH 24/44] optimized sgemv_t --- kernel/x86_64/sgemv_t_4.c | 69 ++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index b89ec7f7f..e4476080a 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -80,9 +80,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT ( "xorps %%xmm10 , %%xmm10 \n\t" "xorps %%xmm11 , %%xmm11 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" + + "testq $4 , %1 \n\t" + "jz .L01LABEL%= \n\t" "movups (%5,%0,4) , %%xmm14 
\n\t" // x "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 @@ -94,8 +94,36 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "subq $4 , %1 \n\t" "addps %%xmm13 , %%xmm11 \n\t" + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,4) , %%xmm14 \n\t" // x + "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "movups 16(%5,%0,4) , %%xmm14 \n\t" // x + "movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" "jnz .L01LOOP%= \n\t" + ".L01END%=: \n\t" + "haddps %%xmm10, %%xmm10 \n\t" "haddps %%xmm11, %%xmm11 \n\t" "haddps %%xmm10, %%xmm10 \n\t" @@ -113,7 +141,8 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "r" (ap1), // 4 "r" (x) // 5 : "cc", - "%xmm10", "%xmm11", "%xmm12", + "%xmm4", "%xmm5", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -130,10 +159,11 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __asm__ __volatile__ ( + "xorps %%xmm9 , %%xmm9 \n\t" "xorps %%xmm10 , %%xmm10 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" + + "testq $4 , %1 \n\t" + "jz .L01LABEL%= \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups (%4,%0,4) , %%xmm11 \n\t" @@ -142,8 +172,30 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addps %%xmm12 , %%xmm10 \n\t" "subq $4 , %1 \n\t" + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,4) , %%xmm12 \n\t" + "movups 16(%3,%0,4) , %%xmm14 \n\t" + "movups (%4,%0,4) , %%xmm11 \n\t" + "movups 16(%4,%0,4) , %%xmm13 \n\t" + "mulps %%xmm11 , %%xmm12 \n\t" + "mulps %%xmm13 , %%xmm14 \n\t" + "addq $8 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $8 , %1 \n\t" + "addps %%xmm14 , %%xmm9 \n\t" + "jnz .L01LOOP%= \n\t" + ".L01END%=: \n\t" + + "addps %%xmm9 , %%xmm10 \n\t" "haddps %%xmm10, %%xmm10 \n\t" "haddps %%xmm10, %%xmm10 \n\t" @@ -157,7 +209,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "r" (ap), // 3 "r" (x) // 4 : "cc", - "%xmm10", "%xmm11", "%xmm12", + "%xmm9", "%xmm10" , + "%xmm11", "%xmm12", "%xmm13", "%xmm14", "memory" ); From c8eaf3ae2d19a60039f55e5579c44329ff2d3000 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 19:41:57 +0200 Subject: [PATCH 25/44] optimized sgemv_t_4 kernel for very small sizes --- kernel/x86_64/sgemv_t_4.c | 98 +++++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index e4476080a..692dd536d 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -423,14 +423,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *aj = a_ptr; y_ptr = y; - for ( j=0; j Date: Sat, 6 Sep 2014 21:28:57 +0200 Subject: [PATCH 26/44] better optimzations for sgemv_t kernel --- kernel/x86_64/sgemv_t_4.c | 113 +++++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 692dd536d..920322c4f 100644 --- 
a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -446,12 +446,45 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { - for ( j=0; j Date: Sun, 7 Sep 2014 13:45:03 +0200 Subject: [PATCH 27/44] optimizations for very small sizes --- kernel/x86_64/sgemv_n_4.c | 218 +++++++++++++++++---- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 65 +++--- 2 files changed, 216 insertions(+), 67 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 943dcdefa..ee762ffce 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -44,12 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef HAVE_KERNEL_4x8 -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; FLOAT *b0,*b1,*b2,*b3; FLOAT *x4; + FLOAT x[8]; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; @@ -60,6 +61,9 @@ static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLON b3 = a3 + lda4 ; x4 = x + 4; + for ( i=0; i<8; i++) + x[i] = xo[i] * *alpha; + for ( i=0; i< n; i+=4 ) { @@ -81,15 +85,19 @@ static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLON #ifndef HAVE_KERNEL_4x4 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4]; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; @@ -101,32 +109,147 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +#ifndef HAVE_KERNEL_4x2 + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movss (%2) , %%xmm12 \n\t" // x0 + "movss (%6) , %%xmm4 \n\t" // alpha + "movss 4(%2) , %%xmm13 \n\t" // x1 + "mulss %%xmm4 , %%xmm12 \n\t" // alpha + "mulss %%xmm4 , %%xmm13 \n\t" // alpha + "shufps $0, %%xmm12, %%xmm12 \n\t" + "shufps $0, %%xmm13, %%xmm13 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "movups (%4,%0,4), %%xmm8 \n\t" + "movups (%5,%0,4), %%xmm9 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm9 , %%xmm4 \n\t" + + "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y + + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void 
sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "movss (%2), %%xmm12 \n\t" // x0 + "mulss (%6), %%xmm12 \n\t" // alpha + "shufps $0, %%xmm12, %%xmm12 \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a + "movups 16(%4,%0,4), %%xmm9 \n\t" // 4 * a + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm12, %%xmm9 \n\t" + "addps %%xmm4 , %%xmm8 \n\t" + "addps %%xmm5 , %%xmm9 \n\t" + + "addq $8 , %0 \n\t" + "movups %%xmm8 , -32(%3,%0,4) \n\t" // 4 * y + "movups %%xmm9 , -16(%3,%0,4) \n\t" // 4 * y + + "subq $8 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "testq $0x04, %5 \n\t" + "jz .L08LABEL%= \n\t" + + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "movups %%xmm4 , (%3,%0,4) \n\t" // 4 * y + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (n2), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + } - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) __attribute__ ((noinline)); -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) +#endif + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; if ( inc_dest != 1 ) { - FLOAT da = *alpha; for ( i=0; i Date: Sun, 7 Sep 2014 18:23:48 +0200 Subject: [PATCH 28/44] optimized sgemv_n for very small size of m --- kernel/x86_64/sgemv_n_4.c | 148 +++++++++++++++++++++++++++++++++++--- 1 file changed, 138 insertions(+), 10 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index ee762ffce..0135306af 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -438,25 +438,153 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } - if ( m3 == 0 ) return; + if ( m3 == 0 ) return(0); - j=0; - while ( j < m3 ) + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + 
return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) { a_ptr = a; x_ptr = x; FLOAT temp = 0.0; - for( i = 0; i < n; i++ ) + if ( lda == 1 && inc_x ==1 ) { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } y_ptr[0] += alpha * temp; - y_ptr += inc_y; - a++; - j++; + return(0); } + + return(0); } From 7b3932b3f348e88cfd9462463bb5ac1f6a5d3a8e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 19:20:08 +0200 Subject: [PATCH 29/44] optimized sgemv_n kernel for nehalem --- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 42 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index f87cfa425..77a1b11aa 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
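/* What the alpha-aware signatures below compute, as a portable scalar
   sketch (illustrative, not part of the patch; assumes ap[0..3] address
   four columns and n is a multiple of 4, as in the asm):

       for (i = 0; i < n; i++)
           y[i] += alpha * ( ap0[i]*x[0] + ap1[i]*x[1]
                           + ap2[i]*x[2] + ap3[i]*x[3] );

   The 4x8 variant does the same with eight broadcast x values, the second
   four taken lda4 elements further on.  Passing alpha into the kernel lets
   the SSE code broadcast it once (movss + shufps into xmm6) and fold it in
   with a single extra mulps per 4-wide block, so y no longer has to be
   rescaled in a separate add_y pass afterwards. */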
#define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -55,11 +55,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "shufps $0, %%xmm2 , %%xmm2 \n\t" "shufps $0, %%xmm3 , %%xmm3 \n\t" + "movss (%9), %%xmm6 \n\t" // alpha + "shufps $0, %%xmm6 , %%xmm6 \n\t" + ".align 16 \n\t" ".L01LOOP%=: \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y ".align 2 \n\t" "movups (%4,%0,4), %%xmm8 \n\t" @@ -85,16 +89,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "mulps %%xmm1 , %%xmm9 \n\t" "mulps %%xmm2 , %%xmm10 \n\t" "mulps %%xmm3 , %%xmm11 \n\t" - "addq $4 , %8 \n\t" "addps %%xmm8 , %%xmm4 \n\t" "addps %%xmm9 , %%xmm5 \n\t" - "addq $4 , %0 \n\t" "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "subq $4 , %1 \n\t" - "addps %%xmm4 , %%xmm5 \n\t" - "movups %%xmm5 , -16(%3,%0,4) \n\t" // 4 * y + "addq $4 , %8 \n\t" + "addps %%xmm5 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "mulps %%xmm6 , %%xmm4 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y "jnz .L01LOOP%= \n\t" @@ -108,11 +115,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 + "r" (lda4), // 8 + "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -124,9 +133,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO #define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; @@ -142,9 +151,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" + "movss (%8), %%xmm6 \n\t" // alpha + "shufps $0, %%xmm6 , %%xmm6 \n\t" + ".align 16 \n\t" ".L01LOOP%=: \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "xorps %%xmm4 , %%xmm4 \n\t" + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm9 \n\t" @@ -161,6 +174,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addps %%xmm10 , %%xmm4 \n\t" "addps %%xmm4 , %%xmm11 \n\t" + "mulps %%xmm6 , %%xmm11 \n\t" + "addps %%xmm7 , %%xmm11 \n\t" "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y "jnz .L01LOOP%= \n\t" @@ -174,7 +189,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (ap[3]), // 7 + "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", "%xmm6", 
"%xmm7", From 553e2754077eb0b2cb8782e8ecdb5e6eb9c8366b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 20:53:30 +0200 Subject: [PATCH 30/44] optimized sgemv_n kernel for sandybridge --- kernel/x86_64/sgemv_n_microk_sandy-4.c | 112 +++++++++++++++++-------- 1 file changed, 79 insertions(+), 33 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index b4caca630..44c2b3f2b 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -29,9 +29,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -48,61 +48,75 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4, %8 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" - ".L08LABEL%=: \n\t" + ".L08LABEL%=: \n\t" "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + 
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8, %8 \n\t" "addq $8, %0 \n\t" @@ -117,8 +131,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" @@ -164,6 +178,12 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + + "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y @@ -185,11 +205,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 + "r" (lda4), // 8 + "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -201,9 +223,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO #define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; @@ -216,21 +238,29 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4, %0 \n\t" "subq $4, %1 \n\t" @@ -240,18 +270,24 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 
8 * y "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8, %0 \n\t" "subq $8, %1 \n\t" @@ -265,8 +301,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y "prefetcht0 192(%4,%0,4) \n\t" "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" @@ -290,8 +328,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + + "vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t" + "vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t" + + "vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y "addq $16, %0 \n\t" "subq $16, %1 \n\t" @@ -309,8 +353,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (ap[3]), // 7 + "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", From 80f77868758a6e99c2716dd12d8bfb63d6ed015f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 21:13:57 +0200 Subject: [PATCH 31/44] enabled optimized sgemv kernels for piledriver --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/KERNEL.PILEDRIVER | 4 ++-- kernel/x86_64/sgemv_t_4.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index c2c64939b..d0ac9c72f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n_4.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 146a8768b..4f15e5a36 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,5 +1,5 @@ -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 920322c4f..3316473af 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
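/* Background for the one-line dispatch change below (illustrative sketch,
   not part of the patch): the Bulldozer micro-kernel is built on the
   4-operand FMA4 instruction vfmaddps, e.g.

       vfmaddps %xmm4, (%r8,%rax,4), %xmm12, %xmm4   // xmm4 += a[i..i+3] * x0

   (registers here are illustrative only).  Piledriver implements FMA4 as
   well, so the identical file can be reused for it; only the #elif
   condition has to be widened. */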
#if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" From 2be5c7a640488796d98ac3cbb44004a39491da7f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 21:48:42 +0200 Subject: [PATCH 32/44] bugfix for windows --- kernel/x86_64/sgemv_n_microk_sandy-4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index 44c2b3f2b..c162eeeb6 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -357,7 +357,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", + "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" From cbbc80aad3586900443ed7fef1d0ff1814a80e9a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 10:13:39 +0200 Subject: [PATCH 33/44] added optimized sgemv_t kernel for haswell --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 275 ++++++++++++----------- kernel/x86_64/sgemv_t_microk_haswell-4.c | 148 ++++++++++++ 2 files changed, 298 insertions(+), 125 deletions(-) create mode 100644 kernel/x86_64/sgemv_t_microk_haswell-4.c diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index a2470a4b7..1e4498d9e 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -47,10 +47,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" @@ -64,6 +67,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y @@ -76,7 +81,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" @@ -90,6 +96,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , 
%%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y @@ -105,42 +114,160 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - // "prefetcht0 192(%3,%0,4) \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - // "prefetcht0 192(%4,%0,4) \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - // "prefetcht0 192(%5,%0,4) \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - // "prefetcht0 192(%6,%0,4) \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - // "prefetcht0 192(%7,%0,4) \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - // "prefetcht0 192(%4,%8,4) \n\t" "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" - "addq $16, %0 \n\t" + "addq $16, %0 \n\t" "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - // "prefetcht0 192(%5,%8,4) \n\t" "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - // "prefetcht0 192(%6,%8,4) \n\t" "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - // "prefetcht0 192(%7,%8,4) \n\t" "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" - "addq $16, %8 \n\t" - "vmovups %%ymm4,-64(%3,%0,4) \n\t" // 8 * y - "subq $16, %1 \n\t" - "vmovups %%ymm5,-32(%3,%0,4) \n\t" // 8 * y + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + "addq $16, %8 \n\t" + "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y + "subq $16, %1 \n\t" + "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + 
"addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" @@ -156,113 +283,11 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -#define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); - -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - - "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" - - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y - - "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm4 \n\t" - - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - ".L08LABEL%=: \n\t" - - "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" - - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - - ".L16LABEL%=: \n\t" - - "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" - - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - - "prefetcht0 192(%4,%0,4) \n\t" - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" 
- "prefetcht0 192(%6,%0,4) \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y - - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" - - ".L16END%=: \n\t" - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c new file mode 100644 index 000000000..016cb35e7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 7c0a94ff472bcb01d666ca0bd6975c0b24267680 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 10:54:33 +0200 Subject: [PATCH 34/44] bugfix in sgemv_n_microk_haswell-4.c --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 1e4498d9e..8f56655a9 
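The reduction tail of the sgemv_t kernel above (.L16END) folds each 8-lane accumulator to a single dot product: vextractf128 pulls the high 128 bits so they can be added onto the low half, then two vhaddps steps collapse the remaining four lanes, and vmovss stores one scalar per column into y[0..3]. An equivalent sketch in AVX intrinsics, offered for reference only; the helper name is illustrative:

#include <immintrin.h>

/* Fold one ymm accumulator to the scalar sum of its eight float lanes,
   mirroring the vextractf128 + 2x vhaddps sequence in the kernel above. */
static float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);    /* lanes 0..3                 */
    __m128 hi = _mm256_extractf128_ps(v, 1);  /* lanes 4..7 (vextractf128)  */
    __m128 s  = _mm_add_ps(lo, hi);           /* 8 lanes -> 4 partial sums  */
    s = _mm_hadd_ps(s, s);                    /* 4 -> 2 (first vhaddps)     */
    s = _mm_hadd_ps(s, s);                    /* 2 -> 1 (second vhaddps)    */
    return _mm_cvtss_f32(s);                  /* scalar result (vmovss)     */
}

Note that alpha is not applied inside this micro-kernel (its signature takes no alpha); the sgemv_t driver scales the four partial results when merging them into y.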
100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -248,6 +248,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT ".align 16 \n\t" ".L01LOOP%=: \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y From c4d9d4e5f8319a743df17564c4bf1a1a0c3670e2 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 12:25:16 +0200 Subject: [PATCH 35/44] added haswell optimized kernel --- kernel/x86_64/sgemv_t_4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 3316473af..b0e883252 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,6 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" +#elif defined(HASWELL) +#include "sgemv_t_microk_haswell-4.c" #endif #define NBMAX 4096 From f511807fc07e4e62f07b4a880d3196b860796bec Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 12:27:32 +0200 Subject: [PATCH 36/44] modified multithreading threshold --- interface/gemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 64dc641d0..2dd82dce5 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_avail = nthreads_max; double MNK = (double) m * (double) n; - if ( MNK <= (96.0 * 24.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From 658939faaada12ab40334f986a665d28eef2ef19 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 15:22:35 +0200 Subject: [PATCH 37/44] optimized dgemv_n kernel for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/dgemv_n_4.c | 546 +++++++++++++++++++++++ kernel/x86_64/dgemv_n_microk_nehalem-4.c | 265 +++++++++++ 3 files changed, 812 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemv_n_4.c create mode 100644 kernel/x86_64/dgemv_n_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 68c741cea..8feef5c31 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -11,7 +11,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_4.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c new file mode 100644 index 000000000..249df8009 --- /dev/null +++ b/kernel/x86_64/dgemv_n_4.c @@ -0,0 +1,546 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(NEHALEM) +#include "dgemv_n_microk_nehalem-4.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x8 + +static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT *b0,*b1,*b2,*b3; + FLOAT *x4; + FLOAT x[8]; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x4 = x + 4; + + for ( i=0; i<8; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + + y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; + y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; + y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; + y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; + + } +} + +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4]; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2) , %%xmm12 \n\t" // x0 + "movsd (%6) , %%xmm4 \n\t" // alpha + "movsd 8(%2) , %%xmm13 \n\t" // x1 + "mulsd %%xmm4 , %%xmm12 \n\t" // alpha + "mulsd %%xmm4 , %%xmm13 \n\t" // alpha + "shufpd $0, %%xmm12, %%xmm12 \n\t" + "shufpd $0, %%xmm13, %%xmm13 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + + "movups (%4,%0,8), %%xmm8 \n\t" + "movups 
(%5,%0,8), %%xmm9 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm5 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + + "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y + "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "mulsd (%5), %%xmm12 \n\t" // alpha + "shufpd $0, %%xmm12, %%xmm12 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a + "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm12, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + + "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y + "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + 
ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c new file mode 100644 index 000000000..e311326f1 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c @@ -0,0 +1,265 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "movsd 8(%2), %%xmm13 \n\t" // x1 + "movsd 16(%2), %%xmm14 \n\t" // x2 + "movsd 24(%2), %%xmm15 \n\t" // x3 + "shufpd $0, %%xmm12, %%xmm12\n\t" + "shufpd $0, %%xmm13, %%xmm13\n\t" + "shufpd $0, %%xmm14, %%xmm14\n\t" + "shufpd $0, %%xmm15, %%xmm15\n\t" + + "movsd 32(%2), %%xmm0 \n\t" // x4 + "movsd 40(%2), %%xmm1 \n\t" // x5 + "movsd 48(%2), %%xmm2 \n\t" // x6 + "movsd 56(%2), %%xmm3 \n\t" // x7 + "shufpd $0, %%xmm0 , %%xmm0 \n\t" + "shufpd $0, %%xmm1 , %%xmm1 \n\t" + "shufpd $0, %%xmm2 , %%xmm2 \n\t" + "shufpd $0, %%xmm3 , %%xmm3 \n\t" + + "movsd (%9), %%xmm6 \n\t" // alpha + "shufpd $0, %%xmm6 , %%xmm6 \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + + ".align 2 \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "movups (%4,%8,8), %%xmm8 \n\t" + "movups (%5,%8,8), %%xmm9 \n\t" + "movups (%6,%8,8), %%xmm10 \n\t" + "movups (%7,%8,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm0 , %%xmm8 \n\t" + "mulpd %%xmm1 , %%xmm9 \n\t" + "mulpd %%xmm2 , %%xmm10 \n\t" + "mulpd %%xmm3 , %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "addpd %%xmm5 , %%xmm4 \n\t" + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + + ".align 2 \n\t" + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 
\n\t" + "movups 16(%6,%0,8), %%xmm10 \n\t" + "movups 16(%7,%0,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "movups 16(%4,%8,8), %%xmm8 \n\t" + "movups 16(%5,%8,8), %%xmm9 \n\t" + "movups 16(%6,%8,8), %%xmm10 \n\t" + "movups 16(%7,%8,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm0 , %%xmm8 \n\t" + "mulpd %%xmm1 , %%xmm9 \n\t" + "mulpd %%xmm2 , %%xmm10 \n\t" + "mulpd %%xmm3 , %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "addq $4 , %8 \n\t" + "addpd %%xmm5 , %%xmm4 \n\t" + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "movsd 8(%2), %%xmm13 \n\t" // x1 + "movsd 16(%2), %%xmm14 \n\t" // x2 + "movsd 24(%2), %%xmm15 \n\t" // x3 + "shufpd $0, %%xmm12, %%xmm12\n\t" + "shufpd $0, %%xmm13, %%xmm13\n\t" + "shufpd $0, %%xmm14, %%xmm14\n\t" + "shufpd $0, %%xmm15, %%xmm15\n\t" + + "movsd (%8), %%xmm6 \n\t" // alpha + "shufpd $0, %%xmm6 , %%xmm6 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm11 \n\t" + + "mulpd %%xmm6 , %%xmm11 \n\t" + "addpd %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "movups 16(%6,%0,8), %%xmm10 \n\t" + "movups 16(%7,%0,8), %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm11 \n\t" + + "mulpd %%xmm6 , %%xmm11 \n\t" + "addpd %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" 
(alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From cd34e9701b552bff542c335943aec70e159037ba Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 19:15:31 +0200 Subject: [PATCH 38/44] removed obsolete files --- kernel/x86_64/sgemv_n_avx.c | 218 ----------- kernel/x86_64/sgemv_n_microk_bulldozer.c | 451 --------------------- kernel/x86_64/sgemv_n_microk_haswell.c | 461 ---------------------- kernel/x86_64/sgemv_n_microk_sandy.c | 473 ----------------------- kernel/x86_64/sgemv_t_avx.c | 232 ----------- kernel/x86_64/sgemv_t_microk_bulldozer.c | 99 ----- kernel/x86_64/sgemv_t_microk_haswell.c | 100 ----- kernel/x86_64/sgemv_t_microk_sandy.c | 106 ----- 8 files changed, 2140 deletions(-) delete mode 100644 kernel/x86_64/sgemv_n_avx.c delete mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer.c delete mode 100644 kernel/x86_64/sgemv_n_microk_haswell.c delete mode 100644 kernel/x86_64/sgemv_n_microk_sandy.c delete mode 100644 kernel/x86_64/sgemv_t_avx.c delete mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer.c delete mode 100644 kernel/x86_64/sgemv_t_microk_haswell.c delete mode 100644 kernel/x86_64/sgemv_t_microk_sandy.c diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c deleted file mode 100644 index 57aaad4b4..000000000 --- a/kernel/x86_64/sgemv_n_avx.c +++ /dev/null @@ -1,218 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - - -#include "common.h" - -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_n_microk_bulldozer.c" -#elif defined(HASWELL) -#include "sgemv_n_microk_haswell.c" -#else -#include "sgemv_n_microk_sandy.c" -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for ( i=0; i 0 ) - { - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(n2,x_ptr,xbuffer,inc_x); - - a_ptr = a + n1 * 512 * lda; - y_ptr = y; - - for(i = 0; i rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" 
(n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero - "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero - "vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero - "vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp - "vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha - "vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha - "vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha - "vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - "vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha - "vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha - - "vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - ); - -} - -static void sgemv_kernel_16( long n, float 
alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - - "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - - "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), 
%%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_n_microk_haswell.c b/kernel/x86_64/sgemv_n_microk_haswell.c deleted file mode 100644 index 9db3869d2..000000000 --- a/kernel/x86_64/sgemv_n_microk_haswell.c +++ /dev/null @@ -1,461 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*2; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp - "vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp - "vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp - "vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi 
\n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - 
"vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", 
"%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vmulss 0*4(%%rsi), %%xmm0, 
%%xmm4 \n\t" // multiply a and c and add to temp - "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_n_microk_sandy.c b/kernel/x86_64/sgemv_n_microk_sandy.c deleted file mode 100644 index 9bdb06600..000000000 --- a/kernel/x86_64/sgemv_n_microk_sandy.c +++ /dev/null @@ -1,473 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*2; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp - "vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp - "vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp - "vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups 
%%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq 
%5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - 
"leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_t_avx.c b/kernel/x86_64/sgemv_t_avx.c deleted file mode 100644 index 55fb3d623..000000000 --- a/kernel/x86_64/sgemv_t_avx.c +++ /dev/null @@ -1,232 +0,0 @@ 
-/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" - -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_t_microk_bulldozer.c" -#elif defined(HASWELL) -#include "sgemv_t_microk_haswell.c" -#else -#include "sgemv_t_microk_sandy.c" -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for ( i=0; i= 16 ) - { - if ( m2 & Mblock) - { - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(Mblock,x_ptr,xbuffer,inc_x); - - y_ptr = y; - a_ptrl = a_ptr; - - for(i = 0; i rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , 
%%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/kernel/x86_64/sgemv_t_microk_haswell.c b/kernel/x86_64/sgemv_t_microk_haswell.c deleted file mode 100644 index ecb9845bb..000000000 --- a/kernel/x86_64/sgemv_t_microk_haswell.c +++ /dev/null @@ -1,100 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - //n = n / 16; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp - "vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp - "vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp - "vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" - "vaddss (%%rdx), %%xmm12,%%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/kernel/x86_64/sgemv_t_microk_sandy.c b/kernel/x86_64/sgemv_t_microk_sandy.c deleted file mode 100644 index 4ecd6d3d0..000000000 --- a/kernel/x86_64/sgemv_t_microk_sandy.c +++ /dev/null @@ -1,106 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - //n = n / 16; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp - "vmulps 4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp - "vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm8 , %%xmm12\n\t" - "vaddps %%xmm13, %%xmm9 , %%xmm13\n\t" - "vaddps %%xmm14, %%xmm10, %%xmm14\n\t" - "vaddps %%xmm15, %%xmm11, %%xmm15\n\t" - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t" - "vaddss (%%rdx), %%xmm12, %%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - From 44f2bf9bae7b25356fc0179d6b935de4edadc637 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 13:34:22 +0200 Subject: [PATCH 39/44] added optimized dgemv_t kernel for haswell --- kernel/x86_64/dgemv_t_4.c | 623 +++++++++++++++++++++++ 
 kernel/x86_64/dgemv_t_microk_haswell-4.c | 127 +++++ 2 files changed, 750 insertions(+) create mode 100644 kernel/x86_64/dgemv_t_4.c create mode 100644 kernel/x86_64/dgemv_t_microk_haswell-4.c diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c new file mode 100644 index 000000000..0d0409bec --- /dev/null +++ b/kernel/x86_64/dgemv_t_4.c @@ -0,0 +1,623 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(NEHALEM) +#include "dgemv_t_microk_nehalem-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) +#include "dgemv_t_microk_bulldozer-4.c" +#elif defined(SANDYBRIDGE) +#include "dgemv_t_microk_sandy-4.c" +#elif defined(HASWELL) +#include "dgemv_t_microk_haswell-4.c" +#endif +*/ + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorpd %%xmm10 , %%xmm10 \n\t" + "xorpd %%xmm11 , %%xmm11 \n\t" + + "testq $2 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%5,%0,8) , %%xmm14 \n\t" // x + "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addq $2 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $2 , %1 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,8) , %%xmm14 \n\t" // x + "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + "movups 16(%5,%0,8) , %%xmm14 \n\t" // x + "movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "haddpd %%xmm10, %%xmm10 \n\t" + "haddpd %%xmm11, %%xmm11 \n\t" + + "movsd %%xmm10, (%2) \n\t" + "movsd %%xmm11,8(%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 + "r" (x) // 5 + : "cc", + "%xmm4", "%xmm5", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorpd %%xmm9 , %%xmm9 \n\t" + "xorpd %%xmm10 , %%xmm10 \n\t" + + "testq $2 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%3,%0,8) , %%xmm12 \n\t" + "movups (%4,%0,8) , %%xmm11 \n\t" + "mulpd %%xmm11 , %%xmm12 \n\t" + "addq $2 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $2 , %1 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,8) , %%xmm12 
\n\t" + "movups 16(%3,%0,8) , %%xmm14 \n\t" + "movups (%4,%0,8) , %%xmm11 \n\t" + "movups 16(%4,%0,8) , %%xmm13 \n\t" + "mulpd %%xmm11 , %%xmm12 \n\t" + "mulpd %%xmm13 , %%xmm14 \n\t" + "addq $4 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + "addpd %%xmm14 , %%xmm9 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "addpd %%xmm9 , %%xmm10 \n\t" + "haddpd %%xmm10, %%xmm10 \n\t" + + "movsd %%xmm10, (%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 + : "cc", + "%xmm9", "%xmm10" , + "%xmm11", "%xmm12", "%xmm13", "%xmm14", + "memory" + ); + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j Date: Tue, 9 Sep 2014 13:54:55 +0200 Subject: [PATCH 40/44] added optimized gemv kernels --- kernel/x86_64/KERNEL.HASWELL | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d0ac9c72f..8aab560c4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,8 +1,8 @@ -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c -DGEMVTKERNEL = dgemv_t.c +DGEMVNKERNEL = dgemv_n_4.c +DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c ZGEMVTKERNEL = zgemv_t.c From debc6d1a056f7d2763dfb4de01cf1b4780a2536b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 14:04:44 +0200 Subject: [PATCH 41/44] bugfix in KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 8aab560c4..9a5c54ffc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,7 +1,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c 
-DGEMVNKERNEL = dgemv_n_4.c +DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c From 8109d8232c66c4119044fa3111947d88afae9eb6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 14:38:08 +0200 Subject: [PATCH 42/44] optimized dgemv_t kernel for haswell --- kernel/x86_64/dgemv_t_4.c | 10 +--------- kernel/x86_64/dgemv_t_microk_haswell-4.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 0d0409bec..ebec7d2c3 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,17 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -/* -#if defined(NEHALEM) -#include "dgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) -#include "dgemv_t_microk_bulldozer-4.c" -#elif defined(SANDYBRIDGE) -#include "dgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) +#if defined(HASWELL) #include "dgemv_t_microk_haswell-4.c" #endif -*/ #define NBMAX 2048 diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c index 410225500..33b43515d 100644 --- a/kernel/x86_64/dgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c @@ -61,25 +61,25 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 384(%2,%0,8) \n\t" + // "prefetcht0 384(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x - "prefetcht0 384(%4,%0,8) \n\t" + // "prefetcht0 384(%4,%0,8) \n\t" "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" - "prefetcht0 384(%5,%0,8) \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "prefetcht0 384(%6,%0,8) \n\t" + // "prefetcht0 384(%5,%0,8) \n\t" "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" - "prefetcht0 384(%7,%0,8) \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm6 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm13, %%ymm7 \n\t" + // "prefetcht0 384(%6,%0,8) \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "addq $8 , %0 \n\t" + // "prefetcht0 384(%7,%0,8) \n\t" + "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" From faab7a181d72023c11b098da9cabc49a2ae3701d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 15:32:32 +0200 Subject: [PATCH 43/44] added optimized dgemv_n kernel for haswell --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 247 +++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 kernel/x86_64/dgemv_n_microk_haswell-4.c diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..2c77f3469 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + + "vbroadcastsd (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L8LABEL%= \n\t" + + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L8LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + 
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "addq $8 , %8 \n\t" + "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y + "subq $8 , %1 \n\t" + "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + + "vbroadcastsd (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L8LABEL%= \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L8LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L8END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L8END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 
baa46e4fba439a4dea4eed9fe82d0cd164f77a5a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 16:17:45 +0200 Subject: [PATCH 44/44] added and tested optimized dgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/dgemv_n_4.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9a5c54ffc..8aab560c4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,7 +1,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 249df8009..371fd73ee 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" +#elif defined(HASWELL) +#include "dgemv_n_microk_haswell-4.c" #endif
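
For reference, the computation performed by the new dgemv_n micro-kernel's inline assembly can be sketched in portable AVX2/FMA intrinsics. This is an illustrative rendering only: the function name and signature below are not part of the patches, and n is assumed to be a multiple of 4 (the real kernels handle remainders separately).

#include <immintrin.h>

/* Sketch of the dgemv_n 4x4 step: y[0..n) += alpha * sum_j ap[j] * x[j].
 * Each x element is broadcast once, then whole 4-double column slices are
 * fused-multiply-added into an accumulator, as the ymm code does. */
static void dgemv_n_4x4_sketch(long n, double **ap, double *x, double *y, double alpha)
{
	__m256d x0 = _mm256_broadcast_sd(&x[0]);   /* like vbroadcastsd */
	__m256d x1 = _mm256_broadcast_sd(&x[1]);
	__m256d x2 = _mm256_broadcast_sd(&x[2]);
	__m256d x3 = _mm256_broadcast_sd(&x[3]);
	__m256d va = _mm256_broadcast_sd(&alpha);

	for (long i = 0; i < n; i += 4) {
		__m256d acc = _mm256_setzero_pd();                          /* vxorpd      */
		__m256d yv  = _mm256_loadu_pd(&y[i]);                       /* 4 * y       */
		acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[0][i]), x0, acc); /* vfmadd231pd */
		acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[1][i]), x1, acc);
		acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[2][i]), x2, acc);
		acc = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[3][i]), x3, acc);
		yv = _mm256_fmadd_pd(va, acc, yv);                          /* y += alpha*acc */
		_mm256_storeu_pd(&y[i], yv);
	}
}

The hand-written 4x8 variant additionally covers eight columns per pass and interleaves the pointer bookkeeping with the arithmetic; the sketch above only captures the data flow.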
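The transposed kernels in dgemv_t_microk_haswell-4.c work the other way around: each pass accumulates four dot products A(:,j)^T * x in vector registers and reduces them horizontally once at the end. A hedged intrinsics sketch of that pattern follows (again with an illustrative name, n a multiple of 4, and the scalar tail omitted).

#include <immintrin.h>

/* Sketch: y[j] = dot(ap[j], x) for j = 0..3, accumulated vector-wide and
 * summed horizontally at the end, mirroring the vfmadd231pd loop plus the
 * final reduction in the patched micro-kernel. */
static void dgemv_t_4x4_sketch(long n, double **ap, double *x, double *y)
{
	__m256d s0 = _mm256_setzero_pd(), s1 = _mm256_setzero_pd();
	__m256d s2 = _mm256_setzero_pd(), s3 = _mm256_setzero_pd();

	for (long i = 0; i < n; i += 4) {
		__m256d xv = _mm256_loadu_pd(&x[i]);                    /* 4 * x */
		s0 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[0][i]), xv, s0);
		s1 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[1][i]), xv, s1);
		s2 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[2][i]), xv, s2);
		s3 = _mm256_fmadd_pd(_mm256_loadu_pd(&ap[3][i]), xv, s3);
	}

	double t[4];                                                /* horizontal sums */
	_mm256_storeu_pd(t, s0); y[0] = t[0] + t[1] + t[2] + t[3];
	_mm256_storeu_pd(t, s1); y[1] = t[0] + t[1] + t[2] + t[3];
	_mm256_storeu_pd(t, s2); y[2] = t[0] + t[1] + t[2] + t[3];
	_mm256_storeu_pd(t, s3); y[3] = t[0] + t[1] + t[2] + t[3];
}

PATCH 42 is worth reading in this light: it comments out the prefetcht0 instructions and interleaves the addq/subq pointer updates between FMA pairs, presumably because Haswell's hardware prefetcher already covers these streaming accesses, so the explicit prefetches only cost load-port slots.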
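Both new transposed drivers (sgemv_t_4.c and dgemv_t_4.c) wrap these micro-kernels in an NBMAX-sized blocking loop so that the packed copy of x stays cache-resident while every column of the block is processed. Stripped of strides and tail handling, the scheme looks roughly like the illustrative code below; it is not taken from the patch.

#define NBMAX 2048   /* block length used by the double-precision driver */

/* Sketch: y += alpha * A^T * x with the dot-product length m processed in
 * NBMAX chunks; x is packed once per chunk and reused for all n columns. */
static void gemv_t_blocked_sketch(long m, long n, double alpha,
                                  const double *a, long lda,
                                  const double *x, double *y)
{
	double xbuf[NBMAX];

	for (long mb = 0; mb < m; mb += NBMAX) {
		long nb = (m - mb < NBMAX) ? (m - mb) : NBMAX;
		for (long i = 0; i < nb; i++)          /* pack x (copy_x in the patch) */
			xbuf[i] = x[mb + i];
		for (long j = 0; j < n; j++) {         /* micro-kernels do 4 columns at once */
			double t = 0.0;
			for (long i = 0; i < nb; i++)
				t += a[j * lda + mb + i] * xbuf[i];
			y[j] += alpha * t;                 /* add_y in the patch */
		}
	}
}

In the real driver the inner loop runs the 4x4 micro-kernel over four columns at a time, copy_x also absorbs a non-unit inc_x, and separate paths handle the m & 3 and n & 3 remainders.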
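Finally, since PATCH 44 is described as "added and tested", a minimal way to exercise such a kernel is to compare it against a naive reference on sizes that hit both the full-block and tail code paths. The harness below is a sketch under the assumption that the kernel under test is linked in and substituted for the second reference call; none of it comes from the patches.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Naive column-major y += alpha * A^T * x, used as the reference. */
static void dgemv_t_ref(long m, long n, double alpha, const double *a, long lda,
                        const double *x, double *y)
{
	for (long j = 0; j < n; j++) {
		double t = 0.0;
		for (long i = 0; i < m; i++)
			t += a[j * lda + i] * x[i];
		y[j] += alpha * t;
	}
}

int main(void)
{
	long m = 2051, n = 131, lda = m;   /* odd sizes exercise the remainder paths */
	double *a  = malloc(sizeof(double) * lda * n);
	double *x  = malloc(sizeof(double) * m);
	double *y0 = calloc(n, sizeof(double));
	double *y1 = calloc(n, sizeof(double));

	for (long i = 0; i < lda * n; i++) a[i] = (double)rand() / RAND_MAX;
	for (long i = 0; i < m; i++)       x[i] = (double)rand() / RAND_MAX;

	dgemv_t_ref(m, n, 1.1, a, lda, x, y0);
	dgemv_t_ref(m, n, 1.1, a, lda, x, y1);   /* replace with the kernel under test */

	double maxdiff = 0.0;
	for (long j = 0; j < n; j++)
		maxdiff = fmax(maxdiff, fabs(y0[j] - y1[j]));
	printf("max abs diff: %g\n", maxdiff);

	free(a); free(x); free(y0); free(y1);
	return (maxdiff < 1e-10) ? 0 : 1;
}

m = 2051 crosses the NBMAX = 2048 block boundary and leaves an m & 3 remainder of 3, so the blocked path, the partial block, and the scalar tail all run.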