diff --git a/benchmark/gemv.c b/benchmark/gemv.c index e26a36ac1..e21868259 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){ blasint inc_x=1,inc_y=1; blasint n=0; int has_param_n = 0; + int has_param_m = 0; int loops = 1; int l; char *p; @@ -145,29 +146,38 @@ int MAIN__(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} + + int tomax = to; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); - if ((n>0) && (n<=to)) has_param_n = 1; + if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } - if ( has_param_n == 1 ) - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); - else - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } @@ -177,50 +187,80 @@ int MAIN__(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + if (has_param_m == 0) { - timeg=0; + for(m = from; m <= to; m += step) + { + timeg=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d : ", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } - if ( has_param_n == 0 ) n = m; + for (l=0; l>= 7; diff --git a/interface/gemv.c b/interface/gemv.c index 08553ad21..2dd82dce5 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_avail = nthreads_max; double MNK = (double) m * (double) n; - if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 6318b202c..0fd7ac35f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -10,8 +10,8 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.c diff --git 
a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d0ac9c72f..8aab560c4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,8 +1,8 @@ -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c -DGEMVTKERNEL = dgemv_t.c +DGEMVNKERNEL = dgemv_n_4.c +DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c ZGEMVTKERNEL = zgemv_t.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 8adb579cf..8feef5c31 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -9,9 +9,9 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c -DGEMVNKERNEL = dgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c +DGEMVNKERNEL = dgemv_n_4.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 146a8768b..4f15e5a36 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,5 +1,5 @@ -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b654d3564..dfc2882aa 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,5 +1,5 @@ -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c new file mode 100644 index 000000000..371fd73ee --- /dev/null +++ b/kernel/x86_64/dgemv_n_4.c @@ -0,0 +1,548 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
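Two smaller changes in this patch are easy to miss next to the new kernels: benchmark/gemv.c now honours an OPENBLAS_PARAM_M environment variable (only consulted when OPENBLAS_PARAM_N is not set) and sizes its buffers from the largest of the loop bound and any fixed dimension, and interface/gemv.c retunes the point at which GEMV goes multithreaded, comparing m*n against 24*24*GEMM_MULTITHREAD_THRESHOLD^2 instead of 500*100*GEMM_MULTITHREAD_THRESHOLD. A minimal sketch of the new rule follows; the helper name gemv_choose_threads is hypothetical, the interface code adjusts nthreads_max inline.

#include <stdio.h>

/* Sketch of the retuned single-thread cutoff from interface/gemv.c.
 * threshold stands in for GEMM_MULTITHREAD_THRESHOLD. */
static int gemv_choose_threads(long m, long n, int nthreads_max, int threshold)
{
    double MNK = (double)m * (double)n;

    /* New rule: stay on one thread while m*n <= (24*threshold)^2.
     * The old rule compared against 500*100*threshold, so with the same
     * threshold the crossover to multiple threads now happens at smaller
     * matrices. */
    if (MNK <= 24.0 * 24.0 * (double)(threshold * threshold))
        return 1;

    return nthreads_max;
}

int main(void)
{
    /* e.g. with threshold 4: single-threaded up to roughly 96x96 */
    printf("%d\n", gemv_choose_threads(96, 96, 8, 4));   /* prints 1 */
    printf("%d\n", gemv_choose_threads(512, 512, 8, 4)); /* prints 8 */
    return 0;
}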
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(NEHALEM) +#include "dgemv_n_microk_nehalem-4.c" +#elif defined(HASWELL) +#include "dgemv_n_microk_haswell-4.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x8 + +static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT *b0,*b1,*b2,*b3; + FLOAT *x4; + FLOAT x[8]; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x4 = x + 4; + + for ( i=0; i<8; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + + y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; + y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; + y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; + y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; + + } +} + +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4]; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2) , %%xmm12 \n\t" // x0 + "movsd (%6) , %%xmm4 \n\t" // alpha + "movsd 8(%2) , %%xmm13 \n\t" // x1 + "mulsd %%xmm4 , %%xmm12 \n\t" // alpha + "mulsd %%xmm4 , %%xmm13 \n\t" // alpha + "shufpd $0, %%xmm12, %%xmm12 \n\t" + "shufpd $0, %%xmm13, %%xmm13 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm5 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + + "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y + "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT 
*y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "mulsd (%5), %%xmm12 \n\t" // alpha + "shufpd $0, %%xmm12, %%xmm12 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a + "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm12, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + + "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y + "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + 
a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..2c77f3469 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
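The generic portion of dgemv_n_4.c above is self-contained: when no HAVE_KERNEL_4xN micro-kernel is pulled in for the CPU, plain C fallbacks update four rows of y per step from 8, 4, 2 or 1 columns whose x entries are pre-scaled by alpha, and the driver walks A in row blocks of at most NBMAX (2048) so the active slice of y stays cache-resident, handling the m%4 tail and strided x/y separately. A reduced sketch of that pattern, assuming column-major storage, unit increments and no unrolling (the real kernels unroll the row loop by four); the names here are illustrative, not the ones in the diff.

#define NBMAX 2048  /* row-block size used by dgemv_n_4.c */

/* Portable fallback: y[0..nb) += sum_{j<4} alpha*x[j] * A(:,j), column-major. */
static void kernel_4x4(long nb, const double *a, long lda,
                       const double *x, double alpha, double *y)
{
    double xs[4];
    for (int j = 0; j < 4; j++) xs[j] = alpha * x[j];
    for (long i = 0; i < nb; i++)
        y[i] += a[i]         * xs[0] + a[i +   lda] * xs[1]
              + a[i + 2*lda] * xs[2] + a[i + 3*lda] * xs[3];
}

/* Hypothetical driver showing the blocking scheme only: rows are processed
 * in chunks of NBMAX, columns in groups of four (tails omitted). */
static void dgemv_n_blocked(long m, long n, double alpha, const double *a,
                            long lda, const double *x, double *y)
{
    for (long ib = 0; ib < m; ib += NBMAX) {
        long nb = (m - ib < NBMAX) ? (m - ib) : NBMAX;
        for (long j = 0; j + 4 <= n; j += 4)
            kernel_4x4(nb, a + ib + j*lda, lda, x + j, alpha, y + ib);
    }
}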
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + + "vbroadcastsd (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L8LABEL%= \n\t" + + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L8LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "addq $8 , %8 \n\t" + "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y + "subq $8 , %1 \n\t" + "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + 
"r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + + "vbroadcastsd (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L8LABEL%= \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L8LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L8END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L8END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c new file mode 100644 index 000000000..e311326f1 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c @@ -0,0 +1,265 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "movsd 8(%2), %%xmm13 \n\t" // x1 + "movsd 16(%2), %%xmm14 \n\t" // x2 + "movsd 24(%2), %%xmm15 \n\t" // x3 + "shufpd $0, %%xmm12, %%xmm12\n\t" + "shufpd $0, %%xmm13, %%xmm13\n\t" + "shufpd $0, %%xmm14, %%xmm14\n\t" + "shufpd $0, %%xmm15, %%xmm15\n\t" + + "movsd 32(%2), %%xmm0 \n\t" // x4 + "movsd 40(%2), %%xmm1 \n\t" // x5 + "movsd 48(%2), %%xmm2 \n\t" // x6 + "movsd 56(%2), %%xmm3 \n\t" // x7 + "shufpd $0, %%xmm0 , %%xmm0 \n\t" + "shufpd $0, %%xmm1 , %%xmm1 \n\t" + "shufpd $0, %%xmm2 , %%xmm2 \n\t" + "shufpd $0, %%xmm3 , %%xmm3 \n\t" + + "movsd (%9), %%xmm6 \n\t" // alpha + "shufpd $0, %%xmm6 , %%xmm6 \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + + ".align 2 \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "movups (%4,%8,8), %%xmm8 \n\t" + "movups (%5,%8,8), %%xmm9 \n\t" + "movups (%6,%8,8), %%xmm10 \n\t" + "movups (%7,%8,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm0 , %%xmm8 \n\t" + "mulpd %%xmm1 , %%xmm9 \n\t" + "mulpd %%xmm2 , %%xmm10 \n\t" + "mulpd %%xmm3 , %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "addpd %%xmm5 , %%xmm4 \n\t" + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + + ".align 2 \n\t" + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 
\n\t" + "movups 16(%6,%0,8), %%xmm10 \n\t" + "movups 16(%7,%0,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "movups 16(%4,%8,8), %%xmm8 \n\t" + "movups 16(%5,%8,8), %%xmm9 \n\t" + "movups 16(%6,%8,8), %%xmm10 \n\t" + "movups 16(%7,%8,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm0 , %%xmm8 \n\t" + "mulpd %%xmm1 , %%xmm9 \n\t" + "mulpd %%xmm2 , %%xmm10 \n\t" + "mulpd %%xmm3 , %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "addq $4 , %8 \n\t" + "addpd %%xmm5 , %%xmm4 \n\t" + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "movsd 8(%2), %%xmm13 \n\t" // x1 + "movsd 16(%2), %%xmm14 \n\t" // x2 + "movsd 24(%2), %%xmm15 \n\t" // x3 + "shufpd $0, %%xmm12, %%xmm12\n\t" + "shufpd $0, %%xmm13, %%xmm13\n\t" + "shufpd $0, %%xmm14, %%xmm14\n\t" + "shufpd $0, %%xmm15, %%xmm15\n\t" + + "movsd (%8), %%xmm6 \n\t" // alpha + "shufpd $0, %%xmm6 , %%xmm6 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm11 \n\t" + + "mulpd %%xmm6 , %%xmm11 \n\t" + "addpd %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "movups 16(%6,%0,8), %%xmm10 \n\t" + "movups 16(%7,%0,8), %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm11 \n\t" + + "mulpd %%xmm6 , %%xmm11 \n\t" + "addpd %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" 
(alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c new file mode 100644 index 000000000..ebec7d2c3 --- /dev/null +++ b/kernel/x86_64/dgemv_t_4.c @@ -0,0 +1,615 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
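On Haswell the included micro-kernel replaces those fallbacks with FMA3 assembly: each x element is broadcast into a ymm register, four (or eight, via the lda4 offset) columns are vfmadd231pd-accumulated into two parallel sums, and alpha is folded in with one more FMA when the result is merged into y; the Nehalem variant follows the same shape with SSE2 mulpd/addpd. A rough intrinsics rendering of the 4x4 FMA loop, illustrative only (the shipped kernel is inline assembly with its own pre-loop for n%4 and deeper unrolling):

#include <immintrin.h>  /* compile with e.g. -mavx2 -mfma */

/* Four columns of A are broadcast-multiplied by x[0..3] and FMA-accumulated
 * into y, four doubles per step; alpha is applied when merging into y,
 * as in the asm kernel. Assumes n % 4 == 0. */
static void dgemv_4x4_fma(long n, const double *a0, const double *a1,
                          const double *a2, const double *a3,
                          const double *x, double alpha, double *y)
{
    __m256d x0 = _mm256_set1_pd(x[0]);
    __m256d x1 = _mm256_set1_pd(x[1]);
    __m256d x2 = _mm256_set1_pd(x[2]);
    __m256d x3 = _mm256_set1_pd(x[3]);
    __m256d va = _mm256_set1_pd(alpha);

    for (long i = 0; i < n; i += 4) {
        __m256d acc0 = _mm256_mul_pd(_mm256_loadu_pd(a0 + i), x0);
        __m256d acc1 = _mm256_mul_pd(_mm256_loadu_pd(a1 + i), x1);
        acc0 = _mm256_fmadd_pd(_mm256_loadu_pd(a2 + i), x2, acc0);
        acc1 = _mm256_fmadd_pd(_mm256_loadu_pd(a3 + i), x3, acc1);

        /* y += alpha * (acc0 + acc1) */
        __m256d vy = _mm256_loadu_pd(y + i);
        vy = _mm256_fmadd_pd(va, _mm256_add_pd(acc0, acc1), vy);
        _mm256_storeu_pd(y + i, vy);
    }
}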
+*****************************************************************************/ + + +#include "common.h" + +#if defined(HASWELL) +#include "dgemv_t_microk_haswell-4.c" +#endif + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorpd %%xmm10 , %%xmm10 \n\t" + "xorpd %%xmm11 , %%xmm11 \n\t" + + "testq $2 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%5,%0,8) , %%xmm14 \n\t" // x + "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addq $2 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $2 , %1 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,8) , %%xmm14 \n\t" // x + "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + "movups 16(%5,%0,8) , %%xmm14 \n\t" // x + "movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "haddpd %%xmm10, %%xmm10 \n\t" + "haddpd %%xmm11, %%xmm11 \n\t" + + "movsd %%xmm10, (%2) \n\t" + "movsd %%xmm11,8(%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 + "r" (x) // 5 + : "cc", + "%xmm4", "%xmm5", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorpd %%xmm9 , %%xmm9 \n\t" + "xorpd %%xmm10 , %%xmm10 \n\t" + + "testq $2 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%3,%0,8) , %%xmm12 \n\t" + "movups (%4,%0,8) , %%xmm11 \n\t" + "mulpd %%xmm11 , %%xmm12 \n\t" + "addq $2 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $2 , %1 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,8) , %%xmm12 \n\t" + "movups 16(%3,%0,8) , %%xmm14 \n\t" + "movups (%4,%0,8) , %%xmm11 \n\t" + "movups 16(%4,%0,8) , %%xmm13 \n\t" + "mulpd %%xmm11 , %%xmm12 \n\t" + "mulpd %%xmm13 , %%xmm14 \n\t" + "addq $4 , %0 \n\t" + "addpd %%xmm12 , 
%%xmm10 \n\t" + "subq $4 , %1 \n\t" + "addpd %%xmm14 , %%xmm9 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "addpd %%xmm9 , %%xmm10 \n\t" + "haddpd %%xmm10, %%xmm10 \n\t" + + "movsd %%xmm10, (%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 + : "cc", + "%xmm9", "%xmm10" , + "%xmm11", "%xmm12", "%xmm13", "%xmm14", + "memory" + ); + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += 
lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c deleted file mode 100644 index 57aaad4b4..000000000 --- a/kernel/x86_64/sgemv_n_avx.c +++ /dev/null @@ -1,218 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" - -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_n_microk_bulldozer.c" -#elif defined(HASWELL) -#include "sgemv_n_microk_haswell.c" -#else -#include "sgemv_n_microk_sandy.c" -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for ( i=0; i 0 ) - { - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(n2,x_ptr,xbuffer,inc_x); - - a_ptr = a + n1 * 512 * lda; - y_ptr = y; - - for(i = 0; i rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - 
"jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero - "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero - "vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero - "vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp - "vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha - "vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha - "vmulps %%xmm10, %%xmm1, 
%%xmm10\n\t" // scale by alpha - "vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - "vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha - "vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha - - "vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - ); - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - - "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - - "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - 
"m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add 
to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..8f56655a9 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
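The new Haswell file introduced above replaces the old fixed-width kernels with two blocked building blocks, sgemv_kernel_4x8 and sgemv_kernel_4x4, which accumulate a group of columns into y instead of overwriting it. As a reading aid, here is a minimal C sketch of the contract the 4x8 variant below implements, assuming lda4 is the element offset from a column to the column four positions later and that n is a multiple of 4; the _ref name is illustrative only and not part of the patch:

static void sgemv_kernel_4x8_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG i;
	for ( i = 0; i < n; i++ )
	{
		/* columns 0..3 via ap[0..3], columns 4..7 via the lda4 offset */
		FLOAT t = ap[0][i]        * x[0] + ap[1][i]        * x[1]
		        + ap[2][i]        * x[2] + ap[3][i]        * x[3]
		        + ap[0][lda4 + i] * x[4] + ap[1][lda4 + i] * x[5]
		        + ap[2][lda4 + i] * x[6] + ap[3][lda4 + i] * x[7];
		y[i] += alpha[0] * t;      /* accumulate into y, scaled by *alpha */
	}
}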
+*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %8 \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "addq $16, %0 \n\t" + "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" + + 
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "addq $16, %8 \n\t" + "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y + "subq $16, %1 \n\t" + "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : 
"cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sgemv_n_microk_haswell.c b/kernel/x86_64/sgemv_n_microk_haswell.c deleted file mode 100644 index 9db3869d2..000000000 --- a/kernel/x86_64/sgemv_n_microk_haswell.c +++ /dev/null @@ -1,461 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*2; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp - "vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp - "vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp - "vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void 
sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - 
"vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, 
long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c new file mode 100644 index 000000000..77a1b11aa --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movss (%2), %%xmm12 \n\t" // x0 + "movss 4(%2), %%xmm13 \n\t" // x1 + "movss 8(%2), %%xmm14 \n\t" // x2 + "movss 12(%2), %%xmm15 \n\t" // x3 + "shufps $0, %%xmm12, %%xmm12\n\t" + "shufps $0, %%xmm13, %%xmm13\n\t" + "shufps $0, %%xmm14, %%xmm14\n\t" + "shufps $0, %%xmm15, %%xmm15\n\t" + + "movss 16(%2), %%xmm0 \n\t" // x4 + "movss 20(%2), %%xmm1 \n\t" // x5 + "movss 24(%2), %%xmm2 \n\t" // x6 + "movss 28(%2), %%xmm3 \n\t" // x7 + "shufps $0, %%xmm0 , %%xmm0 \n\t" + "shufps $0, %%xmm1 , %%xmm1 \n\t" + "shufps $0, %%xmm2 , %%xmm2 \n\t" + "shufps $0, %%xmm3 , %%xmm3 \n\t" + + "movss (%9), %%xmm6 \n\t" // alpha + "shufps $0, %%xmm6 , %%xmm6 \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y + + ".align 2 \n\t" + "movups (%4,%0,4), %%xmm8 \n\t" + "movups (%5,%0,4), %%xmm9 \n\t" + "movups (%6,%0,4), %%xmm10 \n\t" + "movups (%7,%0,4), %%xmm11 \n\t" + ".align 2 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "addps %%xmm10, %%xmm4 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + + "movups (%4,%8,4), %%xmm8 \n\t" + "movups (%5,%8,4), %%xmm9 \n\t" + "movups (%6,%8,4), %%xmm10 \n\t" + "movups (%7,%8,4), %%xmm11 \n\t" + ".align 2 \n\t" + "mulps %%xmm0 , %%xmm8 \n\t" + "mulps %%xmm1 , %%xmm9 \n\t" + "mulps %%xmm2 , %%xmm10 \n\t" + "mulps %%xmm3 , %%xmm11 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "addps %%xmm10, %%xmm4 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + + "addq $4 , %8 \n\t" + "addps %%xmm5 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "mulps %%xmm6 , %%xmm4 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" 
(ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movss (%2), %%xmm12 \n\t" // x0 + "movss 4(%2), %%xmm13 \n\t" // x1 + "movss 8(%2), %%xmm14 \n\t" // x2 + "movss 12(%2), %%xmm15 \n\t" // x3 + "shufps $0, %%xmm12, %%xmm12\n\t" + "shufps $0, %%xmm13, %%xmm13\n\t" + "shufps $0, %%xmm14, %%xmm14\n\t" + "shufps $0, %%xmm15, %%xmm15\n\t" + + "movss (%8), %%xmm6 \n\t" // alpha + "shufps $0, %%xmm6 , %%xmm6 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y + + "movups (%4,%0,4), %%xmm8 \n\t" + "movups (%5,%0,4), %%xmm9 \n\t" + "movups (%6,%0,4), %%xmm10 \n\t" + "movups (%7,%0,4), %%xmm11 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "mulps %%xmm14, %%xmm10 \n\t" + "mulps %%xmm15, %%xmm11 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm9 , %%xmm4 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm10 , %%xmm4 \n\t" + "addps %%xmm4 , %%xmm11 \n\t" + + "mulps %%xmm6 , %%xmm11 \n\t" + "addps %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c new file mode 100644 index 000000000..c162eeeb6 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -0,0 +1,370 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" + + "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" + + "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + "addq $4, %8 \n\t" + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8, %8 \n\t" + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + + + 
".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%4,%8,4) \n\t" + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%8,4) \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + + "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %8 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + 
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" + + "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + + "vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t" + "vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t" + + "vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sgemv_n_microk_sandy.c b/kernel/x86_64/sgemv_n_microk_sandy.c deleted file mode 100644 index 9bdb06600..000000000 --- a/kernel/x86_64/sgemv_n_microk_sandy.c +++ /dev/null @@ -1,473 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. 
-2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*2; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps 
%%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp - "vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp - "vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp - "vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 
- "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - 
"jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of 
lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c new file mode 100644 index 000000000..b0e883252 --- /dev/null +++ b/kernel/x86_64/sgemv_t_4.c @@ -0,0 +1,624 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
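The sgemv_kernel_32/16/8/4/2/1 routines removed above all follow the same pattern: broadcast one element of x, multiply it with a fixed-width column slice of a, accumulate in vector registers, and at the end scale by alpha and overwrite the corresponding block of y. A minimal scalar sketch of that contract (sgemv_n_block_ref is a hypothetical name, not code from this patch):

    static void sgemv_n_block_ref(long width, long n, float alpha,
                                  const float *a, long lda,
                                  const float *x, float *y)
    {
        /* y[0..width-1] = alpha * A(0..width-1, 0..n-1) * x, column-major A */
        for (long i = 0; i < width; i++) {
            float temp = 0.0f;
            for (long j = 0; j < n; j++)
                temp += a[i + j * lda] * x[j];   /* x[j] is the broadcast value */
            y[i] = alpha * temp;                 /* note: y is overwritten      */
        }
    }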
+*****************************************************************************/ + + +#include "common.h" + +#if defined(NEHALEM) +#include "sgemv_t_microk_nehalem-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_t_microk_bulldozer-4.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_t_microk_sandy-4.c" +#elif defined(HASWELL) +#include "sgemv_t_microk_haswell-4.c" +#endif + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorps %%xmm10 , %%xmm10 \n\t" + "xorps %%xmm11 , %%xmm11 \n\t" + + "testq $4 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%5,%0,4) , %%xmm14 \n\t" // x + "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,4) , %%xmm14 \n\t" // x + "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "movups 16(%5,%0,4) , %%xmm14 \n\t" // x + "movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm11, %%xmm11 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm11, %%xmm11 \n\t" + + "movss %%xmm10, (%2) \n\t" + "movss %%xmm11,4(%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 + "r" (x) // 5 + : "cc", + "%xmm4", "%xmm5", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorps %%xmm9 , %%xmm9 \n\t" + "xorps %%xmm10 , %%xmm10 \n\t" + + "testq $4 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%3,%0,4) , %%xmm12 \n\t" + "movups (%4,%0,4) , %%xmm11 \n\t" + "mulps %%xmm11 , %%xmm12 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 
\n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,4) , %%xmm12 \n\t" + "movups 16(%3,%0,4) , %%xmm14 \n\t" + "movups (%4,%0,4) , %%xmm11 \n\t" + "movups 16(%4,%0,4) , %%xmm13 \n\t" + "mulps %%xmm11 , %%xmm12 \n\t" + "mulps %%xmm13 , %%xmm14 \n\t" + "addq $8 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $8 , %1 \n\t" + "addps %%xmm14 , %%xmm9 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "addps %%xmm9 , %%xmm10 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + + "movss %%xmm10, (%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 + : "cc", + "%xmm9", "%xmm10" , + "%xmm11", "%xmm12", "%xmm13", "%xmm14", + "memory" + ); + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j= 16 ) - { - if ( m2 & Mblock) - { - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(Mblock,x_ptr,xbuffer,inc_x); - - y_ptr = y; - a_ptrl = a_ptr; - - for(i = 0; i rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - 
"vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c new file mode 100644 index 000000000..016cb35e7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sgemv_t_microk_haswell.c b/kernel/x86_64/sgemv_t_microk_haswell.c deleted file mode 100644 index ecb9845bb..000000000 --- a/kernel/x86_64/sgemv_t_microk_haswell.c +++ /dev/null @@ -1,100 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - //n = n / 16; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp - "vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp - "vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp - "vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" - "vaddss (%%rdx), %%xmm12,%%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 
- "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-4.c b/kernel/x86_64/sgemv_t_microk_nehalem-4.c new file mode 100644 index 000000000..4a167900e --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_nehalem-4.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" + "xorps %%xmm6 , %%xmm6 \n\t" + "xorps %%xmm7 , %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 + "movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1 + "movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2 + "movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3 + + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm12, %%xmm9 \n\t" + "mulps %%xmm12, %%xmm10 \n\t" + "mulps %%xmm12, %%xmm11 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm10, %%xmm6 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "movss %%xmm4, (%3) \n\t" + "movss %%xmm5, 4(%3) \n\t" + "movss %%xmm6, 8(%3) \n\t" + "movss %%xmm7, 12(%3) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sgemv_t_microk_sandy-4.c b/kernel/x86_64/sgemv_t_microk_sandy-4.c new file mode 100644 index 000000000..6550518f7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_sandy-4.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
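The Nehalem variant above has no 4- or 8-element peel blocks, so it relies on n being a positive multiple of 4; the driver guarantees this by rounding the block length down to a multiple of 4 and handling the last m & 3 rows with scalar code. An illustrative helper (hypothetical name) for that rounding:

    static BLASLONG round_block_4(BLASLONG m, BLASLONG *tail)
    {
        *tail = m & 3;     /* 0..3 leftover rows, handled by the scalar path */
        return m & -4;     /* length seen by the 4-wide micro-kernels        */
    }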
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm12, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm12, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm12, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm9 , %%xmm6 \n\t" + "subq $4 , %1 \n\t" + "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm9 , %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t" + "addq $16, %0 \n\t" + "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t" + "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t" + "subq $16, %1 \n\t" + "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm2, %%ymm6 \n\t" + "vaddps %%ymm7, %%ymm3, %%ymm7 \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + 
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sgemv_t_microk_sandy.c b/kernel/x86_64/sgemv_t_microk_sandy.c deleted file mode 100644 index 4ecd6d3d0..000000000 --- a/kernel/x86_64/sgemv_t_microk_sandy.c +++ /dev/null @@ -1,106 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - //n = n / 16; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp - "vmulps 4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp - "vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm8 , %%xmm12\n\t" - "vaddps %%xmm13, %%xmm9 , %%xmm13\n\t" - "vaddps %%xmm14, %%xmm10, %%xmm14\n\t" - "vaddps %%xmm15, %%xmm11, %%xmm15\n\t" - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t" - "vaddss (%%rdx), %%xmm12, %%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/lapack-netlib/TESTING/dstest.in b/lapack-netlib/TESTING/dstest.in index 4a31076a6..b5a9f29f4 100644 --- a/lapack-netlib/TESTING/dstest.in +++ b/lapack-netlib/TESTING/dstest.in @@ -1,6 +1,6 @@ Data file for testing DSGESV/DSPOSV LAPACK routines 12 Number of values of M -0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension) +0 1 2 13 17 45 78 91 101 119 112 132 values of M (row dimension) 6 Number of values of NRHS 1 2 14 15 16 13 Values of NRHS (number of right hand sides) 30.0 Threshold value of test ratio
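For reference, the operation this transposed-SGEMV path computes for trans = 'T' with unit strides is y := alpha * A^T * x + y on a column-major A; a plain C oracle like the following is handy when validating the micro-kernels above (sgemv_t_ref is a hypothetical name, using the FLOAT/BLASLONG typedefs from common.h):

    static void sgemv_t_ref(BLASLONG m, BLASLONG n, FLOAT alpha,
                            const FLOAT *a, BLASLONG lda,
                            const FLOAT *x, FLOAT *y)
    {
        for (BLASLONG j = 0; j < n; j++) {
            FLOAT temp = 0.0;
            for (BLASLONG i = 0; i < m; i++)
                temp += a[i + j * lda] * x[i];   /* dot product with column j */
            y[j] += alpha * temp;
        }
    }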