diff --git a/interface/zgemv.c b/interface/zgemv.c index 50513a8e4..704034aaf 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -238,7 +238,7 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_avail = nthreads_max; double MNK = (double) m * (double) n; - if ( MNK <= (80.0 * 20.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) )) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 0fd7ac35f..289529772 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -14,7 +14,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 8aab560c4..a621b4484 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -4,11 +4,11 @@ SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c -ZGEMVNKERNEL = zgemv_n.c -ZGEMVTKERNEL = zgemv_t.c +ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVTKERNEL = zgemv_t_4.c -CGEMVNKERNEL = cgemv_n.c -CGEMVTKERNEL = cgemv_t.c +CGEMVNKERNEL = cgemv_n_4.c +CGEMVTKERNEL = cgemv_t_4.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 4f15e5a36..55285e3d3 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -2,10 +2,11 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S + DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE 
b/kernel/x86_64/KERNEL.SANDYBRIDGE index dfc2882aa..61e13a116 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,7 +1,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -ZGEMVNKERNEL = zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMMKERNEL = sgemm_kernel_16x4_sandy.S diff --git a/kernel/x86_64/cgemv_n.c b/kernel/x86_64/cgemv_n.c deleted file mode 100644 index 47ef0d447..000000000 --- a/kernel/x86_64/cgemv_n.c +++ /dev/null @@ -1,255 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include -#include -#include "common.h" - -#if defined(HASWELL) -#include "cgemv_n_microk_haswell-2.c" -#endif - - -#define NBMAX 2048 - -#ifndef HAVE_KERNEL_16x4 - -static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i< 2*n; i+=2 ) - { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; - y[i] += a1[i]*x[2] - a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; - y[i] += a2[i]*x[4] - a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; - y[i] += a3[i]*x[6] - a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; -#else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; - y[i] += a1[i]*x[2] + a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; - y[i] += a2[i]*x[4] + a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; - y[i] += a3[i]*x[6] + a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; -#endif - } -} - -#endif - -static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0; - a0 = ap; - - for ( i=0; i< 2*n; i+=2 ) - { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * 
x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; -#else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; -#endif - - } -} - - -static void zero_y(BLASLONG n, FLOAT *dest) -{ - BLASLONG i; - for ( i=0; i<2*n; i++ ) - { - *dest = 0.0; - dest++; - } -} - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i +#include +#include "common.h" + +#if defined(HASWELL) +#include "cgemv_n_microk_haswell-4.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x4 + +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; +#endif + } +} + +#endif + + + +#ifndef HAVE_KERNEL_4x2 + +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + 
a1[i+1] * x[2]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; +#endif + } +} + +#endif + + + + +#ifndef HAVE_KERNEL_4x1 + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif + + } +} + + +#endif + + +#ifndef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + a_ptr += lda; + y_ptr += 2; + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + 
memset(ybuffer,0,32); + cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,32); + cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); + + x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; + + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT 
temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + 
y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return(0); + } + + + if ( m3 == 1 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + return(0); + + +} + + diff --git 
a/kernel/x86_64/cgemv_t_microk_haswell-2.c b/kernel/x86_64/cgemv_t_microk_haswell-2.c deleted file mode 100644 index 0d79714af..000000000 --- a/kernel/x86_64/cgemv_t_microk_haswell-2.c +++ /dev/null @@ -1,171 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary froms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary from must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define HAVE_KERNEL_16x4 1 -static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); - -static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp - "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp - "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 - "prefetcht0 192(%5,%0,4) \n\t" - "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 - - "prefetcht0 192(%2,%0,4) \n\t" - "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x - "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts - "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts - "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts - - "prefetcht0 192(%6,%0,4) \n\t" - "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 - "prefetcht0 192(%7,%0,4) \n\t" - "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 - - "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231ps %%ymm7 , 
%%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - - "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 2 complex values from a0 - "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 2 complex values from a1 - - "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x - "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts - "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts - "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts - - "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 2 complex values from a2 - "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 2 complex values from a3 - - "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - - "addq $16 , %0 \n\t" - "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" - "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" - "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" - "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" - "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t" - "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t" -#else - "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" - "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" - "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermilps $0xb1 , 
%%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" - "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" - "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t" - "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t" - "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" - "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" - "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" -#endif - - "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" - "vextractf128 $1, %%ymm10, %%xmm11 \n\t" - "vextractf128 $1, %%ymm12, %%xmm13 \n\t" - "vextractf128 $1, %%ymm14, %%xmm15 \n\t" - - "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" - "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" - "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" - - "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" - "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" - "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t" - "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t" - - "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" - "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" - "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" - - "vmovsd %%xmm8 , (%3) \n\t" - "vmovsd %%xmm10, 8(%3) \n\t" - "vmovsd %%xmm12, 16(%3) \n\t" - "vmovsd %%xmm14, 24(%3) \n\t" - - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/cgemv_t_microk_haswell-4.c b/kernel/x86_64/cgemv_t_microk_haswell-4.c new file mode 100644 index 000000000..2c506c9e9 --- /dev/null +++ b/kernel/x86_64/cgemv_t_microk_haswell-4.c @@ -0,0 +1,539 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 + "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // 
ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + "cmpq $0, %1 \n\t" + "je .L08END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "prefetcht0 192(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "prefetcht0 192(%2,%0,4) \n\t" + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "prefetcht0 192(%6,%0,4) \n\t" + "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 + "prefetcht0 192(%7,%0,4) \n\t" + "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 4 complex 
values from a2 + "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $16 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L08END%=: \n\t" + + "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha + "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" +#endif + + "vmovsd (%3), %%xmm4 \n\t" // read y + "vmovsd 8(%3), %%xmm5 \n\t" + "vmovsd 
16(%3), %%xmm6 \n\t" + "vmovsd 24(%3), %%xmm7 \n\t" + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" + "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" + "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t" + "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" + + + "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulps %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulps %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" + "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t" +#else + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubps %%xmm10, %%xmm11, 
%%xmm10 \n\t" + "vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" +#endif + + + "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm6 , %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm7 , %%xmm14 \n\t" + + "vmovsd %%xmm8 , (%3) \n\t" + "vmovsd %%xmm10, 8(%3) \n\t" + "vmovsd %%xmm12, 16(%3) \n\t" + "vmovsd %%xmm14, 24(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x2 1 +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // 
ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + "cmpq $0, %1 \n\t" + "je .L08END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "prefetcht0 192(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "prefetcht0 192(%2,%0,4) \n\t" + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + 
"addq $16 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L08END%=: \n\t" + + "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha + "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" + "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" +#else + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" +#endif + + "vmovsd (%3), %%xmm4 \n\t" // read y + "vmovsd 8(%3), %%xmm5 \n\t" + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + + "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" + "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + + "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" + "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" + "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" +#else + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" + "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" +#endif + + + "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" + 
"vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" + + "vmovsd %%xmm8 , (%3) \n\t" + "vmovsd %%xmm10, 8(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x1 1 +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + "cmpq $0, %1 \n\t" + "je .L08END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + + "prefetcht0 192(%2,%0,4) \n\t" + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + 
"vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + + "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $16 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L08END%=: \n\t" + + "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha + "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" + "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vmovsd (%3), %%xmm4 \n\t" // read y + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + + "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + + "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" + "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" +#else + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" +#endif + + + "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" + + "vmovsd %%xmm8 , (%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" 
(ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dgemv_n.c b/kernel/x86_64/dgemv_n.c deleted file mode 100644 index cecb8d3fc..000000000 --- a/kernel/x86_64/dgemv_n.c +++ /dev/null @@ -1,208 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - - -#include "common.h" - - -#if defined(HASWELL) -#include "dgemv_n_microk_haswell-2.c" -#elif defined(NEHALEM) -#include "dgemv_n_microk_nehalem-2.c" -#endif - - -#define NBMAX 2048 - -#ifndef HAVE_KERNEL_16x4 - -static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - } -} - -#endif - -static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0; - a0 = ap; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } -} - - -static void zero_y(BLASLONG n, FLOAT *dest) -{ - BLASLONG i; - for ( i=0; i -#include -#include "common.h" - -#if defined(HASWELL) -#include "zgemv_n_microk_haswell-2.c" -#elif defined(SANDYBRIDGE) -#include "zgemv_n_microk_sandy-2.c" -#endif - - - -#define NBMAX 1024 - -#ifndef HAVE_KERNEL_16x4 - -static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i< 2*n; i+=2 ) - { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; - y[i] += a1[i]*x[2] - a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; - y[i] += a2[i]*x[4] - a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; - y[i] += a3[i]*x[6] - a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; -#else - y[i] += a0[i]*x[0] + a0[i+1] * 
x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; - y[i] += a1[i]*x[2] + a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; - y[i] += a2[i]*x[4] + a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; - y[i] += a3[i]*x[6] + a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; -#endif - } -} - -#endif - -static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0; - a0 = ap; - - for ( i=0; i< 2*n; i+=2 ) - { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; -#else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; -#endif - - } -} - - -static void zero_y(BLASLONG n, FLOAT *dest) -{ - BLASLONG i; - for ( i=0; i<2*n; i++ ) - { - *dest = 0.0; - dest++; - } -} - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i +#include +#include "common.h" + + +#if defined(HASWELL) +#include "zgemv_n_microk_haswell-4.c" +#elif defined(SANDYBRIDGE) +#include "zgemv_n_microk_sandy-4.c" +#endif + + +#define NBMAX 1024 + +#ifndef HAVE_KERNEL_4x4 + +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += 
a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; +#endif + } +} + +#endif + + + +#ifndef HAVE_KERNEL_4x2 + +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; +#endif + } +} + +#endif + + + + +#ifndef HAVE_KERNEL_4x1 + + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif + + } +} + + +#endif + + +#ifndef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] 
= a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + a_ptr += lda; + y_ptr += 2; + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,64); + zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,64); + zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); + + x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; + + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + 
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] 
+= ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return(0); + } + + + if ( m3 == 1 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += 
inc_y; + j+=2; + } + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + return(0); + + +} + + diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c deleted file mode 100644 index 65d5a10a2..000000000 --- a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c +++ /dev/null @@ -1,180 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary froms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary from must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define HAVE_KERNEL_16x4 1 -static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); - -static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - - "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp - "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp - "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp - "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp - "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" // temp - "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" - "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" - "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - - "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 - "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - - "prefetcht0 192(%4,%0,8) \n\t" - "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 - "prefetcht0 192(%5,%0,8) \n\t" - "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 - "prefetcht0 192(%6,%0,8) \n\t" - "vmovups (%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 - "prefetcht0 192(%7,%0,8) \n\t" - "vmovups (%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 - - "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, 
%%xmm10 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 - - "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 - "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - - "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 - "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 - "vmovups 16(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 - "vmovups 16(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 - - "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 - - "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 - "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - - "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 - "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 - "vmovups 32(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 - "vmovups 32(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 - - "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // 
ar0*xl0,al0*xl0 - "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 - - "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 - "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - - "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 - "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 - "vmovups 48(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 - "vmovups 48(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 - - "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 - "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 - "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 - - "addq $8 , %0 \n\t" - "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" - "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" - "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" - "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" - "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" - "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" - "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" - "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" -#else - "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" - "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" - "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" - "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" - "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" - "vaddsubpd %%xmm10, 
%%xmm11, %%xmm10 \n\t" - "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" - "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" - "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" - "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" - "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" - "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" -#endif - - - "vmovups %%xmm8 , (%3) \n\t" - "vmovups %%xmm10, 16(%3) \n\t" - "vmovups %%xmm12, 32(%3) \n\t" - "vmovups %%xmm14, 48(%3) \n\t" - - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c new file mode 100644 index 000000000..006db226b --- /dev/null +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c @@ -0,0 +1,457 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp + "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp + "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp + "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp + "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" // temp + "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + 
"prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 16(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 16(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 32(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 32(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + 
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 48(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 48(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vmovddup (%8) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + 
"vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" +#endif + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + 
"vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" + "vaddpd 32(%3) , %%xmm12, %%xmm12 \n\t" + "vaddpd 48(%3) , %%xmm14, %%xmm14 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + "vmovups %%xmm12, 32(%3) \n\t" + "vmovups %%xmm14, 48(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#define HAVE_KERNEL_4x2 1 +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp + "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp + "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp + "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" 
// ar0*xl0,al0*xl0 + + "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vmovddup (%6) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vaddsubpd %%xmm9 
, %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" +#endif + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x1 1 +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp + "vxorpd %%xmm9 , 
%%xmm9 , %%xmm9 \n\t" // temp + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 16(%4,%0,8), %%xmm5 \n\t" // 1 complex values from a0 + + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x0 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x0 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + + "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 48(%4,%0,8), %%xmm5 \n\t" // 1 complex values from a0 + + "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x0 + "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x0 + + "addq $8 , %0 \n\t" + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + + "subq $4 , %1 \n\t" + "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + + "jnz .L01LOOP%= \n\t" + + "vmovddup (%5) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" +#endif + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd 
%%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/zgemv_t_microk_haswell-2.c b/kernel/x86_64/zgemv_t_microk_haswell-2.c deleted file mode 100644 index 8325db5cf..000000000 --- a/kernel/x86_64/zgemv_t_microk_haswell-2.c +++ /dev/null @@ -1,162 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary froms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary from must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define HAVE_KERNEL_16x4 1 -static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); - -static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - - "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp - "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp - "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp - "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - - "prefetcht0 192(%2,%0,8) \n\t" - "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 - "prefetcht0 192(%4,%0,8) \n\t" - "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 - "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 - "prefetcht0 192(%5,%0,8) \n\t" - "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 - "prefetcht0 192(%6,%0,8) \n\t" - 
"vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 - "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 - "prefetcht0 192(%7,%0,8) \n\t" - "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 - "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 - "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 - - "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - - "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 - "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 - "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 - "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 - "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 - "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 - "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 - "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 - "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 - "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 - - "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // 
ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 - "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 - - "addq $8 , %0 \n\t" - "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" - "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" - "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" - "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" - "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" - "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" - "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" -#else - "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" - "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" - "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" - "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" - "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" - "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" - "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" - "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" - "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" - "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" - "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" -#endif - - "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" - "vextractf128 $1, %%ymm10, %%xmm11 \n\t" - "vextractf128 $1, %%ymm12, %%xmm13 \n\t" - "vextractf128 $1, %%ymm14, %%xmm15 \n\t" - - "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" - "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" - "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" - "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" - - "vmovups %%xmm8 , (%3) \n\t" - "vmovups %%xmm10, 16(%3) \n\t" - "vmovups %%xmm12, 32(%3) \n\t" - "vmovups %%xmm14, 48(%3) \n\t" - - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), 
// 5 - "r" (ap[2]), // 6 - "r" (ap[3]) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/zgemv_t_microk_haswell-4.c b/kernel/x86_64/zgemv_t_microk_haswell-4.c new file mode 100644 index 000000000..c87b5ce0f --- /dev/null +++ b/kernel/x86_64/zgemv_t_microk_haswell-4.c @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary froms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary from must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 192(%2,%0,8) \n\t" + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + 
"vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // 
ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vmovddup (%8) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm12, 
%%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" + "vaddpd 32(%3) , %%xmm12, %%xmm12 \n\t" + "vaddpd 48(%3) , %%xmm14, %%xmm14 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + "vmovups %%xmm12, 32(%3) \n\t" + "vmovups %%xmm14, 48(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +}
+
+
+#define HAVE_KERNEL_4x2 1
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+/*
+ * zgemv_kernel_4x2 - two-column complex double dot-product kernel (AVX + FMA).
+ *
+ * For j = 0, 1 it accumulates t_j = sum_k op(ap[j][k]) * x[k] over n complex
+ * elements and then updates y[j] in place:
+ *
+ *   - op(a) = a        when neither or both of CONJ / XCONJ are defined,
+ *     op(a) = conj(a)  otherwise;
+ *   - y[j] += alpha * t_j        when XCONJ is not defined,
+ *     y[j] += alpha * conj(t_j)  when XCONJ is defined.
+ *
+ * n      number of complex elements. The loop is a do-while that consumes
+ *        4 elements per pass and only exits when the counter reaches exactly
+ *        zero, so n must be a positive multiple of 4.
+ * ap     ap[0] and ap[1] point to the two columns, read contiguously as
+ *        interleaved (real, imag) doubles.
+ * x      input vector, n interleaved (real, imag) doubles pairs.
+ * y      two complex values (4 doubles), read and written.
+ * alpha  alpha[0] is the real part, alpha[1] the imaginary part.
+ *
+ * The asm advances the element index (%0) and counts n (%1) down, so both
+ * are declared as read-write "+r" operands. Declaring them as plain inputs
+ * would let the compiler assume their registers are unchanged on exit.
+ */
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	BLASLONG i = 0;	// index into x / columns, counted in doubles (8 bytes each)
+
+	__asm__ __volatile__
+	(
+	"vzeroupper \n\t"
+
+	"vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // column 0: sum of a * x_real
+	"vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // column 0: sum of a * x_imag
+	"vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // column 1: sum of a * x_real
+	"vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // column 1: sum of a * x_imag
+
+	".align 16 \n\t"
+	".L01LOOP%=: \n\t"
+
+	// first half of the pass: complex elements k and k+1
+	"prefetcht0 192(%2,%0,8) \n\t"
+	"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
+	"prefetcht0 192(%4,%0,8) \n\t"
+	"vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1
+	"vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+	"vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+	"prefetcht0 192(%5,%0,8) \n\t"
+	"vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1
+	"vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+	"vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+	"vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+	"vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // += ar0*xr0, ai0*xr0, ar1*xr1, ai1*xr1
+	"vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // += ar0*xi0, ai0*xi0, ar1*xi1, ai1*xi1
+	"vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // same products for column 1
+	"vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // same products for column 1
+
+	// second half of the pass: complex elements k+2 and k+3
+	"vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+	"vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1
+	"vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
+	"vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+	"vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1
+	"vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+	"vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+	"vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+	"vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // += ar0*xr0, ai0*xr0, ar1*xr1, ai1*xr1
+	"vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // += ar0*xi0, ai0*xi0, ar1*xi1, ai1*xi1
+	"vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // same products for column 1
+	"vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // same products for column 1
+
+	"addq $8 , %0 \n\t" // 4 complex elements = 8 doubles consumed
+	"subq $4 , %1 \n\t"
+	"jnz .L01LOOP%= \n\t"
+
+	"vmovddup (%6) , %%xmm0 \n\t" // alpha real part, duplicated
+	"vmovddup 8(%6) , %%xmm1 \n\t" // alpha imaginary part, duplicated
+
+	// combine the partial products into (real, imag) pairs
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+	"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+	"vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t"
+	"vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t"
+	"vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t"
+#else
+	"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+	"vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t"
+	"vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t"
+	"vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t"
+	"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+	"vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t"
+#endif
+
+	// horizontal reduction: add the upper 128-bit lane to the lower one
+	"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
+	"vextractf128 $1, %%ymm10, %%xmm11 \n\t"
+
+	"vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+	"vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+
+	"vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+	"vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+	"vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
+	"vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+	"vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+	"vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t"
+	"vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+	"vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t"
+#else
+	"vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+	"vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+	"vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+	"vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+	"vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+	"vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+#endif
+
+	// y[0..1] += scaled sums
+	"vaddpd (%3) , %%xmm8 , %%xmm8 \n\t"
+	"vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t"
+
+	"vmovups %%xmm8 , (%3) \n\t"
+	"vmovups %%xmm10, 16(%3) \n\t"
+
+	"vzeroupper \n\t"
+
+	:
+	  "+r" (i),	// 0  (advanced by the loop)
+	  "+r" (n)	// 1  (counted down to zero by the loop)
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (ap[0]),	// 4
+	  "r" (ap[1]),	// 5
+	  "r" (alpha)	// 6
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+#define HAVE_KERNEL_4x1 1
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+/*
+ * zgemv_kernel_4x1 - single-column complex double dot-product kernel
+ * (AVX + FMA).
+ *
+ * Accumulates t = sum_k op(ap[k]) * x[k] over n complex elements and then
+ * updates y[0] in place:
+ *
+ *   - op(a) = a        when neither or both of CONJ / XCONJ are defined,
+ *     op(a) = conj(a)  otherwise;
+ *   - y[0] += alpha * t        when XCONJ is not defined,
+ *     y[0] += alpha * conj(t)  when XCONJ is defined.
+ *
+ * n      number of complex elements. The loop is a do-while that consumes
+ *        4 elements per pass and only exits when the counter reaches exactly
+ *        zero, so n must be a positive multiple of 4.
+ * ap     the column, read contiguously as interleaved (real, imag) doubles.
+ * x      input vector, n interleaved (real, imag) double pairs.
+ * y      one complex value (2 doubles), read and written.
+ * alpha  alpha[0] is the real part, alpha[1] the imaginary part.
+ *
+ * The asm advances the element index (%0) and counts n (%1) down, so both
+ * are declared as read-write "+r" operands. Declaring them as plain inputs
+ * would let the compiler assume their registers are unchanged on exit.
+ */
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	BLASLONG i = 0;	// index into x / ap, counted in doubles (8 bytes each)
+
+	__asm__ __volatile__
+	(
+	"vzeroupper \n\t"
+
+	"vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // sum of a * x_real
+	"vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // sum of a * x_imag
+
+	".align 16 \n\t"
+	".L01LOOP%=: \n\t"
+
+	// first half of the pass: complex elements k and k+1
+	"prefetcht0 192(%2,%0,8) \n\t"
+	"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
+	"prefetcht0 192(%4,%0,8) \n\t"
+	"vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+	"vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+	"vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1
+	"vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+	"vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+	"vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+	"vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // += ar0*xr0, ai0*xr0, ar1*xr1, ai1*xr1
+	"vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // += ar0*xi0, ai0*xi0, ar1*xi1, ai1*xi1
+
+	// second half of the pass: complex elements k+2 and k+3
+	"vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+	"vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
+	"vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+	"vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1
+	"vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+	"vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+	"vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+	"vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // += ar0*xr0, ai0*xr0, ar1*xr1, ai1*xr1
+	"vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // += ar0*xi0, ai0*xi0, ar1*xi1, ai1*xi1
+
+	"addq $8 , %0 \n\t" // 4 complex elements = 8 doubles consumed
+	"subq $4 , %1 \n\t"
+	"jnz .L01LOOP%= \n\t"
+
+	"vmovddup (%5) , %%xmm0 \n\t" // alpha real part, duplicated
+	"vmovddup 8(%5) , %%xmm1 \n\t" // alpha imaginary part, duplicated
+
+	// combine the partial products into (real, imag) pairs
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+	"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+	"vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t"
+#else
+	"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+	"vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t"
+	"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+	// horizontal reduction: add the upper 128-bit lane to the lower one
+	"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
+
+	"vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+
+	"vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+	"vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+	"vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+	"vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+#else
+	"vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+	"vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+	"vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+#endif
+
+	// y[0] += scaled sum
+	"vaddpd (%3) , %%xmm8 , %%xmm8 \n\t"
+
+	"vmovups %%xmm8 , (%3) \n\t"
+
+	"vzeroupper \n\t"
+
+	:
+	  "+r" (i),	// 0  (advanced by the loop)
+	  "+r" (n)	// 1  (counted down to zero by the loop)
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (ap),	// 4
+	  "r" (alpha)	// 5
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+
+