/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #include "common.h" #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 #define HAVE_KERNEL_ADDY 1 #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif /** * if IGNORE_TEMP_PERM is defined we store and use ybuffer as {real,real} {imag,imag} * if not we will retrieve and store it the normal way */ #if (defined(HAVE_KERNEL_4x4_VEC_ASM) || defined(HAVE_KERNEL_4x4_VEC) ) && defined(HAVE_KERNEL_4x2_VEC) && defined(HAVE_KERNEL_4x1_VEC) && defined(HAVE_KERNEL_ADDY) // #define IGNORE_TEMP_PERM 1 #endif #define NBMAX 1024 #ifdef HAVE_KERNEL_4x4_VEC_ASM #elif HAVE_KERNEL_4x4_VEC static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; register __vector double vx0_r = {x[0],x[0]}; register __vector double vx0_i = {x[1],x[1]}; register __vector double vx1_r = {x[2],x[2]}; register __vector double vx1_i = {x[3],x[3]}; register __vector double vx2_r = {x[4],x[4]}; register __vector double vx2_i = {x[5],x[5]}; register __vector double vx3_r = {x[6],x[6]}; register __vector double vx3_i = {x[7],x[7]}; #ifdef IGNORE_TEMP_PERM register __vector double *vy = (__vector double *)y; register BLASLONG j=0; #endif for ( i=0; i< 2*n; i+=4 ) { #ifdef IGNORE_TEMP_PERM register __vector double vresult_r = vy[j]; register __vector double vresult_i = vy[j+1]; #else register __vector double vresult_r = {y[i],y[i+2]}; register __vector double vresult_i = {y[i+1],y[i+3]}; #endif register __vector double va0_r= {a0[i],a0[i+2]}; register __vector double va0_i= {a0[i+1],a0[i+3]}; register __vector double va1_r= {a1[i],a1[i+2]}; register __vector double va1_i= {a1[i+1],a1[i+3]}; register __vector double va2_r= {a2[i],a2[i+2]}; register __vector double va2_i= {a2[i+1],a2[i+3]}; register __vector double va3_r= 
{a3[i],a3[i+2]}; register __vector double va3_i= {a3[i+1],a3[i+3]}; #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vresult_r = va0_r * vx0_r - (va0_i*vx0_i -vresult_r) ; vresult_i = vresult_i + va0_r * vx0_i + va0_i * vx0_r ; vresult_r = va1_r * vx1_r - (va1_i*vx1_i -vresult_r) ; vresult_i = vresult_i + va1_r * vx1_i + va1_i * vx1_r ; vresult_r = va2_r * vx2_r - (va2_i*vx2_i -vresult_r) ; vresult_i = vresult_i + va2_r * vx2_i + va2_i * vx2_r ; vresult_r = va3_r * vx3_r - (va3_i*vx3_i -vresult_r) ; vresult_i = vresult_i + va3_r * vx3_i + va3_i * vx3_r ; #else vresult_r = vresult_r + va0_r * vx0_r + va0_i*vx0_i ; vresult_i = va0_r * vx0_i - ( va0_i * vx0_r - vresult_i) ; vresult_r = vresult_r + va1_r * vx1_r + va1_i*vx1_i ; vresult_i = va1_r * vx1_i - ( va1_i * vx1_r - vresult_i) ; vresult_r = vresult_r + va2_r * vx2_r + va2_i*vx2_i ; vresult_i = va2_r * vx2_i - ( va2_i * vx2_r - vresult_i) ; vresult_r = vresult_r + va3_r * vx3_r + va3_i*vx3_i ; vresult_i = va3_r * vx3_i - ( va3_i * vx3_r - vresult_i) ; #endif #ifdef IGNORE_TEMP_PERM vy[j] = vresult_r ; vy[j+1] = vresult_i ; j+=2; #else y[i] = vresult_r[0]; y[i+1] = vresult_i[0]; y[i +2 ] = vresult_r[1]; y[i + 3 ] = vresult_i[1]; #endif } } #else static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; y[i] += a1[i]*x[2] - a1[i+1] * x[3]; y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; y[i] += a2[i]*x[4] - a2[i+1] * x[5]; y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; y[i] += a3[i]*x[6] - a3[i+1] * x[7]; y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; #else y[i] += a0[i]*x[0] + a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; y[i] += a1[i]*x[2] + a1[i+1] * x[3]; y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; y[i] += a2[i]*x[4] + 
a2[i+1] * x[5]; y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; y[i] += a3[i]*x[6] + a3[i+1] * x[7]; y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; #endif } } #endif #ifdef HAVE_KERNEL_4x2_VEC static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1; a0 = ap[0]; a1 = ap[1]; register __vector double vx0_r = {x[0],x[0]}; register __vector double vx0_i = {x[1],x[1]}; register __vector double vx1_r = {x[2],x[2]}; register __vector double vx1_i = {x[3],x[3]}; #ifdef IGNORE_TEMP_PERM register __vector double *vy = (__vector double *)y; register BLASLONG j=0; #endif for ( i=0; i< 2*n; i+=4 ) { #ifdef IGNORE_TEMP_PERM register __vector double vresult_r = vy[j]; register __vector double vresult_i = vy[j+1]; #else register __vector double vresult_r = {y[i],y[i+2]}; register __vector double vresult_i = {y[i+1],y[i+3]}; #endif register __vector double va0_r= {a0[i],a0[i+2]}; register __vector double va0_i= {a0[i+1],a0[i+3]}; register __vector double va1_r= {a1[i],a1[i+2]}; register __vector double va1_i= {a1[i+1],a1[i+3]}; #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vresult_r = va0_r * vx0_r - (va0_i*vx0_i -vresult_r) ; vresult_i = vresult_i + va0_r * vx0_i + va0_i * vx0_r ; vresult_r = va1_r * vx1_r - (va1_i*vx1_i -vresult_r) ; vresult_i = vresult_i + va1_r * vx1_i + va1_i * vx1_r ; #else vresult_r = vresult_r + va0_r * vx0_r + va0_i*vx0_i ; vresult_i = va0_r * vx0_i - ( va0_i * vx0_r - vresult_i) ; vresult_r = vresult_r + va1_r * vx1_r + va1_i*vx1_i ; vresult_i = va1_r * vx1_i - ( va1_i * vx1_r - vresult_i) ; #endif #ifdef IGNORE_TEMP_PERM vy[j] = vresult_r ; vy[j+1] = vresult_i ; j+=2; #else y[i] = vresult_r[0]; y[i+1] = vresult_i[0]; y[i +2 ] = vresult_r[1]; y[i + 3 ] = vresult_i[1]; #endif } } #else static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1; a0 = ap[0]; a1 = ap[1]; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( 
defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; y[i] += a1[i]*x[2] - a1[i+1] * x[3]; y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; #else y[i] += a0[i]*x[0] + a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; y[i] += a1[i]*x[2] + a1[i+1] * x[3]; y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; #endif } } #endif #ifdef HAVE_KERNEL_4x1_VEC static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; a0 = ap; register __vector double vx_r = {x[0],x[0]}; register __vector double vx_i = {x[1],x[1]}; #ifdef IGNORE_TEMP_PERM register __vector double *vy = (__vector double *)y; register BLASLONG j=0; #endif for ( i=0; i< 2*n; i+=4 ) { #ifdef IGNORE_TEMP_PERM register __vector double vresult_r = vy[j]; register __vector double vresult_i = vy[j+1]; #else register __vector double vresult_r = {y[i],y[i+2]}; register __vector double vresult_i = {y[i+1],y[i+3]}; #endif register __vector double va0_r= {a0[i],a0[i+2]}; register __vector double va0_i= {a0[i+1],a0[i+3]}; #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vresult_r = va0_r * vx_r - (va0_i*vx_i -vresult_r) ; vresult_i = vresult_i + va0_r * vx_i + va0_i * vx_r ; #else vresult_r = vresult_r + va0_r * vx_r + va0_i*vx_i ; vresult_i = va0_r * vx_i - ( va0_i * vx_r - vresult_i) ; // y[i] += a0[i]*x[0] + a0[i+1] * x[1]; // y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; #endif #ifndef IGNORE_TEMP_PERM y[i] = vresult_r[0]; y[i+1] = vresult_i[0]; y[i +2 ] = vresult_r[1]; y[i + 3 ] = vresult_i[1]; #else vy[j] = vresult_r ; vy[j+1] = vresult_i ; j+=2; #endif } } #else static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; a0 = ap; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; #else y[i] += a0[i]*x[0] + a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] - 
a0[i+1] * x[0]; #endif } } #endif #ifdef HAVE_KERNEL_ADDY static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; #ifdef IGNORE_TEMP_PERM register __vector double *src_vec = (__vector double *)src; #endif register __vector double valpha_r = {alpha_r,alpha_r}; register __vector double valpha_i = {alpha_i,alpha_i}; register __vector double vresult_r; register __vector double vresult_i; if ( inc_dest != 2 ) { for ( i=0; i