diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a960708ef..08f8cc69d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -166,5 +166,5 @@ In chronological order: * [2017-01-01] dgemm and dtrmm kernels for IBM z13 * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 - + * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 diff --git a/README.md b/README.md index c933f62eb..6712d5188 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ Please read GotoBLAS_01Readme.txt - **ARM Cortex-A57**: Experimental #### IBM zEnterprise System: -- **Z13**: Optimized Level-3 BLAS +- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) ### Support OS: diff --git a/interface/axpy.c b/interface/axpy.c index 61b7b4d78..f0d95b395 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -40,8 +40,12 @@ #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" +#endif +#if defined(Z13) +#define MULTI_THREAD_MINIMAL 200000 +#else +#define MULTI_THREAD_MINIMAL 10000 #endif - #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ @@ -88,7 +92,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //Temporarily work-around the low performance issue with small imput size & //multithreads. 
- if (n <= 10000) + if (n <= MULTI_THREAD_MINIMAL) nthreads = 1; if (nthreads == 1) { diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index 9e3650bff..8e5c3706a 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -15,14 +15,14 @@ SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c ISAMAXKERNEL = ../arm/iamax.c -IDAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = idamax.c ICAMAXKERNEL = ../arm/izamax.c -IZAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = izamax.c ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = idamin.c ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = izamin.c ISMAXKERNEL = ../arm/imax.c IDMAXKERNEL = ../arm/imax.c @@ -31,24 +31,24 @@ ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c SASUMKERNEL = ../arm/asum.c -DASUMKERNEL = ../arm/asum.c +DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c -ZASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = zasum.c SAXPYKERNEL = ../arm/axpy.c -DAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = daxpy.c CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = zaxpy.c SCOPYKERNEL = ../arm/copy.c -DCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = dcopy.c CCOPYKERNEL = ../arm/zcopy.c -ZCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = zcopy.c SDOTKERNEL = ../arm/dot.c -DDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ddot.c CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = zdot.c SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c @@ -56,29 +56,29 @@ CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c SROTKERNEL = ../arm/rot.c -DROTKERNEL = ../arm/rot.c +DROTKERNEL = drot.c CROTKERNEL = ../arm/zrot.c -ZROTKERNEL = ../arm/zrot.c +ZROTKERNEL = zrot.c SSCALKERNEL = ../arm/scal.c -DSCALKERNEL = ../arm/scal.c +DSCALKERNEL = dscal.c CSCALKERNEL = ../arm/zscal.c -ZSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = zscal.c SSWAPKERNEL = ../arm/swap.c -DSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = dswap.c CSWAPKERNEL = ../arm/zswap.c -ZSWAPKERNEL 
= ../arm/zswap.c +ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c -DGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = ../arm/gemv_t.c -DGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c new file mode 100644 index 000000000..c0b5ab930 --- /dev/null +++ b/kernel/zarch/dasum.c @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + + +static FLOAT __attribute__ ((noinline)) dasum_kernel_32(BLASLONG n, FLOAT *x) { + + __asm__ ( + "pfd 1, 0(%1) \n\t" + "sllg %%r0,%0,3 \n\t" + "agr %%r0,%1 \n\t" + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%1 ) \n\t" + "vlm %%v24,%%v31, 0(%1 ) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v0,%%v0,%%v24 \n\t" + "vfadb %%v1,%%v1,%%v25 \n\t" + "vfadb %%v2,%%v2,%%v26 \n\t" + "vfadb %%v3,%%v3,%%v27 \n\t" + "vfadb %%v0,%%v0,%%v28 \n\t" + "vfadb %%v1,%%v1,%%v29 \n\t" + "vfadb %%v2,%%v2,%%v30 \n\t" + "vfadb %%v3,%%v3,%%v31 \n\t" + + "vlm %%v24,%%v31, 128(%1) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + "la %1,256(%1) \n\t" + "vfadb %%v0,%%v0,%%v24 \n\t" + "vfadb %%v1,%%v1,%%v25 \n\t" + "vfadb %%v2,%%v2,%%v26 \n\t" + "vfadb %%v3,%%v3,%%v27 \n\t" + "vfadb %%v0,%%v0,%%v28 \n\t" + "vfadb 
%%v1,%%v1,%%v29 \n\t" + "vfadb %%v2,%%v2,%%v30 \n\t" + "vfadb %%v3,%%v3,%%v31 \n\t" + + "clgrjl %1,%%r0,1b \n\t" + "vfadb %%v24,%%v0,%%v1 \n\t" + "vfadb %%v25,%%v2,%%v3 \n\t" + "vfadb %%v0,%%v25,%%v24 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + : + : "r"(n), "a"(x) + : "cc", "memory","r0","f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" + ); + +} + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return sumf; + + if (inc_x == 1) { + + n1 = n & -32; + + if (n1 > 0) { + + sumf = dasum_kernel_32(n1, x); + i = n1; + } + + while (i < n) { + sumf += ABS(x[i]); + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += ABS(x[i]); + i += inc_x; + j++; + } + + + } + return sumf; +} + + diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c new file mode 100644 index 000000000..b6cbdfee8 --- /dev/null +++ b/kernel/zarch/daxpy.c @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#define Z13_D 1 +#define PREFETCH_INS 1 +#if defined(Z13_A) +#include + +static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i = 0; + __vector double v_a = {*alpha,*alpha}; + __vector double * v_y=(__vector double *)y; + __vector double * v_x=(__vector double *)x; + + for(; i 0) { + dcopy_kernel_32(n1, x, y); + i = n1; + } + + while (i < n) { + y[i] = x[i]; + i++; + + } + + + } else { + + BLASLONG n1 = n & -4; + + while (i < n1) { + + y[iy] = x[ix]; + y[iy + inc_y] = x[ix + inc_x]; + y[iy + 2 * inc_y] = x[ix + 2 * inc_x]; + y[iy + 3 * inc_y] = x[ix + 3 * inc_x]; + + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; + + } + + while (i < n) { + + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return 0; + + +} + + diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c new file mode 100644 index 000000000..e4a2bd95b --- /dev/null +++ b/kernel/zarch/ddot.c @@ -0,0 +1,194 @@ 
+/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(Z13) +static void __attribute__ ((noinline)) ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + + __asm__ volatile( + "pfd 1, 0(%1) \n\t" + "pfd 1, 0(%2) \n\t" + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%1) \n\t" + "pfd 1, 256(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + + "vl %%v28, 0(%%r1,%2) \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vl %%v29, 16(%%r1,%2) \n\t" + "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" + + "vl %%v30, 32(%%r1,%2) \n\t" + "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" + "vl %%v31, 48(%%r1,%2) \n\t" + "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + + "vl %%v28, 64(%%r1,%2) \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vl %%v29, 80(%%r1,%2) \n\t" + "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" + + + "vl %%v30, 96(%%r1,%2) \n\t" + "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" + "vl %%v31, 112(%%r1,%2) \n\t" + "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" + + + "la %%r1,128(%%r1) \n\t" + "brctg %%r0,1b \n\t" + "vfadb %%v24,%%v25,%%v24 \n\t" + "vfadb %%v24,%%v26,%%v24 \n\t" + "vfadb %%v24,%%v27,%%v24 \n\t" + "vrepg %%v1,%%v24,1 \n\t" + "vfadb %%v1,%%v24,%%v1 \n\t" + " std %%f1,0(%3) \n\t" + : + :"r"(n),"a"(x),"a"(y),"a"(d) + :"cc" , "memory" ,"r0","r1","v16", "v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + + ); + +} + + +#else + +static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * 
x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + ddot_kernel_8(n1, x, y , &dot ); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c new file mode 100644 index 000000000..5897b2c17 --- /dev/null +++ b/kernel/zarch/dgemv_n_4.c @@ -0,0 +1,487 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#define NBMAX 2048 + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) + #include +#endif + +#ifdef HAVE_KERNEL_4x4 + +#elif HAVE_KERNEL_4x4_VEC + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector double v_x0 = {x0,x0}; + __vector double v_x1 = {x1,x1}; + __vector double v_x2 = {x2,x2}; + __vector double v_x3 = {x3,x3}; + __vector double* v_y =(__vector double*)y; + __vector double* va0 = (__vector double*)ap[0]; + __vector double* va1 = (__vector double*)ap[1]; + __vector double* va2 = (__vector double*)ap[2]; + __vector double* va3 = (__vector double*)ap[3]; + + for ( i=0; i< n/2; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; + } +} + +#else + +static void 
dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4] __attribute__ ((aligned (16))); + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + + +#endif + +#ifdef HAVE_KERNEL_4x2 + +#elif HAVE_KERNEL_4x2_VEC + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + __vector double v_x0 = {x0,x0}; + __vector double v_x1 = {x1,x1}; + __vector double* v_y =(__vector double*)y; + __vector double* va0 = (__vector double*)ap[0]; + __vector double* va1 = (__vector double*)ap[1]; + + for ( i=0; i< n/2; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } +} +#else + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + FLOAT x[4] __attribute__ ((aligned (16))); + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i<2; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifdef HAVE_KERNEL_4x1 + +#elif HAVE_KERNEL_4x1_VEC +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0; + x0 = xo[0] * *alpha; + __vector double v_x0 = {x0,x0}; + __vector double* v_y =(__vector double*)y; + __vector double* va0 = (__vector double*)ap; + + for ( i=0; i< n/2; i+=2 ) + { + v_y[i] += 
v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + + +} + +#else +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + FLOAT x[4] __attribute__ ((aligned (16))); + a0 = ap; + + for ( i=0; i<1; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + 
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += 
alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c new file mode 100644 index 000000000..96af0139c --- /dev/null +++ b/kernel/zarch/dgemv_t_4.c @@ -0,0 +1,541 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) + #include +#endif +#define NBMAX 2048 + +#ifdef HAVE_KERNEL_4x4 + +#elif HAVE_KERNEL_4x4_VEC + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + __vector double* va0 = (__vector double*)ap[0]; + __vector double* va1 = (__vector double*)ap[1]; + __vector double* va2 = (__vector double*)ap[2]; + __vector double* va3 = (__vector double*)ap[3]; + __vector double* v_x =(__vector double*)x; + __vector double temp0 = {0,0}; + __vector double temp1 = {0,0}; + __vector double temp2 = {0,0}; + __vector double temp3 = {0,0}; + + for ( i=0; i< n/2; i+=2 ) + { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; + } + + y[0] = temp0[0] + temp0[1]; + y[1] = temp1[0] + temp1[1]; + y[2] = temp2[0] + temp2[1]; + y[3] = temp3[0] + temp3[1];; +} +#else +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + 
FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +#ifdef HAVE_KERNEL_4x2 + +#elif HAVE_KERNEL_4x2_VEC + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + __vector double* va0 = (__vector double*)ap[0]; + __vector double* va1 = (__vector double*)ap[1]; + __vector double* v_x =(__vector double*)x; + __vector double temp0 = {0,0}; + __vector double temp1 = {0,0}; + + for ( i=0; i< n/2; i+=2 ) + { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; + } + + y[0] = temp0[0] + temp0[1]; + y[1] = temp1[0] + temp1[1]; +} +#else +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + +} +#endif + +#ifdef HAVE_KERNEL_4x1 + +#elif HAVE_KERNEL_4x1_VEC + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + __vector double* va0 = (__vector double*)a0; + __vector double* v_x =(__vector double*)x; + __vector double temp0 = {0,0}; + + for ( i=0; i< n/2; i+=2 ) + { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; + } + + y[0] = temp0[0] + temp0[1]; +} +#else +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + + FLOAT temp0 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + 
a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + y[0] = temp0; +} +#endif + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + drot_kernel_32(n1, x, y, &cosa, &sina); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - 
s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c new file mode 100644 index 000000000..846b9737c --- /dev/null +++ b/kernel/zarch/dscal.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(Z13) +static void __attribute__ ((noinline)) dscal_kernel_8( BLASLONG n, FLOAT da , FLOAT *x ) +{ + + __asm__ ("pfd 2, 0(%1) \n\t" + "vrepg %%v0 , %%v0,0 \n\t" + "sllg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%1) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%1) \n\t" + "vl %%v24, 64(%%r1,%1) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%1) \n\t" + "la %%r1,128(%%r1) \n\t" + "clgrjl %%r1,%%r0,1b \n\t" + : + :"r"(n),"a"(x),"f"(da) + :"cc" , "memory" ,"r0","r1","v0","v24","v25","v26","v27" + ); + +} + +static void __attribute__ ((noinline)) dscal_kernel_8_zero( BLASLONG n, FLOAT da , FLOAT *x ) +{ + + __asm__ ("pfd 2, 0(%1) \n\t" + "vzero %%v0 \n\t" + "sllg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%1) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v0, 16(%%r1,%1) \n\t" + "vst %%v0, 32(%%r1,%1) \n\t" + "vst %%v0, 48(%%r1,%1) \n\t" + "vst %%v0, 64(%%r1,%1) \n\t" + "vst %%v0, 80(%%r1,%1) \n\t" + "vst %%v0, 96(%%r1,%1) \n\t" + "vst %%v0, 112(%%r1,%1) \n\t" + "la %%r1,128(%%r1) \n\t" + "clgrjl %%r1,%%r0,1b \n\t" + : + :"r"(n),"a"(x),"f"(da) + :"cc" , "memory" ,"r0","r1","v0" + ); +} + + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG 
dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0,j=0; + if ( n <= 0 || inc_x <=0 ) + return(0); + + + if ( inc_x == 1 ) + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + + dscal_kernel_8_zero(n1 , da , x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + dscal_kernel_8(n1 , da , x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -4; + + while (j < n1) { + + x[i]=0.0; + x[i + inc_x]=0.0; + x[i + 2 * inc_x]=0.0; + x[i + 3 * inc_x]=0.0; + + i += inc_x * 4; + j += 4; + + } + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + BLASLONG n1 = n & -4; + + while (j < n1) { + + x[i] = da * x[i] ; + x[i + inc_x] = da * x[i + inc_x]; + x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; + x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; + + i += inc_x * 4; + j += 4; + + } + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c new file mode 100644 index 000000000..b98347870 --- /dev/null +++ b/kernel/zarch/dswap.c @@ -0,0 +1,382 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#include "common.h" + +#define Z13_SWAP_C 1 + +#if defined(Z13_SWAP_A) +static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 1, 0(%1) \n\t" + "pfd 2, 0(%2) \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%1) \n\t" + "pfd 2, 256(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vst %%v16, 0(%%r1,%1) \n\t" + + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%1) \n\t" + + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%1) \n\t" + + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%1) \n\t" + + "vl %%v28, 64(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vst %%v28, 
64(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%1) \n\t" + + "vl %%v29, 80(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vst %%v29, 80(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%1) \n\t" + + "vl %%v30, 96(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vst %%v30, 96(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%1) \n\t" + + "vl %%v31, 112(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + "vst %%v31, 112(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%1) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v16, 128(%%r1,%1) \n\t" + + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v17, 144(%%r1,%1) \n\t" + + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v18, 160(%%r1,%1) \n\t" + + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v19, 176(%%r1,%1) \n\t" + + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v20, 192(%%r1,%1) \n\t" + + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%1) \n\t" + + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%1) \n\t" + + "vl %%v31, 240(%%r1,%1) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%1) \n\t" + + "la %%r1,256(%%r1) \n\t" + "brctg %%r0,1b" + : + : "r"(n), "a"(x), "a"(y) + : "cc", "memory" ,"r0","r1", "v16","v17","v18","v19","v20","v21","v22","v23" + ,"v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + +} + +#elif defined(Z13_SWAP_B) +static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 2, 0(%1) \n\t" + "pfd 2, 0(%2) \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 
256(%%r1,%1) \n\t" + "pfd 2, 256(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v28, 64(%%r1,%1) \n\t" + "vl %%v29, 80(%%r1,%1) \n\t" + "vl %%v30, 96(%%r1,%1) \n\t" + "vl %%v31, 112(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vst %%v24, 0(%%r1,%2) \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vst %%v28, 64(%%r1,%2) \n\t" + "vst %%v29, 80(%%r1,%2) \n\t" + "vst %%v30, 96(%%r1,%2) \n\t" + "vst %%v31, 112(%%r1,%2)\n\t" + "vst %%v16, 0(%%r1,%1) \n\t" + "vst %%v17, 16(%%r1,%1) \n\t" + "vst %%v18, 32(%%r1,%1) \n\t" + "vst %%v19, 48(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%1) \n\t" + "vst %%v21, 80(%%r1,%1) \n\t" + "vst %%v22, 96(%%r1,%1) \n\t" + "vst %%v23, 112(%%r1,%1)\n\t" + + + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + + "vst %%v16, 128(%%r1,%1) \n\t" + "vst %%v17, 144(%%r1,%1) \n\t" + "vst %%v18, 160(%%r1,%1) \n\t" + "vst %%v19, 176(%%r1,%1) \n\t" + "vst %%v20, 
192(%%r1,%1) \n\t" + "vst %%v21, 208(%%r1,%1) \n\t" + "vst %%v22, 224(%%r1,%1) \n\t" + "vst %%v23, 240(%%r1,%1) \n\t" + + + "la %%r1,256(%%r1) \n\t" + "brctg %%r0,1b" + : + : "r"(n), "a"(x), "a"(y) + : "cc", "memory","r0","r1", "v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + + ); + return; + +} + +#elif defined(Z13_SWAP_C) +static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 2, 0(%1) \n\t" + "pfd 2, 0(%2) \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%1) \n\t" + "pfd 2, 256(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 
160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + + "la %%r1,256(%%r1) \n\t" + "brctg %%r0,1b" + : + : "r"(n), "a"(x), "a"(y) + : "cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + dswap_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c new file mode 100644 index 000000000..7926adf44 --- /dev/null +++ b/kernel/zarch/idamax.c @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +static BLASLONG __attribute__((noinline)) diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + __asm__( + "pfd 1, 0(%1) \n\t" + "sllg %%r0,%0,3 \n\t" + "agr %%r0,%1 \n\t" + "VLEIG %%v20,0,0 \n\t" + "VLEIG %%v20,1,1 \n\t" + "VLEIG %%v21,2,0 \n\t" + "VLEIG %%v21,3,1 \n\t" + "VLEIG %%v22,4,0 \n\t" + "VLEIG %%v22,5,1 \n\t" + "VLEIG %%v23,6,0 \n\t" + "VLEIG %%v23,7,1 \n\t" + "VREPIG %%v4,8 \n\t" + "vzero %%v5 \n\t" + "vzero %%v18 \n\t" + "vzero %%v19 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%1 ) \n\t" + "vlm %%v24,%%v31, 0(%1 ) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfchdb %%v16,%%v25,%%v24 \n\t " + "vfchdb %%v17,%%v27,%%v26 \n\t " + "vsel %%v1,%%v21,%%v20,%%v16 \n\t" + "vsel %%v0,%%v25,%%v24,%%v16 \n\t" + "vsel %%v2,%%v23,%%v22,%%v17 \n\t" + "vsel %%v3,%%v27,%%v26,%%v17 \n\t" + "vfchdb %%v16,%%v29,%%v28 \n\t " + "vfchdb %%v17,%%v31,%%v30 \n\t" + "vsel %%v24,%%v21,%%v20,%%v16 \n\t" + "vsel %%v25,%%v29,%%v28,%%v16 \n\t" + "vsel %%v26,%%v23,%%v22,%%v17 \n\t" + "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + + + "vfchdb %%v28, %%v3,%%v0 \n\t" + "vfchdb %%v29,%%v27, %%v25 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + + "VAG %%v1,%%v1,%%v5 \n\t" + "VAG %%v24,%%v24,%%v5 \n\t" + "VAG %%v24,%%v24,%%v4 \n\t" + + "vfchdb %%v16,%%v25 , %%v0 \n\t" + "VAG %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + + "vfchdb %%v17, %%v29,%%v18 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + + "VAG 
%%v5,%%v5,%%v4 \n\t" + "vlm %%v24,%%v31,128(%1 ) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfchdb %%v16,%%v25,%%v24 \n\t " + "vfchdb %%v17,%%v27,%%v26 \n\t " + "vsel %%v1,%%v21,%%v20,%%v16 \n\t" + "vsel %%v0,%%v25,%%v24,%%v16 \n\t" + "vsel %%v2,%%v23,%%v22,%%v17 \n\t" + "vsel %%v3,%%v27,%%v26,%%v17 \n\t" + "vfchdb %%v16,%%v29,%%v28 \n\t " + "vfchdb %%v17,%%v31,%%v30 \n\t" + "vsel %%v24,%%v21,%%v20,%%v16 \n\t" + "vsel %%v25,%%v29,%%v28,%%v16 \n\t" + "vsel %%v26,%%v23,%%v22,%%v17 \n\t" + "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + + + "vfchdb %%v28, %%v3,%%v0 \n\t" + "vfchdb %%v29,%%v27, %%v25 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + + "VAG %%v1,%%v1,%%v5 \n\t" + "VAG %%v24,%%v24,%%v5 \n\t" + "la %1,256(%1) \n\t" + "VAG %%v24,%%v24,%%v4 \n\t" + + "vfchdb %%v16,%%v25 , %%v0 \n\t" + "VAG %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + + "vfchdb %%v17, %%v29,%%v18 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + + "VAG %%v5,%%v5,%%v4 \n\t" + + "clgrjl %1,%%r0,1b \n\t" + + + "vrepg %%v26,%%v18,1 \n\t" + "vrepg %%v5,%%v19,1 \n\t" + "wfcdb %%v26,%%v18 \n\t" + "jne 2f \n\t" + "VSTEG %%v18,0(%2),0 \n\t" + "VMNLG %%v1,%%v5,%%v19 \n\t" + "VLGVG %%r2,%%v1,0 \n\t" + "br %%r14 \n\t" + "2: \n\t" + "wfchdb %%v16,%%v26,%%v18 \n\t" + "vsel %%v1,%%v5,%%v19,%%v16 \n\t" + "vsel %%v0,%%v26,%%v18,%%v16 \n\t" + "VLGVG %%r2,%%v1,0 \n\t" + "std %%f0,0(%2) \n\t" + + : + : "r"(n), "a"(x), "a"(maxf) + : "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + + ); + +} + +BLASLONG 
/*
 * IDAMAX: 1-based index of the first element with the largest |x[i]|.
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG best_idx = 0;
    BLASLONG idx;

    if (n <= 0 || inc_x <= 0)
        return best_idx;

    if (inc_x == 1) {
        /* Contiguous case: the vector kernel scans multiples of 32
           elements and reports both the running max and its index. */
        BLASLONG chunk = n & -32;

        if (chunk > 0)
            best_idx = diamax_kernel_32_TUNED(chunk, x, &best);

        for (idx = chunk; idx < n; idx++) {
            if (ABS(x[idx]) > best) {
                best = ABS(x[idx]);
                best_idx = idx;
            }
        }
    } else {
        /* Strided scan; strict '>' keeps the FIRST occurrence of the
           maximum, matching BLAS semantics. */
        BLASLONG pos = 0;

        for (idx = 0; idx < n; idx++, pos += inc_x) {
            if (ABS(x[pos]) > best) {
                best = ABS(x[pos]);
                best_idx = idx;
            }
        }
    }
    return best_idx + 1;
}
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +static BLASLONG __attribute__((noinline)) diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + __asm__( + "pfd 1, 0(%1) \n\t" + "sllg %%r0,%0,3 \n\t" + "agr %%r0,%1 \n\t" + "VLEIG %%v20,0,0 \n\t" + "VLEIG %%v20,1,1 \n\t" + "VLEIG %%v21,2,0 \n\t" + "VLEIG %%v21,3,1 \n\t" + "VLEIG %%v22,4,0 \n\t" + "VLEIG %%v22,5,1 \n\t" + "VLEIG %%v23,6,0 \n\t" + "VLEIG %%v23,7,1 \n\t" + "VREPIG %%v4,8 \n\t" + "vzero %%v5 \n\t" + "vlrepg %%v18,0(%1) \n\t" + "vzero %%v19 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%1 ) \n\t" + "vlm %%v24,%%v31, 0(%1 ) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfchdb %%v16,%%v24,%%v25 \n\t " + "vfchdb %%v17,%%v26 ,%%v27 \n\t " + "vsel %%v1,%%v21,%%v20,%%v16 \n\t" + "vsel %%v0,%%v25,%%v24,%%v16 \n\t" + "vsel %%v2,%%v23,%%v22,%%v17 \n\t" + "vsel %%v3,%%v27,%%v26,%%v17 \n\t" + "vfchdb %%v16,%%v28, 
%%v29 \n\t " + "vfchdb %%v17,%%v30,%%v31 \n\t" + "vsel %%v24,%%v21,%%v20,%%v16 \n\t" + "vsel %%v25,%%v29,%%v28,%%v16 \n\t" + "vsel %%v26,%%v23,%%v22,%%v17 \n\t" + "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + + + "vfchdb %%v28,%%v0 , %%v3 \n\t" + "vfchdb %%v29, %%v25,%%v27 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + + "VAG %%v1,%%v1,%%v5 \n\t" + "VAG %%v24,%%v24,%%v5 \n\t" + "VAG %%v24,%%v24,%%v4 \n\t" + + "vfchdb %%v16, %%v0,%%v25 \n\t" + "VAG %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + + "vfchdb %%v17,%%v18, %%v29 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + + "VAG %%v5,%%v5,%%v4 \n\t" + + "vlm %%v24,%%v31,128(%1 ) \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfchdb %%v16,%%v24,%%v25 \n\t" + "vfchdb %%v17,%%v26 ,%%v27 \n\t" + "vsel %%v1,%%v21,%%v20,%%v16 \n\t" + "vsel %%v0,%%v25,%%v24,%%v16 \n\t" + "vsel %%v2,%%v23,%%v22,%%v17 \n\t" + "vsel %%v3,%%v27,%%v26,%%v17 \n\t" + "vfchdb %%v16,%%v28 ,%%v29 \n\t" + "vfchdb %%v17,%%v30,%%v31 \n\t" + "vsel %%v24,%%v21,%%v20,%%v16 \n\t" + "vsel %%v25,%%v29,%%v28,%%v16 \n\t" + "vsel %%v26,%%v23,%%v22,%%v17 \n\t" + "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + + + "vfchdb %%v28,%%v0 , %%v3 \n\t" + "vfchdb %%v29, %%v25,%%v27 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + + "VAG %%v1,%%v1,%%v5 \n\t" + "VAG %%v24,%%v24,%%v5 \n\t" + "la %1,256(%1) \n\t" + "VAG %%v24,%%v24,%%v4 \n\t" + + "vfchdb %%v16, %%v0,%%v25 \n\t" + "VAG %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + + "vfchdb %%v17,%%v18, %%v29 
\n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + + "VAG %%v5,%%v5,%%v4 \n\t" + + "clgrjl %1,%%r0,1b \n\t" + + + "vrepg %%v26,%%v18,1 \n\t" + "vrepg %%v5,%%v19,1 \n\t" + "wfcdb %%v26,%%v18 \n\t" + "jne 2f \n\t" + "VSTEG %%v18,0(%2),0 \n\t" + "VMNLG %%v1,%%v5,%%v19 \n\t" + "VLGVG %%r2,%%v1,0 \n\t" + "br %%r14 \n\t" + "2: \n\t" + "wfchdb %%v16,%%v18 ,%%v26 \n\t " + "vsel %%v1,%%v5,%%v19,%%v16 \n\t" + "vsel %%v0,%%v26,%%v18,%%v16 \n\t" + "VLGVG %%r2,%%v1,0 \n\t" + "std %%f0,0(%2) \n\t" + + : + : "r"(n), "a"(x), "a"(maxf) + : "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + + ); + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = diamin_kernel_32(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c new file mode 100644 index 000000000..3966dfa1c --- /dev/null +++ b/kernel/zarch/izamax.c @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2017, 
The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include + +#define ABS fabs +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + +static BLASLONG __attribute__((noinline)) ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + __asm__( + "pfd 1, 0(%1) \n\t" + "VLEIG %%v16,0,0 \n\t" + "VLEIG %%v16,1,1 \n\t" + "VLEIG %%v17,2,0 \n\t" + "VLEIG %%v17,3,1 \n\t" + "VLEIG %%v18,4,0 \n\t" + "VLEIG %%v18,5,1 \n\t" + "VLEIG %%v19,6,0 \n\t" + "VLEIG %%v19,7,1 \n\t" + "VLEIG %%v20,8,0 \n\t" + "VLEIG %%v20,9,1 \n\t" + "VLEIG %%v21,10,0 \n\t" + "VLEIG %%v21,11,1 \n\t" + "VLEIG %%v22,12,0 \n\t" + "VLEIG %%v22,13,1 \n\t" + "VLEIG %%v23,14,0 \n\t" + "VLEIG %%v23,15,1 \n\t" + + + "sllg %%r0,%0,4 \n\t" + "agr %%r0,%1 \n\t" + "vzero %%v6 \n\t" + "vzero %%v7 \n\t" + "VREPIG %%v4,16 \n\t" + "vzero %%v5 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%1 ) \n\t" + + "vleg %%v24 , 0( %1),0 \n\t" + "vleg %%v25 , 8( %1),0 \n\t" + "vleg %%v24 , 16( %1),1 \n\t" + "vleg %%v25 , 24( %1),1 \n\t" + "vleg %%v26 , 32( %1),0 \n\t" + "vleg %%v27 , 40( %1),0 \n\t" + "vleg %%v26 , 48( %1),1 \n\t" + "vleg %%v27 , 56( %1),1 \n\t" + "vleg %%v28 , 64( %1),0 \n\t" + "vleg %%v29 , 72( %1),0 \n\t" + "vleg %%v28 , 80( %1),1 \n\t" + "vleg %%v29 , 88( %1),1 \n\t" + "vleg %%v30 , 96( %1),0 \n\t" + "vleg %%v31 ,104( %1),0 \n\t" + "vleg %%v30 ,112( %1),1 \n\t" + "vleg %%v31 ,120( %1),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v0,%%v24,%%v25 \n\t" + "vfadb %%v1,%%v26,%%v27 \n\t" + "vfadb %%v2,%%v28,%%v29 \n\t" + "vfadb %%v3,%%v30,%%v31 \n\t" + + + "vleg %%v24 , 128( %1),0 \n\t" + "vleg %%v25 , 136( %1),0 \n\t" + "vleg %%v24 , 144( %1),1 \n\t" + "vleg %%v25 , 152( %1),1 \n\t" + "vleg %%v26 , 160( %1),0 \n\t" + "vleg %%v27 , 168( %1),0 \n\t" + "vleg 
%%v26 , 176( %1),1 \n\t" + "vleg %%v27 , 184( %1),1 \n\t" + "vleg %%v28 , 192( %1),0 \n\t" + "vleg %%v29 , 200( %1),0 \n\t" + "vleg %%v28 , 208( %1),1 \n\t" + "vleg %%v29 , 216( %1),1 \n\t" + "vleg %%v30 , 224( %1),0 \n\t" + "vleg %%v31 , 232( %1),0 \n\t" + "vleg %%v30 , 240( %1),1 \n\t" + "vleg %%v31 , 248( %1),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" + + "vfchdb %%v25,%%v1,%%v0 \n\t" + "vsel %%v29,%%v17,%%v16,%%v25 \n\t" + "vsel %%v31,%%v1,%%v0,%%v25 \n\t" + + "vfchdb %%v27,%%v3,%%v2 \n\t " + "vsel %%v0,%%v19,%%v18,%%v27 \n\t" + "vsel %%v1,%%v3,%%v2,%%v27 \n\t" + + "vfchdb %%v25,%%v26,%%v24 \n\t " + "vsel %%v2,%%v21,%%v20,%%v25 \n\t" + "vsel %%v3,%%v26,%%v24,%%v25 \n\t" + + "vfchdb %%v27,%%v30,%%v28 \n\t " + "vsel %%v25,%%v23,%%v22,%%v27 \n\t" + "vsel %%v27,%%v30,%%v28,%%v27 \n\t" + + "vfchdb %%v24, %%v1,%%v31 \n\t" + "vsel %%v26,%%v0,%%v29,%%v24 \n\t" + "vsel %%v28,%%v1,%%v31,%%v24 \n\t" + + "vfchdb %%v30, %%v27,%%v3 \n\t" + "vsel %%v29,%%v25,%%v2,%%v30 \n\t" + "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" + + "la %1,256(%1) \n\t" + + "vfchdb %%v0, %%v31,%%v28 \n\t" + "vsel %%v25,%%v29,%%v26,%%v0 \n\t" + "vsel %%v27,%%v31,%%v28,%%v0 \n\t" + + "VAG %%v25,%%v25,%%v5 \n\t" + + //cmp with previous + "vfchdb %%v30, %%v27,%%v6 \n\t" + "vsel %%v7,%%v25,%%v7,%%v30 \n\t" + "vsel %%v6,%%v27,%%v6,%%v30 \n\t" + + "VAG %%v5,%%v5,%%v4 \n\t" + + "clgrjl %1,%%r0,1b \n\t" + + //xtract index + "vrepg %%v26,%%v6,1 \n\t" + "vrepg %%v5,%%v7,1 \n\t" + "wfcdb %%v26,%%v6 \n\t" + "jne 2f \n\t" + "VSTEG %%v6,0(%2),0 \n\t" + "VMNLG %%v1,%%v5,%%v7 \n\t" + "VLGVG %%r2,%%v1,0 \n\t" + "br %%r14 \n\t" + "2: \n\t" + "wfchdb %%v16,%%v26,%%v6 \n\t" + "vsel %%v1,%%v5,%%v7,%%v16 
\n\t" + "vsel %%v0,%%v26,%%v6,%%v16 \n\t" + "VLGVG %%r2,%%v1,0 \n\t" + "std %%f0,0(%2) \n\t" + + : + : "r"(n), "a"(x), "a"(maxf) + : "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + + ); + +} + + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + + max = ziamax_kernel_8_TUNED(n1, x, &maxf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c new file mode 100644 index 000000000..4af1410bb --- /dev/null +++ b/kernel/zarch/izamin.c @@ -0,0 +1,259 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include + +#define ABS fabs +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + +static BLASLONG __attribute__((noinline)) ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { + + __asm__( + "pfd 1, 0(%1) \n\t" + "VLEIG %%v16,0,0 \n\t" + "VLEIG %%v16,1,1 \n\t" + "VLEIG %%v17,2,0 \n\t" + "VLEIG %%v17,3,1 \n\t" + "VLEIG %%v18,4,0 \n\t" + "VLEIG %%v18,5,1 \n\t" + "VLEIG %%v19,6,0 \n\t" + "VLEIG %%v19,7,1 \n\t" + "VLEIG %%v20,8,0 \n\t" + "VLEIG %%v20,9,1 \n\t" + "VLEIG %%v21,10,0 \n\t" + "VLEIG %%v21,11,1 \n\t" + "VLEIG %%v22,12,0 \n\t" + "VLEIG %%v22,13,1 \n\t" + "VLEIG %%v23,14,0 \n\t" + "VLEIG %%v23,15,1 \n\t" + "ld %%f6,0(%1) \n\t" + "lpdbr %%f6,%%f6 \n\t" + "ld %%f7,8(%1) \n\t" + "lpdbr %%f7,%%f7 \n\t" + "adbr %%f6,%%f7 \n\t" + "sllg %%r0,%0,4 \n\t" + "agr %%r0,%1 \n\t" + "vrepg %%v6,%%v6,0 \n\t" + "vzero %%v7 \n\t" + "VREPIG %%v4,16 \n\t" + "vzero %%v5 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%1 ) \n\t" + + 
"vleg %%v24 , 0( %1),0 \n\t" + "vleg %%v25 , 8( %1),0 \n\t" + "vleg %%v24 , 16( %1),1 \n\t" + "vleg %%v25 , 24( %1),1 \n\t" + "vleg %%v26 , 32( %1),0 \n\t" + "vleg %%v27 , 40( %1),0 \n\t" + "vleg %%v26 , 48( %1),1 \n\t" + "vleg %%v27 , 56( %1),1 \n\t" + "vleg %%v28 , 64( %1),0 \n\t" + "vleg %%v29 , 72( %1),0 \n\t" + "vleg %%v28 , 80( %1),1 \n\t" + "vleg %%v29 , 88( %1),1 \n\t" + "vleg %%v30 , 96( %1),0 \n\t" + "vleg %%v31 ,104( %1),0 \n\t" + "vleg %%v30 ,112( %1),1 \n\t" + "vleg %%v31 ,120( %1),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v0,%%v24,%%v25 \n\t" + "vfadb %%v1,%%v26,%%v27 \n\t" + "vfadb %%v2,%%v28,%%v29 \n\t" + "vfadb %%v3,%%v30,%%v31 \n\t" + + + "vleg %%v24 ,128( %1),0 \n\t" + "vleg %%v25 ,136( %1),0 \n\t" + "vleg %%v24 ,144( %1),1 \n\t" + "vleg %%v25 ,152( %1),1 \n\t" + "vleg %%v26 ,160( %1),0 \n\t" + "vleg %%v27 ,168( %1),0 \n\t" + "vleg %%v26 ,176( %1),1 \n\t" + "vleg %%v27 ,184( %1),1 \n\t" + "vleg %%v28 ,192( %1),0 \n\t" + "vleg %%v29 ,200( %1),0 \n\t" + "vleg %%v28 ,208( %1),1 \n\t" + "vleg %%v29 ,216( %1),1 \n\t" + "vleg %%v30 ,224( %1),0 \n\t" + "vleg %%v31 ,232( %1),0 \n\t" + "vleg %%v30 ,240( %1),1 \n\t" + "vleg %%v31 ,248( %1),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" + + + "vfchdb %%v25,%%v0 ,%%v1 \n\t" + "vsel %%v29,%%v17,%%v16,%%v25 \n\t" + "vsel %%v31,%%v1,%%v0,%%v25 \n\t" + + "vfchdb %%v27,%%v2,%%v3 \n\t" + "vsel %%v0,%%v19,%%v18,%%v27 \n\t" + "vsel %%v1,%%v3,%%v2,%%v27 \n\t" + + "vfchdb %%v25,%%v24,%%v26 
\n\t" + "vsel %%v2,%%v21,%%v20,%%v25 \n\t" + "vsel %%v3,%%v26,%%v24,%%v25 \n\t" + + "vfchdb %%v27,%%v28,%%v30 \n\t" + "vsel %%v25,%%v23,%%v22,%%v27 \n\t" + "vsel %%v27,%%v30,%%v28,%%v27 \n\t" + + "vfchdb %%v24,%%v31, %%v1 \n\t" + "vsel %%v26,%%v0,%%v29,%%v24 \n\t" + "vsel %%v28,%%v1,%%v31,%%v24 \n\t" + + "vfchdb %%v30,%%v3, %%v27 \n\t" + "vsel %%v29,%%v25,%%v2,%%v30 \n\t" + "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" + + "la %1,256(%1) \n\t" + + "vfchdb %%v0,%%v28, %%v31 \n\t" + "vsel %%v25,%%v29,%%v26,%%v0 \n\t" + "vsel %%v27,%%v31,%%v28,%%v0 \n\t" + + "VAG %%v25,%%v25,%%v5 \n\t" + + //cmp with previous + "vfchdb %%v30,%%v6 , %%v27 \n\t" + "vsel %%v7,%%v25,%%v7,%%v30 \n\t" + "vsel %%v6,%%v27,%%v6,%%v30 \n\t" + + "VAG %%v5,%%v5,%%v4 \n\t" + + "clgrjl %1,%%r0,1b \n\t" + + //xtract index + "vrepg %%v26,%%v6,1 \n\t" + "vrepg %%v5,%%v7,1 \n\t" + "wfcdb %%v26,%%v6 \n\t" + "jne 2f \n\t" + "VSTEG %%v6,0(%2),0 \n\t" + "VMNLG %%v1,%%v5,%%v7 \n\t" + "VLGVG %%r2,%%v1,0 \n\t" + "br %%r14 \n\t" + "2: \n\t" + "wfchdb %%v16,%%v6 ,%%v26 \n\t" + "vsel %%v1,%%v5,%%v7,%%v16 \n\t" + "vsel %%v0,%%v26,%%v6,%%v16 \n\t" + "VLGVG %%r2,%%v1,0 \n\t" + "std %%f0,0(%2) \n\t" + + : + : "r"(n), "a"(x), "a"(minf) + : "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + + ); + +} + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + + if (inc_x == 1) { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + + min = ziamin_kernel_8_TUNED(n1, x, &minf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = 
CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c new file mode 100644 index 000000000..a9504f2d6 --- /dev/null +++ b/kernel/zarch/zasum.c @@ -0,0 +1,156 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif

/*
 * Vector core for DZASUM on z13: sum of |re|+|im| over n complex doubles.
 * Caller guarantees n > 0 and n a multiple of 16 (the loop consumes
 * 256 bytes = 16 complex elements per iteration).
 *
 * Four independent accumulators (v0, v1, v22, v23) hide the FP add
 * latency; they are folded at the end.
 *
 * NOTE(review): the result is left in %f0 by the final "adbr" with no C
 * return statement -- this relies on the s390x ABI (f0 = FP return) and
 * on the noinline attribute preventing inlining.  Kept as-is.
 */
static FLOAT __attribute__ ((noinline)) zasum_kernel_16(BLASLONG n, FLOAT *x) {

    __asm__ (
        "pfd 1, 0(%1) \n\t"
        "sllg %%r0,%0,4 \n\t"             /* r0 = end pointer: x + n*16 bytes */
        "agr %%r0,%1 \n\t"
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v22 \n\t"
        "vzero %%v23 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%1 ) \n\t"
        "vlm %%v24,%%v31,0(%1) \n\t"       /* load 8 vectors = 8 complex */

        "vflpdb %%v24, %%v24 \n\t"         /* |.| of every lane */
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"

        "vfadb %%v0,%%v0,%%v24 \n\t"
        "vfadb %%v1,%%v1,%%v25 \n\t"
        "vfadb %%v23,%%v23,%%v26 \n\t"
        "vfadb %%v22,%%v22,%%v27 \n\t"
        "vfadb %%v0,%%v0,%%v28 \n\t"
        "vfadb %%v1,%%v1,%%v29 \n\t"
        "vfadb %%v23,%%v23,%%v30 \n\t"
        "vfadb %%v22,%%v22,%%v31 \n\t"

        "vlm %%v24,%%v31, 128(%1 ) \n\t"   /* second 8 complex of the chunk */

        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        "la %1,256(%1) \n\t"
        "vfadb %%v0,%%v0,%%v24 \n\t"
        "vfadb %%v1,%%v1,%%v25 \n\t"
        "vfadb %%v23,%%v23,%%v26 \n\t"
        "vfadb %%v22,%%v22,%%v27 \n\t"
        "vfadb %%v0,%%v0,%%v28 \n\t"
        "vfadb %%v1,%%v1,%%v29 \n\t"
        "vfadb %%v23,%%v23,%%v30 \n\t"
        "vfadb %%v22,%%v22,%%v31 \n\t"

        "clgrjl %1,%%r0,1b \n\t"
        /* horizontal fold: 4 accumulators -> 2 lanes -> scalar f0 */
        "vfadb %%v24,%%v0,%%v1 \n\t"
        "vfadb %%v25,%%v23,%%v22 \n\t"
        "vfadb %%v0,%%v25,%%v24 \n\t"
        "vrepg %%v1,%%v0,1 \n\t"
        "adbr %%f0,%%f1 \n\t"
        :
        : "r"(n), "a"(x)
        : "cc", "memory","r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

}

/*
 * DZASUM: returns sum_{i<n} |re(x_i)| + |im(x_i)|.
 * Returns 0 for n <= 0 or inc_x <= 0.  Unit stride uses the vector
 * kernel for the largest multiple of 16 and finishes with a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i=0;
    BLASLONG ip=0;          /* double-index into x (2 doubles per element) */
    FLOAT sumf = 0.0;
    BLASLONG n1;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(sumf);

    if ( inc_x == 1 )
    {

        n1 = n & -16;   /* vector kernel needs a multiple of 16 */
        if ( n1 > 0 )
        {

            sumf=zasum_kernel_16(n1, x );
            i=n1;
            ip=2*n1;    /* tail resumes right after the vectorized part */
        }

        while(i < n)
        {
            sumf += ABS(x[ip]) + ABS(x[ip+1]);
            i++;
            ip+=2;
        }

    }
    else
    {
        inc_x2 = 2* inc_x;

        while(i < n)
        {
            sumf += ABS(x[ip]) + ABS(x[ip+1]);
            ip+=inc_x2;
            i++;
        }

    }
    return(sumf);
}
/*
 * Vector core for ZAXPY on z13: y += (da_r + i*da_i) * x for n complex
 * doubles, unit stride.  Caller guarantees n > 0 and n a multiple of 8
 * (the loop consumes 128 bytes = 8 complex elements per iteration).
 *
 * alpha points to {da_r, da_i}; with CONJ defined the conjugate update
 * y += conj(alpha)*x-style variant is computed (matches the scalar tail
 * in CNAME below).
 *
 * NOTE(review): "srlg %3,%0,3" reuses input operand %3 (alpha pointer)
 * as the loop counter, modifying an input that is not declared as an
 * output/clobber -- technically invalid extended asm that happens to
 * work here; confirm before enabling higher optimization levels or
 * newer compilers.
 */
static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
    __asm__ ("pfd 1, 0(%1) \n\t"
        "pfd 2, 0(%2) \n\t"
        "vlrepg %%v28 , 0(%3) \n\t"       /* v28 = {da_r, da_r} */
        "vlrepg %%v29, 8(%3) \n\t"        /* v29 = {da_i, da_i} */
        "srlg %3,%0,3 \n\t"               /* iteration count = n/8 */
        "xgr %%r1,%%r1 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%%r1,%1) \n\t"
        "pfd 2, 256(%%r1,%2) \n\t"
        /* y: even lanes = real parts (v16/v18), odd = imag (v17/v19) */
        "vleg %%v16 , 0(%%r1,%2),0 \n\t"
        "vleg %%v17 , 8(%%r1,%2),0 \n\t"
        "vleg %%v16 , 16(%%r1,%2),1 \n\t"
        "vleg %%v17 , 24(%%r1,%2),1 \n\t"

        "vleg %%v18 , 32(%%r1,%2),0 \n\t"
        "vleg %%v19 , 40(%%r1,%2),0 \n\t"
        "vleg %%v18 , 48(%%r1,%2),1 \n\t"
        "vleg %%v19 , 56(%%r1,%2),1 \n\t"

        /* x: v24/v26 = real, v25/v27 = imag */
        "vleg %%v24 , 0(%%r1,%1),0 \n\t"
        "vleg %%v25 , 8(%%r1,%1),0 \n\t"
        "vleg %%v24 , 16(%%r1,%1),1 \n\t"
        "vleg %%v25 , 24(%%r1,%1),1 \n\t"

        "vleg %%v26 , 32(%%r1,%1),0 \n\t"
        "vleg %%v27 , 40(%%r1,%1),0 \n\t"
        "vleg %%v26 , 48(%%r1,%1),1 \n\t"
        "vleg %%v27 , 56(%%r1,%1),1 \n\t"
#if !defined(CONJ)
        /* y_r += x_r*da_r - x_i*da_i ; y_i += x_r*da_i + x_i*da_r */
        "vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t"
        "vfmadb %%v17, %%v24, %%v29, %%v17 \n\t"
        "vfmsdb %%v18, %%v27, %%v29, %%v18 \n\t"
        "vfmadb %%v19, %%v26, %%v29, %%v19 \n\t"

        "vfmsdb %%v16, %%v24, %%v28 ,%%v16 \n\t"
        "vfmadb %%v17, %%v25, %%v28, %%v17 \n\t"
        "vfmsdb %%v18, %%v26, %%v28, %%v18 \n\t"
        "vfmadb %%v19, %%v27, %%v28, %%v19 \n\t"
#else
        /* y_r += x_r*da_r + x_i*da_i ; y_i += x_r*da_i - x_i*da_r */
        "vfmadb %%v16, %%v25, %%v29, %%v16 \n\t"
        "vfmsdb %%v17, %%v25, %%v28, %%v17 \n\t"
        "vfmadb %%v18, %%v27, %%v29, %%v18 \n\t"
        "vfmsdb %%v19, %%v27, %%v28, %%v19 \n\t"
        "vfmadb %%v16, %%v24, %%v28, %%v16 \n\t"
        "vfmsdb %%v17, %%v24, %%v29, %%v17 \n\t"
        "vfmadb %%v18, %%v26, %%v28, %%v18 \n\t"
        "vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t"

#endif
        "vsteg %%v16 , 0(%%r1,%2),0 \n\t"
        "vsteg %%v17 , 8(%%r1,%2),0 \n\t"
        "vsteg %%v16 , 16(%%r1,%2),1 \n\t"
        "vsteg %%v17 , 24(%%r1,%2),1 \n\t"

        "vsteg %%v18 , 32(%%r1,%2),0 \n\t"
        "vsteg %%v19 , 40(%%r1,%2),0 \n\t"
        "vsteg %%v18 , 48(%%r1,%2),1 \n\t"
        "vsteg %%v19 , 56(%%r1,%2),1 \n\t"

        /* second group of 4 complex elements (offsets 64..120) */
        "vleg %%v20 , 64(%%r1,%2),0 \n\t"
        "vleg %%v21 , 72(%%r1,%2),0 \n\t"
        "vleg %%v20 , 80(%%r1,%2),1 \n\t"
        "vleg %%v21 , 88(%%r1,%2),1 \n\t"

        "vleg %%v22 , 96(%%r1,%2),0 \n\t"
        "vleg %%v23 , 104(%%r1,%2),0 \n\t"
        "vleg %%v22 , 112(%%r1,%2),1 \n\t"
        "vleg %%v23 , 120(%%r1,%2),1 \n\t"

        "vleg %%v24 , 64(%%r1,%1),0 \n\t"
        "vleg %%v25 , 72(%%r1,%1),0 \n\t"
        "vleg %%v24 , 80(%%r1,%1),1 \n\t"
        "vleg %%v25 , 88(%%r1,%1),1 \n\t"

        "vleg %%v26 , 96(%%r1,%1),0 \n\t"
        "vleg %%v27 , 104(%%r1,%1),0 \n\t"
        "vleg %%v26 , 112(%%r1,%1),1 \n\t"
        "vleg %%v27 , 120(%%r1,%1),1 \n\t"
#if !defined(CONJ)
        "vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t"
        "vfmadb %%v21, %%v24, %%v29, %%v21 \n\t"
        "vfmsdb %%v22, %%v27, %%v29, %%v22 \n\t"
        "vfmadb %%v23, %%v26, %%v29, %%v23 \n\t"

        "vfmsdb %%v20, %%v24, %%v28 ,%%v20 \n\t"
        "vfmadb %%v21, %%v25, %%v28, %%v21 \n\t"
        "vfmsdb %%v22, %%v26, %%v28, %%v22 \n\t"
        "vfmadb %%v23, %%v27, %%v28, %%v23 \n\t"
#else
        "vfmadb %%v20, %%v25, %%v29, %%v20 \n\t"
        "vfmsdb %%v21, %%v25, %%v28, %%v21 \n\t"
        "vfmadb %%v22, %%v27, %%v29, %%v22 \n\t"
        "vfmsdb %%v23, %%v27, %%v28, %%v23 \n\t"
        "vfmadb %%v20, %%v24, %%v28, %%v20 \n\t"
        "vfmsdb %%v21, %%v24, %%v29, %%v21 \n\t"
        "vfmadb %%v22, %%v26, %%v28, %%v22 \n\t"
        "vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t"
#endif
        "vsteg %%v20 , 64(%%r1,%2),0 \n\t"
        "vsteg %%v21 , 72(%%r1,%2),0 \n\t"
        "vsteg %%v20 , 80(%%r1,%2),1 \n\t"
        "vsteg %%v21 , 88(%%r1,%2),1 \n\t"

        "vsteg %%v22 , 96(%%r1,%2),0 \n\t"
        "vsteg %%v23 , 104(%%r1,%2),0 \n\t"
        "vsteg %%v22 , 112(%%r1,%2),1 \n\t"
        "vsteg %%v23 , 120(%%r1,%2),1 \n\t"

        "la %%r1,128(%%r1) \n\t"
        "brctg %3,1b"
        :
        : "r"(n), "a"(x), "a"(y), "a"(alpha)
        : "cc", "memory", "r1","v16",
          "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29"
    );

}

/*
 * ZAXPY: y := alpha*x + y for complex double vectors.
 * Unit-stride case runs the vector kernel on the largest multiple of 8
 * and a scalar tail on the remainder; otherwise a plain strided loop.
 * dummy* parameters exist only to match the common OpenBLAS kernel
 * signature.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT da[2];

    if (n <= 0) return (0);

    if ((inc_x == 1) && (inc_y == 1)) {

        BLASLONG n1 = n & -8;

        if (n1) {
            da[0] = da_r;
            da[1] = da_i;
            zaxpy_kernel_8(n1, x, y, da);
            ix = 2 * n1;    /* tail resumes after the vectorized elements */
        }
        i = n1;
        while (i < n) {
#if !defined(CONJ)
            y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
            y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
            y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
            y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
            i++;
            ix += 2;

        }
        return (0);


    }

    /* general strides: 2 doubles per complex element */
    inc_x *= 2;
    inc_y *= 2;

    while (i < n) {

#if !defined(CONJ)
        y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
        y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
        y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
        y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
        ix += inc_x;
        iy += inc_y;
        i++;

    }
    return (0);

}
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static void __attribute__ ((noinline)) zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + + __asm__ volatile( + "pfd 1, 0(%1) \n\t" + "pfd 2, 0(%2) \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%1) \n\t" + "pfd 2, 256(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%1) \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + + "vl %%v28, 64(%%r1,%1) \n\t" + "vst %%v28, 64(%%r1,%2) \n\t" + "vl %%v29, 80(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%2) \n\t" + "vl %%v30, 96(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%2) \n\t" + "vl %%v31,112(%%r1,%1) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + + "vl %%v24,128(%%r1,%1) \n\t" + "vst %%v24,128(%%r1,%2) \n\t" + + "vl %%v25,144(%%r1,%1) \n\t" + "vst %%v25,144(%%r1,%2) \n\t" + + "vl %%v26,160(%%r1,%1) \n\t" + "vst %%v26,160(%%r1,%2) \n\t" + + "vl %%v27,176(%%r1,%1) \n\t" + "vst %%v27,176(%%r1,%2) \n\t" + + "vl %%v28, 192(%%r1,%1) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + "la %%r1,256(%%r1) \n\t" + "brctg %%r0,1b" + : + : "r"(n), "a"(x), "a"(y) + : "cc", "memory","r0","r1","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + +} + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + zcopy_kernel_16(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + 
+ } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c new file mode 100644 index 000000000..478cf252b --- /dev/null +++ b/kernel/zarch/zdot.c @@ -0,0 +1,216 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*
 * Vector core for ZDOT on z13: accumulates the four partial products
 * needed for a complex dot product over n complex doubles (n > 0, n a
 * multiple of 8; the loop consumes 128 bytes = 8 complex per iteration).
 *
 * On exit d[0..3] = {sum x_r*y_r, sum x_i*y_i, sum x_r*y_i, sum x_i*y_r};
 * CNAME combines them according to CONJ.  vpdi swaps re/im within a
 * vector so the cross terms can use the same FMA stream.
 */
static void __attribute__ ((noinline)) zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {

    __asm__ volatile(
        "pfd 1, 0(%1) \n\t"
        "pfd 1, 0(%2) \n\t"
        "vzero %%v24 \n\t"
        "vzero %%v25 \n\t"
        "vzero %%v26 \n\t"
        "vzero %%v27 \n\t"
        "srlg %%r0,%0,3 \n\t"             /* iteration count = n/8 */
        "xgr %%r1,%%r1 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%%r1,%1) \n\t"
        "pfd 1, 256(%%r1,%2) \n\t"
        "vl %%v16, 0(%%r1,%1) \n\t"
        "vl %%v17, 16(%%r1,%1) \n\t"
        "vl %%v18, 32(%%r1,%1) \n\t"
        "vl %%v19, 48(%%r1,%1) \n\t"
        "vl %%v28, 0(%%r1,%2) \n\t"
        "vl %%v29, 16(%%r1,%2) \n\t"
        "vl %%v30, 32(%%r1,%2) \n\t"
        "vl %%v31, 48(%%r1,%2) \n\t"
        "vpdi %%v20,%%v16,%%v16,4 \n\t"    /* swap re/im halves of x */
        "vpdi %%v21,%%v17,%%v17,4 \n\t"
        "vpdi %%v22,%%v18,%%v18,4 \n\t"
        "vpdi %%v23,%%v19,%%v19,4 \n\t"

        "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
        "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
        "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
        "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
        "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
        "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
        "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
        "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"

        "vl %%v16, 64(%%r1,%1) \n\t"
        "vl %%v17, 80(%%r1,%1) \n\t"
        "vl %%v18, 96(%%r1,%1) \n\t"
        "vl %%v19,112(%%r1,%1) \n\t"
        "vl %%v28, 64(%%r1,%2) \n\t"
        "vl %%v29, 80(%%r1,%2) \n\t"
        "vl %%v30, 96(%%r1,%2) \n\t"
        "vl %%v31,112(%%r1,%2) \n\t"
        "vpdi %%v20,%%v16,%%v16,4 \n\t"
        "vpdi %%v21,%%v17,%%v17,4 \n\t"
        "vpdi %%v22,%%v18,%%v18,4 \n\t"
        "vpdi %%v23,%%v19,%%v19,4 \n\t"
        "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
        "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
        "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
        "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
        "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
        "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
        "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
        "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"

        "la %%r1,128(%%r1) \n\t"
        "brctg %%r0,1b \n\t"
        /* fold accumulator pairs and scatter lanes into d[0..3] */
        "vfadb %%v24,%%v26,%%v24 \n\t"
        "vfadb %%v25,%%v25,%%v27 \n\t"
        "vsteg %%v24,0(%3),0 \n\t"
        "vsteg %%v24,8(%3),1 \n\t"
        "vsteg %%v25,16(%3),1 \n\t"
        "vsteg %%v25,24(%3),0 \n\t"
        :
        : "r"(n), "a"(x), "a"(y), "a"(d)
        : "cc", "memory","r0","r1","v16",
          "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

}

/*
 * Scalar fallback computing the same four partial sums, unrolled by 4
 * complex elements.  NOTE(review): not referenced anywhere in this file
 * -- appears to be a kept-around fallback for the asm kernel; candidate
 * for removal or for wiring behind a feature test.
 */
static __attribute__ ((noinline)) void zdot_kernel_8n(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
    BLASLONG register i = 0;
    FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0};
    BLASLONG j = 0;

    while (i < n) {

        dot[0] += x[j] * y[j];
        dot[1] += x[j + 1] * y[j + 1];
        dot[2] += x[j] * y[j + 1];
        dot[3] += x[j + 1] * y[j];

        dot[0] += x[j + 2] * y[j + 2];
        dot[1] += x[j + 3] * y[j + 3];
        dot[2] += x[j + 2] * y[j + 3];
        dot[3] += x[j + 3] * y[j + 2];

        dot[0] += x[j + 4] * y[j + 4];
        dot[1] += x[j + 5] * y[j + 5];
        dot[2] += x[j + 4] * y[j + 5];
        dot[3] += x[j + 5] * y[j + 4];

        dot[0] += x[j + 6] * y[j + 6];
        dot[1] += x[j + 7] * y[j + 7];
        dot[2] += x[j + 6] * y[j + 7];
        dot[3] += x[j + 7] * y[j + 6];

        j += 8;
        i += 4;

    }
    d[0] = dot[0];
    d[1] = dot[1];
    d[2] = dot[2];
    d[3] = dot[3];

}

/*
 * ZDOT / ZDOTC: complex dot product.  Without CONJ: sum x_i * y_i;
 * with CONJ: sum conj(x_i) * y_i.  Returns 0+0i for n <= 0.
 * Unit stride runs the vector kernel on the largest multiple of 16
 * (kernel itself only needs a multiple of 8 -- conservative) plus a
 * scalar tail; otherwise a plain strided loop.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG i;
    BLASLONG ix, iy;
    OPENBLAS_COMPLEX_FLOAT result;
    /* dot[0..3] = partial sums; 16-byte alignment for the vsteg stores */
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);

    }

    if ((inc_x == 1) && (inc_y == 1)) {

        BLASLONG n1 = n & -16;

        if (n1)
            zdot_kernel_8(n1, x, y, dot);

        i = n1;
        BLASLONG j = i * 2;   /* tail resumes after the vectorized part */

        while (i < n) {

            dot[0] += x[j] * y[j];
            dot[1] += x[j + 1] * y[j + 1];
            dot[2] += x[j] * y[j + 1];
            dot[3] += x[j + 1] * y[j];

            j += 2;
            i++;

        }


    } else {
        i = 0;
        ix = 0;
        iy = 0;
        inc_x <<= 1;
        inc_y <<= 1;
        while (i < n) {

            dot[0] += x[ix] * y[iy];
            dot[1] += x[ix + 1] * y[iy + 1];
            dot[2] += x[ix] * y[iy + 1];
            dot[3] += x[ix + 1] * y[iy];

            ix += inc_x;
            iy += inc_y;
            i++;

        }
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];

#endif

    return (result);

}
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include "common.h" + + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) + #include <vecintrin.h> +#endif + +/** + * If IGNORE_TEMP_PERM is defined we store and use ybuffer as {real,real} {img,img}; + * if not we will retrieve and store the normal way + */ +#if (defined(HAVE_KERNEL_4x4_VEC_ASM) || defined(HAVE_KERNEL_4x4_VEC) ) && defined(HAVE_KERNEL_4x2_VEC) && defined(HAVE_KERNEL_4x1_VEC) && defined(HAVE_KERNEL_ADDY) + // #define IGNORE_TEMP_PERM 1 +#endif + +#define NBMAX 1024 + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + register __vector double vx0_r = {x[0],x[0]}; + register __vector double vx0_i = {x[1],x[1]}; + register __vector double vx1_r = {x[2],x[2]}; + register __vector double vx1_i = {x[3],x[3]}; + register __vector double vx2_r = {x[4],x[4]}; + register __vector double vx2_i = {x[5],x[5]}; + register __vector double vx3_r = {x[6],x[6]}; + register __vector double vx3_i = {x[7],x[7]}; + +#ifdef IGNORE_TEMP_PERM + register __vector double *vy = (__vector double *)y; + register BLASLONG j=0; +#endif + + 
for ( i=0; i< 2*n; i+=4 ) + { + +#ifdef IGNORE_TEMP_PERM + register __vector double vresult_r = vy[j]; + register __vector double vresult_i = vy[j+1]; + +#else + register __vector double vresult_r = {y[i],y[i+2]}; + register __vector double vresult_i = {y[i+1],y[i+3]}; +#endif + register __vector double va0_r= {a0[i],a0[i+2]}; + register __vector double va0_i= {a0[i+1],a0[i+3]}; + register __vector double va1_r= {a1[i],a1[i+2]}; + register __vector double va1_i= {a1[i+1],a1[i+3]}; + register __vector double va2_r= {a2[i],a2[i+2]}; + register __vector double va2_i= {a2[i+1],a2[i+3]}; + register __vector double va3_r= {a3[i],a3[i+2]}; + register __vector double va3_i= {a3[i+1],a3[i+3]}; +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + vresult_r = va0_r * vx0_r - (va0_i*vx0_i -vresult_r) ; + vresult_i = vresult_i + va0_r * vx0_i + va0_i * vx0_r ; + vresult_r = va1_r * vx1_r - (va1_i*vx1_i -vresult_r) ; + vresult_i = vresult_i + va1_r * vx1_i + va1_i * vx1_r ; + vresult_r = va2_r * vx2_r - (va2_i*vx2_i -vresult_r) ; + vresult_i = vresult_i + va2_r * vx2_i + va2_i * vx2_r ; + vresult_r = va3_r * vx3_r - (va3_i*vx3_i -vresult_r) ; + vresult_i = vresult_i + va3_r * vx3_i + va3_i * vx3_r ; + +#else + vresult_r = vresult_r + va0_r * vx0_r + va0_i*vx0_i ; + vresult_i = va0_r * vx0_i - ( va0_i * vx0_r - vresult_i) ; + vresult_r = vresult_r + va1_r * vx1_r + va1_i*vx1_i ; + vresult_i = va1_r * vx1_i - ( va1_i * vx1_r - vresult_i) ; + vresult_r = vresult_r + va2_r * vx2_r + va2_i*vx2_i ; + vresult_i = va2_r * vx2_i - ( va2_i * vx2_r - vresult_i) ; + vresult_r = vresult_r + va3_r * vx3_r + va3_i*vx3_i ; + vresult_i = va3_r * vx3_i - ( va3_i * vx3_r - vresult_i) ; +#endif + +#ifdef IGNORE_TEMP_PERM + vy[j] = vresult_r ; + vy[j+1] = vresult_i ; + j+=2; + +#else + y[i] = vresult_r[0]; + y[i+1] = vresult_i[0]; + y[i +2 ] = vresult_r[1]; + y[i + 3 ] = vresult_i[1]; +#endif + + } + +} + +#else +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, 
FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; +#endif + } +} + +#endif + + + +#ifdef HAVE_KERNEL_4x2_VEC + +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + + register __vector double vx0_r = {x[0],x[0]}; + register __vector double vx0_i = {x[1],x[1]}; + register __vector double vx1_r = {x[2],x[2]}; + register __vector double vx1_i = {x[3],x[3]}; +#ifdef IGNORE_TEMP_PERM + register __vector double *vy = (__vector double *)y; + register BLASLONG j=0; +#endif + + for ( i=0; i< 2*n; i+=4 ) + { +#ifdef IGNORE_TEMP_PERM + register __vector double vresult_r = vy[j]; + register __vector double vresult_i = vy[j+1]; + +#else + register __vector double vresult_r = {y[i],y[i+2]}; + register __vector double vresult_i = {y[i+1],y[i+3]}; +#endif + register __vector double va0_r= {a0[i],a0[i+2]}; + register __vector double va0_i= {a0[i+1],a0[i+3]}; + register __vector double va1_r= {a1[i],a1[i+2]}; + register __vector double va1_i= {a1[i+1],a1[i+3]}; +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + vresult_r = va0_r * vx0_r - (va0_i*vx0_i 
-vresult_r) ; + vresult_i = vresult_i + va0_r * vx0_i + va0_i * vx0_r ; + vresult_r = va1_r * vx1_r - (va1_i*vx1_i -vresult_r) ; + vresult_i = vresult_i + va1_r * vx1_i + va1_i * vx1_r ; + +#else + vresult_r = vresult_r + va0_r * vx0_r + va0_i*vx0_i ; + vresult_i = va0_r * vx0_i - ( va0_i * vx0_r - vresult_i) ; + vresult_r = vresult_r + va1_r * vx1_r + va1_i*vx1_i ; + vresult_i = va1_r * vx1_i - ( va1_i * vx1_r - vresult_i) ; +#endif + +#ifdef IGNORE_TEMP_PERM + vy[j] = vresult_r ; + vy[j+1] = vresult_i ; + j+=2; + +#else + y[i] = vresult_r[0]; + y[i+1] = vresult_i[0]; + y[i +2 ] = vresult_r[1]; + y[i + 3 ] = vresult_i[1]; +#endif + + } +} + +#else +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; +#endif + } +} + +#endif + + + + +#ifdef HAVE_KERNEL_4x1_VEC + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + register __vector double vx_r = {x[0],x[0]}; + register __vector double vx_i = {x[1],x[1]}; + +#ifdef IGNORE_TEMP_PERM + register __vector double *vy = (__vector double *)y; + register BLASLONG j=0; +#endif + + for ( i=0; i< 2*n; i+=4 ) + { +#ifdef IGNORE_TEMP_PERM + register __vector double vresult_r = vy[j]; + register __vector double vresult_i = vy[j+1]; + +#else + register __vector double vresult_r = {y[i],y[i+2]}; + register __vector double vresult_i = {y[i+1],y[i+3]}; +#endif + register __vector double va0_r= {a0[i],a0[i+2]}; + register __vector double va0_i= 
{a0[i+1],a0[i+3]}; +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + vresult_r = va0_r * vx_r - (va0_i*vx_i -vresult_r) ; + vresult_i = vresult_i + va0_r * vx_i + va0_i * vx_r ; + +#else + vresult_r = vresult_r + va0_r * vx_r + va0_i*vx_i ; + vresult_i = va0_r * vx_i - ( va0_i * vx_r - vresult_i) ; + +// y[i] += a0[i]*x[0] + a0[i+1] * x[1]; +// y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif +#ifndef IGNORE_TEMP_PERM + y[i] = vresult_r[0]; + y[i+1] = vresult_i[0]; + y[i +2 ] = vresult_r[1]; + y[i + 3 ] = vresult_i[1]; +#else + vy[j] = vresult_r ; + vy[j+1] = vresult_i ; + j+=2; +#endif + + } +} + +#else + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif + + } +} + + +#endif + + +#ifdef HAVE_KERNEL_ADDY +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; +#ifdef IGNORE_TEMP_PERM + register __vector double *src_vec = (__vector double *)src; +#endif + register __vector double valpha_r = {alpha_r,alpha_r}; + register __vector double valpha_i = {alpha_i,alpha_i}; + register __vector double vresult_r; + register __vector double vresult_i; + if ( inc_dest != 2 ) + { + + + for ( i=0; i +#endif + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + register __vector double vtemp0_r = {0.0,0.0}; + register __vector double vtemp0_i = {0.0,0.0}; + register __vector double vtemp1_r = {0.0,0.0}; + register __vector double vtemp1_i = 
{0.0,0.0}; + register __vector double vtemp2_r = {0.0,0.0}; + register __vector double vtemp2_i = {0.0,0.0}; + register __vector double vtemp3_r = {0.0,0.0}; + register __vector double vtemp3_i = {0.0,0.0}; + for ( i=0; i< 2*n; i+=4 ) + { + register __vector double vx_r = {x[i],x[i+2]}; + register __vector double vx_i = {x[i+1],x[i+3]}; + register __vector double va0_r= {a0[i],a0[i+2]}; + register __vector double va0_i= {a0[i+1],a0[i+3]}; + register __vector double va1_r= {a1[i],a1[i+2]}; + register __vector double va1_i= {a1[i+1],a1[i+3]}; + register __vector double va2_r= {a2[i],a2[i+2]}; + register __vector double va2_i= {a2[i+1],a2[i+3]}; + register __vector double va3_r= {a3[i],a3[i+2]}; + register __vector double va3_i= {a3[i+1],a3[i+3]}; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + vtemp0_r = va0_r * vx_r - (va0_i*vx_i -vtemp0_r) ; + vtemp0_i = vtemp0_i + va0_r * vx_i + va0_i * vx_r ; + vtemp1_r = va1_r * vx_r - (va1_i*vx_i -vtemp1_r) ; + vtemp1_i = vtemp1_i + va1_r * vx_i + va1_i * vx_r ; + + vtemp2_r = va2_r * vx_r - (va2_i*vx_i -vtemp2_r) ; + vtemp2_i = vtemp2_i + va2_r * vx_i + va2_i * vx_r ; + vtemp3_r = va3_r * vx_r - (va3_i*vx_i -vtemp3_r) ; + vtemp3_i = vtemp3_i + va3_r * vx_i + va3_i * vx_r ; +#else + + + vtemp0_r = vtemp0_r + va0_r * vx_r + va0_i*vx_i ; + vtemp0_i = va0_r * vx_i - ( va0_i * vx_r - vtemp0_i) ; + vtemp1_r = vtemp1_r + va1_r * vx_r + va1_i*vx_i ; + vtemp1_i = va1_r * vx_i - ( va1_i * vx_r - vtemp1_i); + vtemp2_r = vtemp2_r + va2_r * vx_r + va2_i*vx_i ; + vtemp2_i = va2_r * vx_i - ( va2_i * vx_r - vtemp2_i) ; + vtemp3_r = vtemp3_r + va3_r * vx_r + va3_i*vx_i ; + vtemp3_i = va3_r * vx_i - ( va3_i * vx_r - vtemp3_i); +#endif + } + + register FLOAT alpha_r = alpha[0] ; + register FLOAT alpha_i = alpha[1] ; + register FLOAT temp_r0 = vtemp0_r[0]+vtemp0_r[1] ; + register FLOAT temp_i0 = vtemp0_i[0]+vtemp0_i[1] ; + register FLOAT temp_r1 = vtemp1_r[0]+vtemp1_r[1] ; + register FLOAT temp_i1 = 
vtemp1_i[0]+vtemp1_i[1] ; + + register FLOAT temp_r2 = vtemp2_r[0]+vtemp2_r[1] ; + register FLOAT temp_i2 = vtemp2_i[0]+vtemp2_i[1] ; + register FLOAT temp_r3 = vtemp3_r[0]+vtemp3_r[1] ; + register FLOAT temp_i3 = vtemp3_i[0]+vtemp3_i[1] ; + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif +} + +#else + +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + 
a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif +} + +#endif + +#ifdef HAVE_KERNEL_4x2_VEC + + +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + register __vector double vtemp0_r = {0.0,0.0}; + register __vector double vtemp0_i = {0.0,0.0}; + register __vector double vtemp1_r = {0.0,0.0}; + register __vector double vtemp1_i = {0.0,0.0}; + for ( i=0; i< 2*n; i+=4 ) + { + register __vector double vx_r = {x[i],x[i+2]}; + register __vector double vx_i = {x[i+1],x[i+3]}; + register __vector double va0_r= {a0[i],a0[i+2]}; + register __vector double va0_i= {a0[i+1],a0[i+3]}; + register __vector double va1_r= {a1[i],a1[i+2]}; + register __vector double va1_i= {a1[i+1],a1[i+3]}; + + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + vtemp0_r = va0_r * 
vx_r - (va0_i*vx_i -vtemp0_r) ; + vtemp0_i = vtemp0_i + va0_r * vx_i + va0_i * vx_r ; + vtemp1_r = va1_r * vx_r - (va1_i*vx_i -vtemp1_r) ; + vtemp1_i = vtemp1_i + va1_r * vx_i + va1_i * vx_r ; +#else + vtemp0_r = vtemp0_r + va0_r * vx_r + va0_i*vx_i ; + vtemp0_i = va0_r * vx_i - ( va0_i * vx_r - vtemp0_i) ; + vtemp1_r = vtemp1_r + va1_r * vx_r + va1_i*vx_i ; + vtemp1_i = va1_r * vx_i - ( va1_i * vx_r - vtemp1_i); +#endif + } + + register FLOAT temp_r0 = vtemp0_r[0]+vtemp0_r[1] ; + register FLOAT temp_i0 = vtemp0_i[0]+vtemp0_i[1] ; + register FLOAT temp_r1 = vtemp1_r[0]+vtemp1_r[1] ; + register FLOAT temp_i1 = vtemp1_i[0]+vtemp1_i[1] ; + register FLOAT alpha_r = alpha[0] ; + register FLOAT alpha_i = alpha[1] ; + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif +} + +#else + +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - 
alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif +} + +#endif + + +#ifdef HAVE_KERNEL_4x1_VEC +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + + register __vector double vtemp_r = {0.0,0.0}; + register __vector double vtemp_i = {0.0,0.0}; + + for ( i=0; i< 2*n; i+=4 ) + { + register __vector double va0_r= {a0[i],a0[i+2]}; + register __vector double va0_i= {a0[i+1],a0[i+3]}; + register __vector double vx0_r = {x[i],x[i+2]}; + register __vector double vx0_i = {x[i+1],x[i+3]}; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + vtemp_r = va0_r * vx0_r - (va0_i*vx0_i -vtemp_r) ; + vtemp_i = vtemp_i + va0_r * vx0_i + va0_i * vx0_r ; +#else + vtemp_r = vtemp_r + va0_r * vx0_r + va0_i*vx0_i ; + vtemp_i = va0_r * vx0_i - ( va0_i * vx0_r - vtemp_i) ; +#endif + } + + register FLOAT temp_r0 = vtemp_r[0]+vtemp_r[1] ; + register FLOAT temp_i0 = vtemp_i[0]+vtemp_i[1] ; + register FLOAT alpha_r = alpha[0] ; + register FLOAT alpha_i = alpha[1] ; + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +#else + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +#endif + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + a_ptr += lda; + y_ptr += 2; + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,64); + zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + 
memset(ybuffer,0,64); + zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); + + x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; + + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; 
+ a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return(0); + } + + + if ( m3 == 1 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) 
&& !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + return(0); + + +} + + diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c new file mode 100644 index 000000000..dd5574850 --- /dev/null +++ b/kernel/zarch/zrot.c @@ -0,0 +1,276 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "pfd 2, 0(%1) \n\t" + "pfd 2, 0(%2) \n\t" + + "vlrepg %%v0,0(%3) \n\t" + "vlrepg %%v1,0(%4) \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%1) \n\t" + "pfd 2, 256(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27,112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19,112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb 
%%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst 
%%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + + + "la %%r1,256(%%r1) \n\t" + "brctg %%r0,1b" + : + : "r"(n), "a"(x), "a"(y),"a"(c),"a"(s) + : "cc", "memory","r0","r1" ,"v0","v1","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + +} + + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + zrot_kernel_16(n1, x, y, &cosa, 
&sina); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c new file mode 100644 index 000000000..b46f925bb --- /dev/null +++ b/kernel/zarch/zscal.c @@ -0,0 +1,483 @@ +/*************************************************************************** +Copyright (c) 2013 - 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" + + + +static void __attribute__ ((noinline)) zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__( + + "pfd 1, 0(%1) \n\t" + "sllg %%r0,%0,4 \n\t" + "agr %%r0,%2 \n\t" + "vlrepg %%v24,0(%1) \n\t" + "vlrepg %%v25,8(%1) \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%2 ) \n\t" + + "vleg %%v20 , 0(%2),0 \n\t" + "vleg %%v21 , 8(%2),0 \n\t" + "vleg %%v20 , 16(%2),1 \n\t" + "vleg %%v21 , 24(%2),1 \n\t" + + "vleg %%v22 , 32(%2),0 \n\t" + "vleg %%v23 , 40(%2),0 \n\t" + "vleg %%v22 , 48(%2),1 \n\t" + "vleg %%v23 , 56(%2),1 \n\t" + + "vfmdb %%v16, %%v21, %%v25 \n\t" + "vfmdb %%v17, %%v20, %%v25 \n\t" + "vfmdb %%v18, %%v23, %%v25 \n\t" + "vfmdb %%v19, %%v22, %%v25 \n\t" + + "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" + "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" + "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" + "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" + + "vsteg %%v16 , 0(%2),0 \n\t" + "vsteg %%v17 , 8(%2),0 \n\t" + "vsteg %%v16 , 16(%2),1 \n\t" + "vsteg %%v17 , 24(%2),1 \n\t" + + "vsteg %%v18 , 32(%2),0 \n\t" + "vsteg %%v19 , 40(%2),0 \n\t" + "vsteg %%v18 , 48(%2),1 \n\t" + "vsteg %%v19 , 56(%2),1 \n\t" + + "vleg %%v20 , 64(%2),0 \n\t" + "vleg %%v21 , 72(%2),0 \n\t" + "vleg %%v20 , 80(%2),1 \n\t" + "vleg %%v21 , 88(%2),1 \n\t" + + "vleg %%v22 , 96(%2),0 \n\t" + "vleg %%v23 , 104(%2),0 \n\t" + "vleg %%v22 , 112(%2),1 \n\t" + "vleg %%v23 , 120(%2),1 \n\t" + + 
"vfmdb %%v16, %%v21, %%v25 \n\t" + "vfmdb %%v17, %%v20, %%v25 \n\t" + "vfmdb %%v18, %%v23, %%v25 \n\t" + "vfmdb %%v19, %%v22, %%v25 \n\t" + + "vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t" + "vfmadb %%v17, %%v21, %%v24, %%v17 \n\t" + "vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t" + "vfmadb %%v19, %%v23, %%v24, %%v19 \n\t" + + "vsteg %%v16 , 64(%2),0 \n\t" + "vsteg %%v17 , 72(%2),0 \n\t" + "vsteg %%v16 , 80(%2),1 \n\t" + "vsteg %%v17 , 88(%2),1 \n\t" + + "vsteg %%v18 , 96(%2),0 \n\t" + "vsteg %%v19 , 104(%2),0 \n\t" + "vsteg %%v18 , 112(%2),1 \n\t" + "vsteg %%v19 , 120(%2),1 \n\t" + + "la %2,128(%2) \n\t" + "clgrjl %2,%%r0,1b \n\t" + : + : "r"(n), "a"(alpha), "a"(x) + : "cc", "memory","r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25" + ); + + +} + +static void __attribute__ ((noinline)) zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + + __asm__ ( "pfd 2, 0(%1) \n\t" + "ld %%f0,8(%2) \n\t" + "lcdbr %%f1,%%f0 \n\t" + "lgdr %%r0,%%f1 \n\t" + "vlvgg %%v0,%%r0,1 \n\t" + "vlr %%v16,%%v0 \n\t" + "vlr %%v17 ,%%v0 \n\t" + "vlr %%v1,%%v0 \n\t" + "sllg %%r0,%0,4 \n\t" + "agr %%r0,%1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "vl %%v24, 0(%1) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vsteg %%v24, 0(%1),1 \n\t" + "vsteg %%v24, 8(%1),0 \n\t" + "vl %%v25, 16(%1) \n\t" + "vfmdb %%v25,%%v25,%%v1 \n\t" + "vsteg %%v25, 16(%1),1 \n\t" + "vsteg %%v25, 24(%1),0 \n\t" + "vl %%v26, 32(%1) \n\t" + "vfmdb %%v26,%%v26,%%v16 \n\t" + "vsteg %%v26, 32(%1),1 \n\t" + "vsteg %%v26, 40(%1),0 \n\t" + "vl %%v27, 48(%1) \n\t" + "vfmdb %%v27,%%v27,%%v17 \n\t" + "vsteg %%v27, 48(%1),1 \n\t" + "vsteg %%v27, 56(%1),0 \n\t" + "vl %%v28, 64(%1) \n\t" + "vfmdb %%v28,%%v28,%%v0 \n\t" + "vsteg %%v28, 64(%1),1 \n\t" + "vsteg %%v28, 72(%1),0 \n\t" + "vl %%v29, 80(%1) \n\t" + "vfmdb %%v29,%%v29,%%v1 \n\t" + "vsteg %%v29, 80(%1),1 \n\t" + "vsteg %%v29, 88(%1),0 \n\t" + "vl %%v30, 96(%1) \n\t" + "vfmdb %%v30,%%v30,%%v16 \n\t" + "vsteg %%v30, 96(%1),1 \n\t" + "vsteg %%v30, 104(%1),0 \n\t" + "vl %%v31, 
112(%1) \n\t" + "vfmdb %%v31,%%v31,%%v17 \n\t" + "vsteg %%v31, 112(%1),1 \n\t" + "vsteg %%v31, 120(%1),0 \n\t" + "la %1,128(%1) \n\t" + "clgrjl %1,%%r0,1b \n\t" + : + :"r"(n),"a"(x) ,"a"(alpha) + :"cc", "memory","r0","f0", "f1","v0","v1","v16","v17","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + +} + +static void __attribute__ ((noinline)) zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__ ("pfd 2, 0(%1) \n\t" + "vlrepg %%v18,0(%2) \n\t" + "vlr %%v19,%%v18 \n\t" + "vlr %%v16 ,%%v18 \n\t" + "vlr %%v17,%%v18 \n\t" + "sllg %%r0,%0,4 \n\t" + "agr %%r0,%1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "vl %%v24, 0(%1) \n\t" + "vfmdb %%v24,%%v24,%%v18 \n\t" + "vst %%v24, 0(%1) \n\t" + "vl %%v25, 16(%1) \n\t" + "vfmdb %%v25,%%v25,%%v19 \n\t" + "vst %%v25, 16(%1) \n\t" + "vl %%v26, 32(%1) \n\t" + "vfmdb %%v26,%%v26,%%v16 \n\t" + "vst %%v26, 32(%1) \n\t" + "vl %%v27, 48(%1) \n\t" + "vfmdb %%v27,%%v27,%%v17 \n\t" + "vst %%v27, 48(%1) \n\t" + "vl %%v28, 64(%1) \n\t" + "vfmdb %%v28,%%v28,%%v18 \n\t" + "vst %%v28, 64(%1) \n\t" + "vl %%v29, 80(%1) \n\t" + "vfmdb %%v29,%%v29,%%v19 \n\t" + "vst %%v29, 80(%1) \n\t" + "vl %%v30, 96(%1) \n\t" + "vfmdb %%v30,%%v30,%%v16 \n\t" + "vst %%v30, 96(%1) \n\t" + "vl %%v31, 112(%1) \n\t" + "vfmdb %%v31,%%v31,%%v17 \n\t" + "vst %%v31, 112(%1) \n\t" + "la %1,128(%1) \n\t" + "clgrjl %1,%%r0,1b \n\t" + : + :"r"(n),"a"(x) ,"a"(alpha) + :"cc", "memory","r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" + ); + +} + +static void __attribute__ ((noinline)) zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { + + __asm__ ( "pfd 2, 0(%1) \n\t" + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "sllg %%r0,%0,4 \n\t" + "agr %%r0,%1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256( %1) \n\t" + "vst %%v24, 0( %1) \n\t" + "vst %%v25, 16( %1) \n\t" + "vst %%v26, 32( %1) \n\t" + "vst %%v27, 48( %1) \n\t" + "vst %%v24, 64( %1) \n\t" + "vst %%v25, 80( %1) \n\t" + "vst %%v26, 96( %1) \n\t" 
+ "vst %%v27,112( %1) \n\t" + + "la %1,128(%1) \n\t" + "clgrjl %1,%%r0,1b \n\t" + : + :"r"(n),"a"(x) + :"cc" , "memory" ,"r0","v24","v25","v26","v27" + ); + +} + + + +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); + +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) { + + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + + } + + +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2]; + + if (inc_x != 1) { + inc_x <<= 1; + + if (da_r == 0.0) { + + BLASLONG n1 = n & -2; + + if (da_i == 0.0) { + + while (j < n1) { + + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; + + } + + } else { + + while (j < n1) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + + + } + + } else { + 
+ + if (da_i == 0.0) { + BLASLONG n1 = n & -2; + + while (j < n1) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } else { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } + + } + + return (0); + } + + + BLASLONG n1 = n & -8; + if (n1 > 0) { + + alpha[0] = da_r; + alpha[1] = da_i; + + if (da_r == 0.0) + if (da_i == 0) + zscal_kernel_8_zero(n1, x); + else + zscal_kernel_8_zero_r(n1, alpha, x); + else + if (da_i == 0) + zscal_kernel_8_zero_i(n1, alpha, x); + else + zscal_kernel_8(n1, alpha, x); + + i = n1 << 1; + j = n1; + } + + + if (da_r == 0.0) { + + if (da_i == 0.0) { + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } else { + + if (da_i == 0.0) { + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } + + return (0); +} + + diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c new file mode 100644 index 000000000..8ed13e98d --- /dev/null +++ b/kernel/zarch/zswap.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 2, 0(%1) \n\t" + "pfd 2, 0(%2) \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%1) \n\t" + "pfd 2, 256(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) 
\n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + + "la %%r1,256(%%r1) \n\t" + "brctg %%r0,1b" + : + : "r"(n), "a"(x), "a"(y) + :"cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + +} + + + + + + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + zswap_kernel_16(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + +