From 8c3386be8780bdf631ffebe085fde2591d4cd062 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 16 Jan 2019 15:16:21 +0000 Subject: [PATCH] Added missing Blas1 single fp {saxpy, caxpy, cdot, crot(refactored version of srot),isamax ,isamin, icamax, icamin}, Fixed idamin,icamin choosing the first occurance index of equal minimals --- kernel/power/KERNEL.POWER8 | 20 +-- kernel/power/caxpy.c | 145 +++++++++++++++++++ kernel/power/cdot.c | 164 +++++++++++++++++++++ kernel/power/crot.c | 213 +++++++++++++++++++++++++++ kernel/power/icamax.c | 261 +++++++++++++++++++++++++++++++++ kernel/power/icamin.c | 266 ++++++++++++++++++++++++++++++++++ kernel/power/idamin.c | 50 +++---- kernel/power/isamax.c | 288 +++++++++++++++++++++++++++++++++++++ kernel/power/isamin.c | 288 +++++++++++++++++++++++++++++++++++++ kernel/power/izamin.c | 26 ++-- kernel/power/saxpy.c | 129 +++++++++++++++++ 11 files changed, 1802 insertions(+), 48 deletions(-) create mode 100644 kernel/power/caxpy.c create mode 100644 kernel/power/cdot.c create mode 100644 kernel/power/crot.c create mode 100644 kernel/power/icamax.c create mode 100644 kernel/power/icamin.c create mode 100644 kernel/power/isamax.c create mode 100644 kernel/power/isamin.c create mode 100644 kernel/power/saxpy.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 00ff8682a..cbcffb8fe 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -#ISAMAXKERNEL = ../arm/iamax.c +ISAMAXKERNEL = isamax.c IDAMAXKERNEL = idamax.c -#ICAMAXKERNEL = ../arm/izamax.c -IZAMAXKERNEL = izamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c # -#ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = idamin.c -#ICAMINKERNEL = ../arm/izamin.c +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # -#SAXPYKERNEL = ../arm/axpy.c +SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -#CAXPYKERNEL = ../arm/zaxpy.c +CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c @@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -#CDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = ../arm/zrot.c +CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c diff --git a/kernel/power/caxpy.c b/kernel/power/caxpy.c new file mode 100644 index 000000000..4bdf13c34 --- /dev/null +++ b/kernel/power/caxpy.c @@ -0,0 +1,145 @@ +/* +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#ifndef HAVE_ASM_KERNEL +#include +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) +{ + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r}; + register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i}; + +#else + register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r}; + register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; +#endif + + __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + register __vector float *vy = (__vector float *) y; + register __vector float *vx = (__vector float *) x; + BLASLONG i=0; + for (; i < n/2; i += 8) { + + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float vy_2 = vy[i + 2]; + register __vector float vy_3 = vy[i + 3]; + register __vector float vy_4 = vy[i + 4]; + register __vector float vy_5 = vy[i + 5]; + register __vector float vy_6 = vy[i + 6]; + register __vector float vy_7 = vy[i + 7]; + register __vector float vx_0 = vx[i]; + register __vector float vx_1 = vx[i + 1]; + register __vector float vx_2 = vx[i + 2]; + register __vector float vx_3 = vx[i + 3]; + register __vector float vx_4 = vx[i + 4]; + register __vector float vx_5 = vx[i + 5]; + register __vector float vx_6 = vx[i + 6]; + register __vector float vx_7 = vx[i + 7]; + vy_0 += vx_0*valpha_r; + vy_1 += vx_1*valpha_r; + vy_2 += vx_2*valpha_r; + vy_3 += vx_3*valpha_r; + vy_4 += vx_4*valpha_r; + vy_5 += vx_5*valpha_r; + vy_6 += vx_6*valpha_r; + vy_7 += vx_7*valpha_r; + vx_0 = vec_perm(vx_0, vx_0, swap_mask); + vx_1 = vec_perm(vx_1, vx_1, swap_mask); + vx_2 = vec_perm(vx_2, vx_2, swap_mask); + vx_3 = vec_perm(vx_3, vx_3, swap_mask); + vx_4 = vec_perm(vx_4, vx_4, swap_mask); + vx_5 = vec_perm(vx_5, vx_5, swap_mask); + vx_6 = vec_perm(vx_6, vx_6, swap_mask); + vx_7 = vec_perm(vx_7, vx_7, swap_mask); + vy_0 += vx_0*valpha_i; + vy_1 += vx_1*valpha_i; + vy_2 += vx_2*valpha_i; + vy_3 += vx_3*valpha_i; + vy_4 += vx_4*valpha_i; + vy_5 += vx_5*valpha_i; + vy_6 += vx_6*valpha_i; + vy_7 += vx_7*valpha_i; + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + vy[i + 4] = vy_4; + vy[i + 5] = vy_5 ; + vy[i + 6] = vy_6 ; + vy[i + 7] = vy_7 ; + + } +} +#endif +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + if (n <= 0) return (0); + if ((inc_x == 1) && (inc_y == 1)) { + BLASLONG n1 = n & -16; + if (n1) { + caxpy_kernel_16(n1, x, y, da_r,da_i); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + } + return (0); + + } + inc_x *= 2; + inc_y *= 2; + while (i < n) { +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} + diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c new file mode 100644 index 000000000..f86a33f22 --- /dev/null +++ b/kernel/power/cdot.c @@ -0,0 +1,164 @@ +/*Copyright (c) 2013-201\n8, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" + +#ifndef HAVE_KERNEL_8 +#include +static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) +{ + __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + register __vector float *vy = (__vector float *) y; + register __vector float *vx = (__vector float *) x; + BLASLONG i = 0; + register __vector float vd_0 = { 0 }; + register __vector float vd_1 = { 0 }; + register __vector float vd_2 = { 0 }; + register __vector float vd_3 = { 0 }; + register __vector float vdd_0 = { 0 }; + register __vector float vdd_1 = { 0 }; + register __vector float vdd_2 = { 0 }; + register __vector float vdd_3 = { 0 }; + for (; i < n/2; i += 4) { + + register __vector float vyy_0 ; + register __vector float vyy_1 ; + register __vector float vyy_2 ; + register __vector float vyy_3 ; + + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float vy_2 = vy[i + 2]; + register __vector float vy_3 = vy[i + 3]; + register __vector float vx_0= vx[i]; + register __vector float vx_1 = vx[i + 1]; + register __vector float vx_2 = vx[i + 2]; + register __vector float vx_3 = vx[i + 3]; + vyy_0 = vec_perm(vy_0, vy_0, swap_mask); + vyy_1 = vec_perm(vy_1, vy_1, swap_mask); + vyy_2 = vec_perm(vy_2, vy_2, swap_mask); + vyy_3 = vec_perm(vy_3, vy_3, swap_mask); + + vd_0 += vx_0 * vy_0; + vd_1 += vx_1 * vy_1; + vd_2 += vx_2 * vy_2; + vd_3 += vx_3 * vy_3; + + vdd_0 += vx_0 * vyy_0; + vdd_1 += vx_1 * vyy_1; + vdd_2 += vx_2 * vyy_2; + vdd_3 += vx_3 * vyy_3; + + + } + //aggregate + vd_0 = vd_0 + vd_1 +vd_2 +vd_3; + vdd_0= vdd_0 + vdd_1 +vdd_2 +vdd_3; + //reverse and aggregate + vd_1=vec_xxpermdi(vd_0,vd_0,2) ; + vdd_1=vec_xxpermdi(vdd_0,vdd_0,2); + vd_2=vd_0+vd_1; + vdd_2=vdd_0+vdd_1; + + dot[0]=vd_2[0]; + dot[1]=vd_2[1]; + dot[2]=vdd_2[0]; + dot[3]=vdd_2[1]; + +} +#endif + + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix=0, iy=0; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -8; + BLASLONG j=0; + + if (n1){ + cdot_kernel_8(n1, x, y, dot); + i = n1; + j = n1 <<1; + } + + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} diff --git a/kernel/power/crot.c b/kernel/power/crot.c new file mode 100644 index 000000000..7e04a09e8 --- /dev/null +++ b/kernel/power/crot.c @@ -0,0 +1,213 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. %[temp_n], %[temp_n], -16 \n\t" + "ble 2f \n\t" + ".p2align 5 \n\t" + "1: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. %[temp_n], %[temp_n], -16 \n\t" + "bgt 1b \n\t" + "2: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT temp; + if ( n <= 0 ) return(0); + if ( (inc_x == 1) && (inc_y == 1) ) + { + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x1, y1, c, s); + i=n1; + } + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + i++ ; + } + + } + else + { + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + return(0); +} + diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c new file mode 100644 index 000000000..aa0531dc6 --- /dev/null +++ b/kernel/power/icamax.c @@ -0,0 +1,261 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c new file mode 100644 index 000000000..36432c993 --- /dev/null +++ b/kernel/power/icamin.c @@ -0,0 +1,266 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index f4d1d1bdb..7fe0f8a33 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -89,10 +89,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { ".p2align 5 \n\t" "1: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -103,8 +103,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -125,7 +125,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -139,7 +139,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -162,10 +162,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //<-----------jump here from first load "2: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -176,8 +176,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" "xxsel 0 ,0,1,2 \n\t" "xxsel 34,34,35,3 \n\t" @@ -194,7 +194,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -210,7 +210,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -238,10 +238,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //============================================================================== - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -252,8 +252,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" @@ -264,14 +264,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" "vaddudm 1,1,5 \n\t" // get real index for first smaller //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -284,7 +284,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c new file mode 100644 index 000000000..bf1af78d6 --- /dev/null +++ b/kernel/power/isamax.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c new file mode 100644 index 000000000..1c1f0ad78 --- /dev/null +++ b/kernel/power/isamin.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 448247ffd..1ffa3ba8b 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -101,8 +101,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -114,7 +114,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -126,7 +126,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -166,8 +166,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -179,7 +179,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -191,7 +191,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -235,15 +235,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "xxsel 32,40,41,50 \n\t" "xxsel 0,46,47,50 \n\t" "xxsel 33,42,43,51 \n\t" "xxsel 1,48,49,51 \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "xxsel 32,32,33,2 \n\t" "xxsel 3,0,1,2 \n\t" @@ -252,7 +252,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -267,7 +267,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c new file mode 100644 index 000000000..393cdfadc --- /dev/null +++ b/kernel/power/saxpy.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + + + +#ifndef HAVE_KERNEL_8 +#include + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG i = 0; + __vector float v_a = {alpha,alpha,alpha,alpha}; + __vector float * v_y=(__vector float *)y; + __vector float * v_x=(__vector float *)x; + + for(; i