Optimized standard Blas Level-1,2 (excluding nrm2 functions) for z13 (double precision)

This commit is contained in:
Abdurrauf 2017-04-08 21:51:15 +04:00
parent 08786c4b95
commit 1cfdb2295d
26 changed files with 7230 additions and 26 deletions

View File

@ -166,5 +166,5 @@ In chronological order:
* [2017-01-01] dgemm and dtrmm kernels for IBM z13
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13

View File

@ -107,7 +107,7 @@ Please read GotoBLAS_01Readme.txt
- **ARM Cortex-A57**: Experimental
#### IBM zEnterprise System:
- **Z13**: Optimized Level-3 BLAS
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
### Support OS:

View File

@ -40,8 +40,12 @@
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#if defined(Z13)
#define MULTI_THREAD_MINIMAL 200000
#else
#define MULTI_THREAD_MINIMAL 10000
#endif
#ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@ -88,7 +92,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (n <= 10000)
if (n <= MULTI_THREAD_MINIMAL)
nthreads = 1;
if (nthreads == 1) {

View File

@ -15,14 +15,14 @@ SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = idamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = idamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
@ -31,24 +31,24 @@ ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
DASUMKERNEL = dasum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = zasum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = zaxpy.c
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = dcopy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = zdot.c
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
@ -56,29 +56,29 @@ CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
DROTKERNEL = drot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
ZROTKERNEL = zrot.c
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
DSCALKERNEL = dscal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = zscal.c
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = dswap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = dgemv_n_4.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = zgemv_n_4.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = dgemv_t_4.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = zgemv_t_4.c
STRMMKERNEL = strmm8x4V.S
DTRMMKERNEL = trmm8x4V.S

159
kernel/zarch/dasum.c Normal file
View File

@ -0,0 +1,159 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Sum of absolute values of n contiguous elements starting at x, using the
 * z13 vector facility.  The assembly works on 8-byte elements (the byte count
 * is n << 3), i.e. it is written for double precision.
 *
 * n must be a positive multiple of 32: every pass through the loop consumes
 * 256 bytes and the loop condition is only tested at the bottom, so the body
 * always executes at least once.
 *
 * The result is returned through an explicit output operand instead of
 * relying on whatever happens to be left in f0 when control falls off the end
 * of a non-void function, and x is declared read-write because the loop
 * advances it.
 */
static FLOAT __attribute__ ((noinline)) dasum_kernel_32(BLASLONG n, FLOAT *x) {
    FLOAT asum;

    __asm__ volatile (
        "pfd 1, 0(%[x]) \n\t"
        /* r0 = x + n * 8: address one past the last element */
        "sllg %%r0,%[n],3 \n\t"
        "agr %%r0,%[x] \n\t"
        /* four independent accumulators */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v2 \n\t"
        "vzero %%v3 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%[x]) \n\t"
        /* first 128 bytes of the 256-byte chunk */
        "vlm %%v24,%%v31, 0(%[x]) \n\t"
        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        "vfadb %%v0,%%v0,%%v24 \n\t"
        "vfadb %%v1,%%v1,%%v25 \n\t"
        "vfadb %%v2,%%v2,%%v26 \n\t"
        "vfadb %%v3,%%v3,%%v27 \n\t"
        "vfadb %%v0,%%v0,%%v28 \n\t"
        "vfadb %%v1,%%v1,%%v29 \n\t"
        "vfadb %%v2,%%v2,%%v30 \n\t"
        "vfadb %%v3,%%v3,%%v31 \n\t"
        /* second 128 bytes of the chunk */
        "vlm %%v24,%%v31, 128(%[x]) \n\t"
        "vflpdb %%v24, %%v24 \n\t"
        "vflpdb %%v25, %%v25 \n\t"
        "vflpdb %%v26, %%v26 \n\t"
        "vflpdb %%v27, %%v27 \n\t"
        "vflpdb %%v28, %%v28 \n\t"
        "vflpdb %%v29, %%v29 \n\t"
        "vflpdb %%v30, %%v30 \n\t"
        "vflpdb %%v31, %%v31 \n\t"
        "la %[x],256(%[x]) \n\t"
        "vfadb %%v0,%%v0,%%v24 \n\t"
        "vfadb %%v1,%%v1,%%v25 \n\t"
        "vfadb %%v2,%%v2,%%v26 \n\t"
        "vfadb %%v3,%%v3,%%v27 \n\t"
        "vfadb %%v0,%%v0,%%v28 \n\t"
        "vfadb %%v1,%%v1,%%v29 \n\t"
        "vfadb %%v2,%%v2,%%v30 \n\t"
        "vfadb %%v3,%%v3,%%v31 \n\t"
        "clgrjl %[x],%%r0,1b \n\t"
        /* combine the four accumulators, then fold the two lanes */
        "vfadb %%v24,%%v0,%%v1 \n\t"
        "vfadb %%v25,%%v2,%%v3 \n\t"
        "vfadb %%v0,%%v25,%%v24 \n\t"
        "vrepg %%v1,%%v0,1 \n\t"
        "adbr %%f0,%%f1 \n\t"
        /* hand the scalar result to the compiler-chosen output register */
        "ldr %[asum],%%f0 \n\t"
        : [asum] "=f"(asum), [x] "+&a"(x)
        : [n] "r"(n)
        : "cc", "memory","r0","f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return asum;
}
/*
 * asum entry point: returns the sum of |x[k * inc_x]| for k = 0 .. n-1.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the leading multiple
 * of 32 elements goes through dasum_kernel_32 and the remainder is summed in
 * scalar code; for other strides a 4-way unrolled scalar loop with two partial
 * sums is used.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT total = 0.0;
    BLASLONG pos = 0;
    BLASLONG seen = 0;
    BLASLONG bulk;

    if (n <= 0 || inc_x <= 0) {
        return total;
    }

    if (inc_x == 1) {
        /* vector kernel handles the largest multiple of 32 */
        bulk = n & -32;
        if (bulk > 0) {
            total = dasum_kernel_32(bulk, x);
            pos = bulk;
        }
        for (; pos < n; pos++) {
            total += ABS(x[pos]);
        }
        return total;
    }

    /* strided input: two accumulators, alternating between elements */
    bulk = n & -4;
    {
        register FLOAT part_a = 0.0;
        register FLOAT part_b = 0.0;

        for (; seen < bulk; seen += 4) {
            part_a += ABS(x[pos]);
            part_b += ABS(x[pos + inc_x]);
            part_a += ABS(x[pos + 2 * inc_x]);
            part_b += ABS(x[pos + 3 * inc_x]);
            pos += inc_x * 4;
        }
        total = part_a + part_b;
    }

    for (; seen < n; seen++) {
        total += ABS(x[pos]);
        pos += inc_x;
    }

    return total;
}

386
kernel/zarch/daxpy.c Normal file
View File

@ -0,0 +1,386 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define Z13_D 1
#define PREFETCH_INS 1
#if defined(Z13_A)
#include <vecintrin.h>
/*
 * y[k] += (*alpha) * x[k] for k = 0 .. n-1, written with compiler vector
 * types (two doubles per vector).  Compiled only when Z13_A is defined.
 *
 * Each outer step updates 16 vectors (32 doubles), so n is expected to be a
 * multiple of 32.
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    __vector double scale = {*alpha,*alpha};
    __vector double *dst = (__vector double *)y;
    __vector double *src = (__vector double *)x;
    const BLASLONG vec_total = n/2;
    BLASLONG base;
    BLASLONG lane;

    for (base = 0; base < vec_total; base += 16) {
        for (lane = 0; lane < 16; lane++) {
            dst[base + lane] += scale * src[base + lane];
        }
    }
}
#elif defined(Z13_B)
/*
 * y[k] += (*alpha) * x[k] for k = 0 .. n-1, z13 inline-assembly variant that
 * interleaves load / multiply-add / store one vector at a time.  Compiled
 * only when Z13_B is defined.
 *
 * n must be a positive multiple of 32: the loop runs n >> 5 times, copes with
 * 256 bytes of x and y per pass and is bottom-tested.
 *
 * The iteration counter lives in r0 (declared clobbered) rather than in the
 * register holding the alpha pointer, because that operand is input-only and
 * must not be overwritten.  v0 holds alpha replicated into both lanes.
 */
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    __asm__ volatile(
#if defined(PREFETCH_INS)
        "pfd 1, 0(%1) \n\t"
        "pfd 2, 0(%2) \n\t"
#endif
        "vlrepg %%v0 , 0(%3) \n\t"
        "srlg %%r0,%0,5 \n\t"            /* r0 = n / 32 iterations */
        "xgr %%r1,%%r1 \n\t"             /* r1 = running byte offset */
        ".align 16 \n\t"
        "1: \n\t"
#if defined(PREFETCH_INS)
        "pfd 1, 256(%%r1,%1) \n\t"
        "pfd 2, 256(%%r1,%2) \n\t"
#endif
        "vl %%v24, 0(%%r1,%2) \n\t"
        "vl %%v16, 0(%%r1,%1) \n\t"
        "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
        "vst %%v16, 0(%%r1,%2) \n\t"
        "vl %%v25, 16(%%r1,%2) \n\t"
        "vl %%v17, 16(%%r1,%1) \n\t"
        "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
        "vst %%v17, 16(%%r1,%2) \n\t"
        "vl %%v26, 32(%%r1,%2) \n\t"
        "vl %%v18, 32(%%r1,%1) \n\t"
        "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
        "vst %%v18, 32(%%r1,%2) \n\t"
        "vl %%v27, 48(%%r1,%2) \n\t"
        "vl %%v19, 48(%%r1,%1) \n\t"
        "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
        "vst %%v19, 48(%%r1,%2) \n\t"
        "vl %%v24,( 0+64)(%%r1,%2) \n\t"
        "vl %%v16,( 0+64)(%%r1,%1) \n\t"
        "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
        "vst %%v16,( 0+64)(%%r1,%2) \n\t"
        "vl %%v25, (16+64)(%%r1,%2) \n\t"
        "vl %%v17, (16+64)(%%r1,%1) \n\t"
        "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
        "vst %%v17, (16+64)(%%r1,%2) \n\t"
        "vl %%v26, (32+64)(%%r1,%2) \n\t"
        "vl %%v18, (32+64)(%%r1,%1) \n\t"
        "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
        "vst %%v18, (32+64)(%%r1,%2) \n\t"
        "vl %%v27, (48+64)(%%r1,%2) \n\t"
        "vl %%v19, (48+64)(%%r1,%1) \n\t"
        "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
        "vst %%v19, (48+64)(%%r1,%2) \n\t"
        "vl %%v24,( 0+128)(%%r1,%2) \n\t"
        "vl %%v16,( 0+128)(%%r1,%1) \n\t"
        "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
        "vst %%v16,( 0+128)(%%r1,%2) \n\t"
        "vl %%v25, (16+128)(%%r1,%2) \n\t"
        "vl %%v17, (16+128)(%%r1,%1) \n\t"
        "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
        "vst %%v17, (16+128)(%%r1,%2) \n\t"
        "vl %%v26, (32+128)(%%r1,%2) \n\t"
        "vl %%v18, (32+128)(%%r1,%1) \n\t"
        "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
        "vst %%v18, (32+128)(%%r1,%2) \n\t"
        "vl %%v27, (48+128)(%%r1,%2) \n\t"
        "vl %%v19, (48+128)(%%r1,%1) \n\t"
        "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
        "vst %%v19, (48+128)(%%r1,%2) \n\t"
        "vl %%v24,( 0+192)(%%r1,%2) \n\t"
        "vl %%v16,( 0+192)(%%r1,%1) \n\t"
        "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
        "vst %%v16,( 0+192)(%%r1,%2) \n\t"
        "vl %%v25, (16+192)(%%r1,%2) \n\t"
        "vl %%v17, (16+192)(%%r1,%1) \n\t"
        "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
        "vst %%v17, (16+192)(%%r1,%2) \n\t"
        "vl %%v26, (32+192)(%%r1,%2) \n\t"
        "vl %%v18, (32+192)(%%r1,%1) \n\t"
        "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
        "vst %%v18, (32+192)(%%r1,%2) \n\t"
        "vl %%v27, (48+192)(%%r1,%2) \n\t"
        "vl %%v19, (48+192)(%%r1,%1) \n\t"
        "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
        "vst %%v19, (48+192)(%%r1,%2) \n\t"
        "la %%r1,256(%%r1) \n\t"
        "brctg %%r0,1b"
        :
        :"r"(n),"a"(x),"a"(y),"a"(alpha)
        :"cc", "memory", "r0", "r1" ,"v0" ,"v16","v17","v18","v19", "v24","v25","v26","v27"
    );
}
#elif defined(Z13_C)
/*
 * y[k] += (*alpha) * x[k] for k = 0 .. n-1, z13 inline-assembly variant that
 * groups four loads of x, four loads of y, four multiply-adds and four
 * stores.  Compiled only when Z13_C is defined.
 *
 * n must be a positive multiple of 32: the loop runs n >> 5 times, handles
 * 256 bytes of x and y per pass and is bottom-tested.
 *
 * The iteration counter lives in r0 (declared clobbered) rather than in the
 * register holding the alpha pointer, because that operand is input-only and
 * must not be overwritten.  v0 and v1 both hold alpha replicated into both
 * lanes.
 */
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    __asm__ volatile(
#if defined(PREFETCH_INS)
        "pfd 1, 0(%1) \n\t"
        "pfd 2, 0(%2) \n\t"
#endif
        "vlrepg %%v0 , 0(%3) \n\t"
        "srlg %%r0,%0,5 \n\t"            /* r0 = n / 32 iterations */
        "xgr %%r1,%%r1 \n\t"             /* r1 = running byte offset */
        "vlr %%v1,%%v0 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
#if defined(PREFETCH_INS)
        "pfd 1, 256(%%r1,%1) \n\t"
        "pfd 2, 256(%%r1,%2) \n\t"
#endif
        "vl %%v16, 0(%%r1,%1) \n\t"
        "vl %%v17, 16(%%r1,%1) \n\t"
        "vl %%v18, 32(%%r1,%1) \n\t"
        "vl %%v19, 48(%%r1,%1) \n\t"
        "vl %%v24, 0(%%r1,%2) \n\t"
        "vl %%v25, 16(%%r1,%2) \n\t"
        "vl %%v26, 32(%%r1,%2) \n\t"
        "vl %%v27, 48(%%r1,%2) \n\t"
        "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
        "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
        "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
        "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
        "vst %%v16, 0(%%r1,%2) \n\t"
        "vst %%v17, 16(%%r1,%2) \n\t"
        "vst %%v18, 32(%%r1,%2) \n\t"
        "vst %%v19, 48(%%r1,%2) \n\t"
        "vl %%v24, 64(%%r1,%1) \n\t"
        "vl %%v25, 80(%%r1,%1) \n\t"
        "vl %%v26, 96(%%r1,%1) \n\t"
        "vl %%v27, 112(%%r1,%1) \n\t"
        "vl %%v16, 64(%%r1,%2) \n\t"
        "vl %%v17, 80(%%r1,%2) \n\t"
        "vl %%v18, 96(%%r1,%2) \n\t"
        "vl %%v19, 112(%%r1,%2) \n\t"
        "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
        "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
        "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
        "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
        "vst %%v24, 64(%%r1,%2) \n\t"
        "vst %%v25, 80(%%r1,%2) \n\t"
        "vst %%v26, 96(%%r1,%2) \n\t"
        "vst %%v27, 112(%%r1,%2) \n\t"
        "vl %%v16, (0+128)(%%r1,%1) \n\t"
        "vl %%v17, (16+128)(%%r1,%1) \n\t"
        "vl %%v18, (32+128)(%%r1,%1) \n\t"
        "vl %%v19, (48+128)(%%r1,%1) \n\t"
        "vl %%v24, (0+128)(%%r1,%2) \n\t"
        "vl %%v25, (16+128)(%%r1,%2) \n\t"
        "vl %%v26, (32+128)(%%r1,%2) \n\t"
        "vl %%v27, (48+128)(%%r1,%2) \n\t"
        "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
        "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
        "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
        "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
        "vst %%v16, (0+128)(%%r1,%2) \n\t"
        "vst %%v17, (16+128)(%%r1,%2) \n\t"
        "vst %%v18, (32+128)(%%r1,%2) \n\t"
        "vst %%v19, (48+128)(%%r1,%2) \n\t"
        "vl %%v24, (64+128)(%%r1,%1) \n\t"
        "vl %%v25, (80+128)(%%r1,%1) \n\t"
        "vl %%v26, (96+128)(%%r1,%1) \n\t"
        "vl %%v27, (112+128)(%%r1,%1) \n\t"
        "vl %%v16, (64+128)(%%r1,%2) \n\t"
        "vl %%v17, (80+128)(%%r1,%2) \n\t"
        "vl %%v18, (96+128)(%%r1,%2) \n\t"
        "vl %%v19, (112+128)(%%r1,%2) \n\t"
        "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
        "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
        "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
        "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
        "vst %%v24, (64+128)(%%r1,%2) \n\t"
        "vst %%v25, (80+128)(%%r1,%2) \n\t"
        "vst %%v26, (96+128)(%%r1,%2) \n\t"
        "vst %%v27, (112+128)(%%r1,%2) \n\t"
        "la %%r1,256(%%r1) \n\t"
        "brctg %%r0,1b"
        :
        :"r"(n),"a"(x),"a"(y),"a"(alpha)
        :"cc", "memory", "r0", "r1" ,"v0","v1","v16","v17","v18","v19", "v24","v25","v26","v27"
    );
}
#elif defined(Z13_D)
/*
 * y[k] += (*alpha) * x[k] for k = 0 .. n-1, z13 inline-assembly variant using
 * multi-register vector loads/stores (vlm / vstm).  This is the variant
 * selected by the "#define Z13_D 1" at the top of the file.
 *
 * n must be a positive multiple of 32: the loop runs n >> 5 times, handles
 * 256 bytes of x and y per pass and is bottom-tested.
 *
 * x and y are advanced inside the loop, so they are declared as read-write
 * operands; the iteration counter lives in r0 (declared clobbered) instead of
 * reusing the register that holds the alpha pointer.  Writing to input-only
 * operands, as the previous version did, is not permitted by GCC.
 */
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    __asm__ volatile(
#if defined(PREFETCH_INS)
        "pfd 1, 0(%[x]) \n\t"
        "pfd 2, 0(%[y]) \n\t"
#endif
        "vlrepg %%v0 , 0(%[alpha]) \n\t"  /* v0 = {alpha, alpha} */
        "srlg %%r0,%[n],5 \n\t"           /* r0 = n / 32 iterations */
        "vlr %%v1,%%v0 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
#if defined(PREFETCH_INS)
        "pfd 1, 256(%[x]) \n\t"
        "pfd 2, 256(%[y]) \n\t"
#endif
        /* first 128 bytes: x in v16-v23, y in v24-v31 */
        "vlm %%v16,%%v23, 0(%[x]) \n\t"
        "vlm %%v24, %%v31, 0(%[y]) \n\t"
        "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
        "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
        "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
        "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
        "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
        "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
        "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
        "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
        "vstm %%v16,%%v23, 0(%[y]) \n\t"
        /* second 128 bytes: register roles swapped (x in v24-v31) */
        "vlm %%v24,%%v31, 128(%[x]) \n\t"
        "vlm %%v16,%%v23, 128(%[y]) \n\t"
        "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
        "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
        "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
        "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
        "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
        "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
        "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
        "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
        "la %[x],256(%[x]) \n\t"
        "vstm %%v24, %%v31, 128(%[y]) \n\t"
        "la %[y],256(%[y]) \n\t"
        "brctg %%r0,1b"
        : [x] "+&a"(x), [y] "+&a"(y)
        : [n] "r"(n), [alpha] "a"(alpha)
        :"cc", "memory", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
        "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
#endif
/*
 * axpy entry point: y[k * inc_y] += da * x[k * inc_x] for k = 0 .. n-1.
 *
 * dummy0, dummy1, dummy and dummy2 are not used.  Always returns 0.
 *
 * With both strides equal to 1 the leading multiple of 32 elements is handed
 * to daxpy_kernel_32 and the rest is done in scalar code.  Otherwise a 4-way
 * unrolled scalar loop is used; all four products are formed before any
 * element of y is written.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG xi = 0;
    BLASLONG yi = 0;
    BLASLONG bulk;

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        bulk = n & -32;
        if (bulk != 0) {
            daxpy_kernel_32(bulk, x, y, &da);
        }
        for (k = bulk; k < n; k++) {
            y[k] += da * x[k];
        }
        return 0;
    }

    bulk = n & -4;
    for (k = 0; k < bulk; k += 4) {
        FLOAT p0 = da * x[xi];
        FLOAT p1 = da * x[xi + inc_x];
        FLOAT p2 = da * x[xi + 2 * inc_x];
        FLOAT p3 = da * x[xi + 3 * inc_x];

        y[yi] += p0;
        y[yi + inc_y] += p1;
        y[yi + 2 * inc_y] += p2;
        y[yi + 3 * inc_y] += p3;

        xi += inc_x * 4;
        yi += inc_y * 4;
    }

    for (; k < n; k++) {
        y[yi] += da * x[xi];
        xi += inc_x;
        yi += inc_y;
    }

    return 0;
}

169
kernel/zarch/dcopy.c Normal file
View File

@ -0,0 +1,169 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(Z13mvc)
/*
 * Copy n contiguous 8-byte elements from x to y, 256 bytes per "mvc"
 * instruction.  Compiled only when Z13mvc is defined.
 *
 * n must be a positive multiple of 32: the loop runs n >> 5 times and is
 * bottom-tested.
 *
 * x and y are advanced by the loop, so they are declared as read-write
 * operands rather than inputs (GCC does not allow input-only operands to be
 * modified).
 */
static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
    __asm__ volatile(
        "pfd 1, 0(%[x]) \n\t"
        "pfd 2, 0(%[y]) \n\t"
        "srlg %%r0,%[n],5 \n\t"           /* r0 = n / 32 iterations */
        ".align 16 \n\t"
        "1: \n\t"
        "mvc 0(256,%[y]),0(%[x]) \n\t"
        "la %[x],256(%[x]) \n\t"
        "la %[y],256(%[y]) \n\t"
        "brctg %%r0,1b"
        : [x] "+&a"(x), [y] "+&a"(y)
        : [n] "r"(n)
        : "cc", "memory","r0"
    );
    return;
}
#else
/*
 * Copy n contiguous 8-byte elements from x to y using 16-byte vector
 * load/store pairs.  This is the variant built when Z13mvc is not defined.
 *
 * The loop count is n >> 5 and the loop is bottom-tested, so n must be a
 * positive multiple of 32.  Each pass moves 16 vectors (256 bytes) and then
 * advances the byte offset kept in r1; x and y themselves are not modified.
 */
static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
/* r0 = number of 32-element chunks, r1 = byte offset (starts at 0) */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
/* sixteen load/store pairs, cycling through v24-v27 */
"vl %%v24, 0(%%r1,%1) \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vst %%v24, 192(%%r1,%2) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vst %%v25, 208(%%r1,%2) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vst %%v26, 224(%%r1,%2) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vst %%v27, 240(%%r1,%2) \n\t"
/* next 256-byte chunk; decrement r0 and loop while it is non-zero */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0","r1", "v24","v25","v26","v27"
);
return;
}
#endif
/*
 * copy entry point: y[k * inc_y] = x[k * inc_x] for k = 0 .. n-1.
 * Always returns 0; does nothing when n <= 0.
 *
 * With both strides equal to 1 the leading multiple of 32 elements is copied
 * by dcopy_kernel_32 and the remainder element by element.  Otherwise a 4-way
 * unrolled scalar loop followed by a tail loop is used.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG k = 0;
    BLASLONG src = 0;
    BLASLONG dst = 0;
    BLASLONG bulk;

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        bulk = n & -32;
        if (bulk > 0) {
            dcopy_kernel_32(bulk, x, y);
            k = bulk;
        }
        for (; k < n; k++) {
            y[k] = x[k];
        }
        return 0;
    }

    bulk = n & -4;
    for (; k < bulk; k += 4) {
        y[dst] = x[src];
        y[dst + inc_y] = x[src + inc_x];
        y[dst + 2 * inc_y] = x[src + 2 * inc_x];
        y[dst + 3 * inc_y] = x[src + 3 * inc_x];
        src += inc_x * 4;
        dst += inc_y * 4;
    }

    for (; k < n; k++) {
        y[dst] = x[src];
        src += inc_x;
        dst += inc_y;
    }

    return 0;
}

194
kernel/zarch/ddot.c Normal file
View File

@ -0,0 +1,194 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(Z13)
/*
 * Dot product of n contiguous 8-byte elements of x and y, z13 vector version.
 * The result is STORED to *d (the previous contents of *d are overwritten,
 * not accumulated into).
 *
 * Despite the "_8" in the name, the loop count is n >> 4 and each pass
 * consumes 128 bytes (16 elements) of x and of y; the loop is bottom-tested,
 * so n must be a positive multiple of 16.
 *
 * v1 is used for the final lane fold and is therefore listed as clobbered;
 * it was missing from the clobber list before.
 */
static void __attribute__ ((noinline)) ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
    __asm__ volatile(
        "pfd 1, 0(%1) \n\t"
        "pfd 1, 0(%2) \n\t"
        /* four independent accumulators */
        "vzero %%v24 \n\t"
        "vzero %%v25 \n\t"
        "vzero %%v26 \n\t"
        "vzero %%v27 \n\t"
        "srlg %%r0,%0,4 \n\t"            /* r0 = n / 16 iterations */
        "xgr %%r1,%%r1 \n\t"             /* r1 = running byte offset */
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%%r1,%1) \n\t"
        "pfd 1, 256(%%r1,%2) \n\t"
        "vl %%v16, 0(%%r1,%1) \n\t"
        "vl %%v17, 16(%%r1,%1) \n\t"
        "vl %%v18, 32(%%r1,%1) \n\t"
        "vl %%v19, 48(%%r1,%1) \n\t"
        "vl %%v28, 0(%%r1,%2) \n\t"
        "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
        "vl %%v29, 16(%%r1,%2) \n\t"
        "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
        "vl %%v30, 32(%%r1,%2) \n\t"
        "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
        "vl %%v31, 48(%%r1,%2) \n\t"
        "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
        "vl %%v16, 64(%%r1,%1) \n\t"
        "vl %%v17, 80(%%r1,%1) \n\t"
        "vl %%v18, 96(%%r1,%1) \n\t"
        "vl %%v19, 112(%%r1,%1) \n\t"
        "vl %%v28, 64(%%r1,%2) \n\t"
        "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
        "vl %%v29, 80(%%r1,%2) \n\t"
        "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
        "vl %%v30, 96(%%r1,%2) \n\t"
        "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
        "vl %%v31, 112(%%r1,%2) \n\t"
        "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
        "la %%r1,128(%%r1) \n\t"
        "brctg %%r0,1b \n\t"
        /* combine the accumulators, fold the two lanes, store the scalar */
        "vfadb %%v24,%%v25,%%v24 \n\t"
        "vfadb %%v24,%%v26,%%v24 \n\t"
        "vfadb %%v24,%%v27,%%v24 \n\t"
        "vrepg %%v1,%%v24,1 \n\t"
        "vfadb %%v1,%%v24,%%v1 \n\t"
        " std %%f1,0(%3) \n\t"
        :
        :"r"(n),"a"(x),"a"(y),"a"(d)
        :"cc" , "memory" ,"r0","r1","v1","v16", "v17","v18","v19","v20","v21","v22","v23",
        "v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
#else
/*
 * Portable dot-product kernel used when Z13 is not defined.
 * Adds the dot product of x[0..n-1] and y[0..n-1] to *d.
 * Elements are consumed eight at a time, so n is expected to be a multiple
 * of 8.
 */
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
    register BLASLONG k;
    FLOAT acc = 0.0;

    for (k = 0; k < n; k += 8) {
        acc += y[k] * x[k]
             + y[k+1] * x[k+1]
             + y[k+2] * x[k+2]
             + y[k+3] * x[k+3]
             + y[k+4] * x[k+4]
             + y[k+5] * x[k+5]
             + y[k+6] * x[k+6]
             + y[k+7] * x[k+7];
    }

    *d += acc;
}
#endif
/*
 * dot entry point: returns the sum over k = 0 .. n-1 of
 * y[k * inc_y] * x[k * inc_x]; returns 0 when n <= 0.
 *
 * With both strides equal to 1 the leading multiple of 16 elements goes
 * through ddot_kernel_8 (result delivered via a pointer to the local
 * accumulator) and the remainder is added in scalar code.  Otherwise a 4-way
 * unrolled loop feeds two partial sums that are added at the end.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    FLOAT dot = 0.0;
    FLOAT part_a = 0.0;
    FLOAT part_b = 0.0;
    BLASLONG k;
    BLASLONG xi = 0;
    BLASLONG yi = 0;
    BLASLONG bulk;

    if (n <= 0) {
        return dot;
    }

    if (inc_x == 1 && inc_y == 1) {
        bulk = n & -16;
        if (bulk != 0) {
            ddot_kernel_8(bulk, x, y, &dot);
        }
        for (k = bulk; k < n; k++) {
            dot += y[k] * x[k];
        }
        return dot;
    }

    bulk = n & -4;
    for (k = 0; k < bulk; k += 4) {
        FLOAT p0 = y[yi] * x[xi];
        FLOAT p1 = y[yi + inc_y] * x[xi + inc_x];
        FLOAT p2 = y[yi + 2 * inc_y] * x[xi + 2 * inc_x];
        FLOAT p3 = y[yi + 3 * inc_y] * x[xi + 3 * inc_x];

        xi += inc_x * 4;
        yi += inc_y * 4;

        part_a += p0 + p2;
        part_b += p1 + p3;
    }

    for (; k < n; k++) {
        part_a += y[yi] * x[xi];
        xi += inc_x;
        yi += inc_y;
    }

    dot = part_a + part_b;
    return dot;
}

487
kernel/zarch/dgemv_n_4.c Normal file
View File

@ -0,0 +1,487 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
#ifdef HAVE_KERNEL_4x4
#elif HAVE_KERNEL_4x4_VEC
/*
 * y[0..n-1] += alpha * (xo[0]*ap[0][] + xo[1]*ap[1][] + xo[2]*ap[2][] + xo[3]*ap[3][])
 * using two-double compiler vector types.
 *
 * y and the four arrays in ap are accessed through __vector double pointers,
 * two vectors (four doubles) per step, so n is expected to be a multiple
 * of 4.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    const FLOAT s0 = xo[0] * *alpha;
    const FLOAT s1 = xo[1] * *alpha;
    const FLOAT s2 = xo[2] * *alpha;
    const FLOAT s3 = xo[3] * *alpha;

    /* each scaled x value broadcast to both lanes */
    __vector double bx0 = {s0,s0};
    __vector double bx1 = {s1,s1};
    __vector double bx2 = {s2,s2};
    __vector double bx3 = {s3,s3};

    __vector double *out  = (__vector double*)y;
    __vector double *col0 = (__vector double*)ap[0];
    __vector double *col1 = (__vector double*)ap[1];
    __vector double *col2 = (__vector double*)ap[2];
    __vector double *col3 = (__vector double*)ap[3];

    const BLASLONG vec_total = n/2;
    BLASLONG base;
    BLASLONG v;

    for (base = 0; base < vec_total; base += 2) {
        for (v = base; v < base + 2; v++) {
            out[v] += bx0 * col0[v] + bx1 * col1[v] + bx2 * col2[v] + bx3 * col3[v];
        }
    }
}
#else
/*
 * Scalar fallback:
 * y[0..n-1] += alpha * (xo[0]*ap[0][] + xo[1]*ap[1][] + xo[2]*ap[2][] + xo[3]*ap[3][]).
 * Rows are handled four at a time, so n is expected to be a multiple of 4.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT *c0 = ap[0];
    FLOAT *c1 = ap[1];
    FLOAT *c2 = ap[2];
    FLOAT *c3 = ap[3];
    BLASLONG base;
    BLASLONG r;

    const FLOAT w0 = xo[0] * *alpha;
    const FLOAT w1 = xo[1] * *alpha;
    const FLOAT w2 = xo[2] * *alpha;
    const FLOAT w3 = xo[3] * *alpha;

    for (base = 0; base < n; base += 4) {
        for (r = base; r < base + 4; r++) {
            y[r] += c0[r]*w0 + c1[r]*w1 + c2[r]*w2 + c3[r]*w3;
        }
    }
}
#endif
#ifdef HAVE_KERNEL_4x2
#elif HAVE_KERNEL_4x2_VEC
/*
 * y[0..n-1] += alpha * (xo[0]*ap[0][] + xo[1]*ap[1][]) using two-double
 * compiler vector types.  Two vectors (four doubles) are updated per step,
 * so n is expected to be a multiple of 4.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    const FLOAT s0 = xo[0] * *alpha;
    const FLOAT s1 = xo[1] * *alpha;

    /* each scaled x value broadcast to both lanes */
    __vector double bx0 = {s0,s0};
    __vector double bx1 = {s1,s1};

    __vector double *out  = (__vector double*)y;
    __vector double *col0 = (__vector double*)ap[0];
    __vector double *col1 = (__vector double*)ap[1];

    const BLASLONG vec_total = n/2;
    BLASLONG base;
    BLASLONG v;

    for (base = 0; base < vec_total; base += 2) {
        for (v = base; v < base + 2; v++) {
            out[v] += bx0 * col0[v] + bx1 * col1[v];
        }
    }
}
#else
/*
 * Scalar fallback: y[0..n-1] += alpha * (xo[0]*ap[0][] + xo[1]*ap[1][]).
 * Rows are handled four at a time, so n is expected to be a multiple of 4.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT *c0 = ap[0];
    FLOAT *c1 = ap[1];
    BLASLONG base;
    BLASLONG r;

    const FLOAT w0 = xo[0] * *alpha;
    const FLOAT w1 = xo[1] * *alpha;

    for (base = 0; base < n; base += 4) {
        for (r = base; r < base + 4; r++) {
            y[r] += c0[r]*w0 + c1[r]*w1;
        }
    }
}
#endif
#ifdef HAVE_KERNEL_4x1
#elif HAVE_KERNEL_4x1_VEC
/*
 * y[0..n-1] += alpha * xo[0] * ap[0..n-1] using two-double compiler vector
 * types.  Two vectors (four doubles) are updated per step, so n is expected
 * to be a multiple of 4.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    const FLOAT s0 = xo[0] * *alpha;

    /* scaled x value broadcast to both lanes */
    __vector double bx0 = {s0,s0};

    __vector double *out  = (__vector double*)y;
    __vector double *col0 = (__vector double*)ap;

    const BLASLONG vec_total = n/2;
    BLASLONG base;
    BLASLONG v;

    for (base = 0; base < vec_total; base += 2) {
        for (v = base; v < base + 2; v++) {
            out[v] += bx0 * col0[v];
        }
    }
}
#else
/*
 * Scalar fallback: y[0..n-1] += alpha * xo[0] * ap[0..n-1].
 * Rows are handled four at a time, so n is expected to be a multiple of 4.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT *c0 = ap;
    BLASLONG base;
    BLASLONG r;

    const FLOAT w0 = xo[0] * *alpha;

    for (base = 0; base < n; base += 4) {
        for (r = base; r < base + 4; r++) {
            y[r] += c0[r]*w0;
        }
    }
}
#endif
/*
 * Accumulate a contiguous buffer into a strided vector:
 * dest[k * inc_dest] += src[k] for k = 0 .. n-1.
 * Kept out of line (noinline).
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG remaining = n;

    while (remaining > 0) {
        *dest += *src;
        src++;
        dest += inc_dest;
        remaining--;
    }
}
/*
 * Driver for the non-transposed matrix-vector update  y += alpha * A * x.
 *
 * A is walked column by column: column j starts at a + j*lda and contributes
 * m consecutive elements.  x supplies n values with stride inc_x; y receives
 * m values with stride inc_y.
 *
 * Rows are processed in blocks of NB <= NBMAX rows through the
 * dgemv_kernel_4x4 / 4x2 / 4x1 helpers; the last (m & 3) rows are finished by
 * the scalar code after the main loop.
 *
 * buffer  - scratch area used to accumulate a row block when inc_y != 1.
 * dummy1  - not used here.
 * Returns 0 in all cases.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
/* j is declared but never referenced in this function. */
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* Element distance between column j and column j+4. */
BLASLONG lda4 = lda << 2;
FLOAT xbuffer[8],*ybuffer;
/* Nothing to do for an empty matrix. */
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
/* n1: number of full 4-column groups; n2: leftover columns (0..3). */
n1 = n >> 2 ;
n2 = n & 3 ;
/* m3: leftover rows (0..3); m1: m rounded down to a multiple of 4. */
m3 = m & 3 ;
m1 = m & -4 ;
/* m2: size of the final partial row block.  The mask form equals m % NBMAX
   only if NBMAX is a power of two -- its definition is outside this excerpt,
   TODO confirm. */
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
/* One pass per row block.  The loop ends after the first block that is
   smaller than NBMAX, or via the break when no partial block remains. */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
/* Restart at column 0 of the current row block. */
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
/* Strided y: accumulate into a zeroed scratch block and scatter it later.
   Unit-stride y: accumulate directly into y.
   NOTE(review): the byte count NB*8 assumes sizeof(FLOAT) == 8. */
if ( inc_y != 1 )
memset(ybuffer,0,NB*8);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
/* Contiguous x: the kernels read x in place. */
for( i = 0; i < n1 ; i++)
{
dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
/* Leftover columns: a pair first (via ap), then a single one (via a_ptr). */
if ( n2 & 2 )
{
dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
/* Strided x: gather four values into xbuffer for each 4-column group. */
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
/* Leftover columns are handled one at a time on this path. */
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
/* Advance to the next row block of A and y. */
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
/* Row tail: a and y_ptr now address the first of the remaining m3 rows. */
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
/* lda == 3 with unit-stride x: the three rows are packed back to back, so
   four columns (12 elements) are consumed per pass. */
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
/* General strides: one column per pass. */
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
/* alpha is applied once to each accumulated row sum. */
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
/* lda == 2 with unit-stride x: four columns (8 elements) per pass. */
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
/* lda == 1 with unit-stride x: the remaining row is a plain dot product
   of two contiguous arrays. */
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}

541
kernel/zarch/dgemv_t_4.c Normal file
View File

@ -0,0 +1,541 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/* Select the __vector double builds of the 4x4, 4x2 and 4x1 kernels below;
   with a macro left undefined, the matching #else branch supplies the
   plain-C kernel instead. */
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
/* The vector-intrinsics header is needed as soon as any vector kernel is selected. */
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
/* Blocking factor used by CNAME: the largest number of x elements (rows of A)
   handled per pass, and the number of columns batched into ytemp at a time. */
#define NBMAX 2048
#ifdef HAVE_KERNEL_4x4
#elif HAVE_KERNEL_4x4_VEC
/*
 * Vector build: four simultaneous dot products
 *     y[c] = sum over r of x[r] * ap[c][r]      for c = 0..3
 * y is overwritten, not accumulated.  Each loop pass consumes two
 * __vector double values (four elements) from x and from every column, so
 * when n is not a multiple of 4 the last pass reads beyond n.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __vector double *xv   = (__vector double *)x;
    __vector double *col0 = (__vector double *)ap[0];
    __vector double *col1 = (__vector double *)ap[1];
    __vector double *col2 = (__vector double *)ap[2];
    __vector double *col3 = (__vector double *)ap[3];

    /* One two-lane partial sum per column. */
    __vector double sum0 = {0,0};
    __vector double sum1 = {0,0};
    __vector double sum2 = {0,0};
    __vector double sum3 = {0,0};

    BLASLONG k = 0;
    while (k < n / 2) {
        sum0 += xv[k] * col0[k] + xv[k + 1] * col0[k + 1];
        sum1 += xv[k] * col1[k] + xv[k + 1] * col1[k + 1];
        sum2 += xv[k] * col2[k] + xv[k + 1] * col2[k + 1];
        sum3 += xv[k] * col3[k] + xv[k + 1] * col3[k + 1];
        k += 2;
    }

    /* Horizontal add of the two lanes gives each final dot product. */
    y[0] = sum0[0] + sum0[1];
    y[1] = sum1[0] + sum1[1];
    y[2] = sum2[0] + sum2[1];
    y[3] = sum3[0] + sum3[1];
}
#else
/*
 * Portable build: four simultaneous dot products
 *     y[c] = sum over r of ap[c][r] * x[r]      for c = 0..3
 * y is overwritten, not accumulated.  Elements are consumed four at a time,
 * so the loop reads beyond n when n is not a multiple of 4.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];
    FLOAT *col2 = ap[2];
    FLOAT *col3 = ap[3];
    FLOAT sum0 = 0.0;
    FLOAT sum1 = 0.0;
    FLOAT sum2 = 0.0;
    FLOAT sum3 = 0.0;
    BLASLONG r = 0;

    while (r < n) {
        sum0 += col0[r]*x[r] + col0[r+1]*x[r+1] + col0[r+2]*x[r+2] + col0[r+3]*x[r+3];
        sum1 += col1[r]*x[r] + col1[r+1]*x[r+1] + col1[r+2]*x[r+2] + col1[r+3]*x[r+3];
        sum2 += col2[r]*x[r] + col2[r+1]*x[r+1] + col2[r+2]*x[r+2] + col2[r+3]*x[r+3];
        sum3 += col3[r]*x[r] + col3[r+1]*x[r+1] + col3[r+2]*x[r+2] + col3[r+3]*x[r+3];
        r += 4;
    }

    y[0] = sum0;
    y[1] = sum1;
    y[2] = sum2;
    y[3] = sum3;
}
#endif
#ifdef HAVE_KERNEL_4x2
#elif HAVE_KERNEL_4x2_VEC
/*
 * Vector build: two simultaneous dot products
 *     y[c] = sum over r of x[r] * ap[c][r]      for c = 0..1
 * y is overwritten.  Each loop pass consumes four elements, so when n is not
 * a multiple of 4 the last pass reads beyond n.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __vector double *xv   = (__vector double *)x;
    __vector double *col0 = (__vector double *)ap[0];
    __vector double *col1 = (__vector double *)ap[1];
    __vector double sum0 = {0,0};
    __vector double sum1 = {0,0};

    BLASLONG k = 0;
    while (k < n / 2) {
        sum0 += xv[k] * col0[k] + xv[k + 1] * col0[k + 1];
        sum1 += xv[k] * col1[k] + xv[k + 1] * col1[k + 1];
        k += 2;
    }

    /* Collapse the two lanes of each partial sum. */
    y[0] = sum0[0] + sum0[1];
    y[1] = sum1[0] + sum1[1];
}
#else
/*
 * Portable build: two simultaneous dot products
 *     y[c] = sum over r of ap[c][r] * x[r]      for c = 0..1
 * y is overwritten.  Elements are consumed four at a time, so the loop reads
 * beyond n when n is not a multiple of 4.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];
    FLOAT sum0 = 0.0;
    FLOAT sum1 = 0.0;
    BLASLONG r = 0;

    while (r < n) {
        sum0 += col0[r]*x[r] + col0[r+1]*x[r+1] + col0[r+2]*x[r+2] + col0[r+3]*x[r+3];
        sum1 += col1[r]*x[r] + col1[r+1]*x[r+1] + col1[r+2]*x[r+2] + col1[r+3]*x[r+3];
        r += 4;
    }

    y[0] = sum0;
    y[1] = sum1;
}
#endif
#ifdef HAVE_KERNEL_4x1
#elif HAVE_KERNEL_4x1_VEC
/*
 * Vector build: a single dot product  y[0] = sum over r of x[r] * a0[r].
 * y[0] is overwritten.  Each loop pass consumes four elements, so when n is
 * not a multiple of 4 the last pass reads beyond n.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
    __vector double *xv  = (__vector double *)x;
    __vector double *col = (__vector double *)a0;
    __vector double sum = {0,0};

    BLASLONG k = 0;
    while (k < n / 2) {
        sum += xv[k] * col[k] + xv[k + 1] * col[k + 1];
        k += 2;
    }

    /* Collapse the two lanes into the scalar result. */
    y[0] = sum[0] + sum[1];
}
#else
/*
 * Portable build: a single dot product  y[0] = sum over r of a0[r] * x[r].
 * y[0] is overwritten.  Elements are consumed four at a time, so the loop
 * reads beyond n when n is not a multiple of 4.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
    FLOAT sum = 0.0;
    BLASLONG r = 0;

    while (r < n) {
        sum += a0[r]*x[r] + a0[r+1]*x[r+1] + a0[r+2]*x[r+2] + a0[r+3]*x[r+3];
        r += 4;
    }

    y[0] = sum;
}
#endif
/*
 * Gather a strided vector into contiguous storage:
 *     dest[k] = src[k * inc_src]   for k in [0, n)
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG k;

    for (k = 0; k < n; k++) {
        dest[k] = src[k * inc_src];
    }
}
/*
 * Scaled scatter-add of a contiguous buffer into a strided vector:
 *     dest[k * inc_dest] += src[k] * da   for k in [0, n)
 */
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;

    for (k = 0; k < n; k++) {
        dest[k * inc_dest] += src[k] * da;
    }
}
/*
 * Driver for the transposed matrix-vector update  y += alpha * A^T * x.
 *
 * Column j of A starts at a + j*lda and holds m elements; for every column
 * the kernels form its dot product with x, and the result times alpha is
 * added to the j-th element of y.  x supplies m values with stride inc_x,
 * y receives n values with stride inc_y.
 *
 * The m dimension is processed in blocks of NB <= NBMAX elements; the last
 * (m & 3) elements of x are folded in by the scalar code after the main loop.
 *
 * buffer  - scratch area: its start holds a packed copy of the current x
 *           block when inc_x != 1, and ytemp (kernel results awaiting the
 *           alpha scale) follows min(m, NBMAX) elements later.
 * dummy1  - not used here.
 * Returns 0 in all cases.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG register i;
BLASLONG register j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n0;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[4],*xbuffer;
FLOAT *ytemp;
/* Nothing to do for an empty matrix. */
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
xbuffer = buffer;
ytemp = buffer + (m < NBMAX ? m : NBMAX);
/* Column split: n0 batches of NBMAX columns, then n1 groups of four columns,
   then n2 leftover columns (0..3). */
n0 = n / NBMAX;
n1 = (n % NBMAX) >> 2 ;
n2 = n & 3 ;
/* m3: leftover x elements (0..3); m1: m rounded down to a multiple of 4;
   m2: size of the final partial block (NBMAX is a power of two, so the mask
   is m % NBMAX). */
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
BLASLONG NB = NBMAX;
/* One pass per block of x.  The loop ends after the first block that is
   smaller than NBMAX, or via the break when no partial block remains. */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
/* Every block restarts at the first column / first element of y. */
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* Unit-stride x is read in place; otherwise pack this block into xbuffer. */
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(NB,x_ptr,xbuffer,inc_x);
FLOAT *ap[4];
FLOAT *yp;
BLASLONG register lda4 = 4 * lda;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( n0 > 0 )
{
/* nb1 4-column kernel calls fill exactly NBMAX entries of ytemp. */
BLASLONG nb1 = NBMAX / 4;
for( j=0; j<n0; j++)
{
yp = ytemp;
for( i = 0; i < nb1 ; i++)
{
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
/* Scale the batch by alpha and add it into y. */
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += nb1 * inc_y * 4;
a_ptr += nb1 * lda4 ;
}
}
/* Remaining full groups of four columns. */
yp = ytemp;
for( i = 0; i < n1 ; i++)
{
dgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
if ( n1 > 0 )
{
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += n1 * inc_y * 4;
a_ptr += n1 * lda4 ;
}
/* Leftover columns: a pair first (via ap), then a single one (via a_ptr). */
if ( n2 & 2 )
{
dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer);
a_ptr += lda * 2;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1] * alpha;
y_ptr += inc_y;
}
if ( n2 & 1 )
{
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += lda;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
}
/* Advance to the next block of rows of A and elements of x. */
a += NB;
x += NB * inc_x;
}
/* Tail: x and a now address the first of the remaining m3 elements/rows. */
if ( m3 == 0 ) return(0);
x_ptr = x;
a_ptr = a;
if ( m3 == 3 )
{
/* Pre-scale the three leftover x values by alpha. */
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
/* lda == 3 with unit-stride y: columns are packed back to back, so four
   columns (12 elements) are consumed per pass. */
if ( lda == 3 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
}
else
{
if ( inc_y == 1 )
{
/* General lda, unit-stride y: four columns per pass via lda multiples. */
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
aj += lda;
}
}
else
{
/* Strided y: one column per pass. */
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
if ( m3 == 2 )
{
/* Same three-way split as above, for two leftover x values. */
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 2 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
aj += 8;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
aj += 2;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
/* m3 == 1: a single leftover x value, i.e. y += (alpha * x) * (last row of A). */
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 1 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[j] * xtemp;
y_ptr[j+1] += aj[j+1] * xtemp;
y_ptr[j+2] += aj[j+2] * xtemp;
y_ptr[j+3] += aj[j+3] * xtemp;
}
for ( ; j<n ; j++ )
{
y_ptr[j] += aj[j] * xtemp;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp;
y_ptr[j+1] += *(aj+lda) * xtemp;
y_ptr[j+2] += *(aj+lda2) * xtemp;
y_ptr[j+3] += *(aj+lda3) * xtemp;
aj += lda4 ;
}
for ( ; j<n; j++ )
{
y_ptr[j] += *aj * xtemp;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}

270
kernel/zarch/drot.c Normal file
View File

@ -0,0 +1,270 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Apply a plane rotation to two contiguous vectors, 32 doubles per loop pass:
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]
 *
 * c and s are passed by address and broadcast into %v0 / %v1 with vlrepg.
 * %r0 holds the trip count n >> 5 and %r1 the running byte offset.  The loop
 * is bottom-tested with brctg, so n must be a non-zero multiple of 32; CNAME
 * below only calls this with (n & -32) > 0.
 *
 * One pass consists of four identical 64-byte groups (offsets 0, 64, 128,
 * 192): load four vectors of x (v24-v27) and of y (v16-v19), form the
 * products with c and s, fuse in the second term, and store both results.
 */
static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
/* v0 = {c,c}, v1 = {s,s} */
"vlrepg %%v0,0(%3) \n\t"
"vlrepg %%v1,0(%4) \n\t"
/* r0 = n / 32 (loop count), r1 = 0 (byte offset) */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch the data one pass (256 bytes) ahead */
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
/* group 0: bytes 0..63 of x and y */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" /* xn=x*c */
"vfmdb %%v29,%%v25,%%v0 \n\t" /* xn=x*c */
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t" /* xn=x*c */
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t" /* xn=x*c */
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second part: fold in the y terms */
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" /* xn=y*s+xn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" /* xn=y*s+xn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" /* xn=y*s+xn */
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" /* xn=y*s+xn */
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* group 1: bytes 64..127, same sequence */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second part: fold in the y terms */
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* group 2: bytes 128..191, same sequence */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second part: fold in the y terms */
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* group 3: bytes 192..255, same sequence */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second part: fold in the y terms */
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
/* advance the byte offset by one pass and loop while the count is non-zero */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y),"a"(c),"a"(s)
: "cc", "memory","r0","r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
/*
 * Plane rotation of two vectors:
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]      for k in [0, n)
 *
 * x and y are walked with strides inc_x and inc_y.  When both strides are 1,
 * the leading (n & -32) elements go through drot_kernel_32 and the rest is
 * finished with scalar code.  Returns 0 in all cases.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k = 0;
    FLOAT rotated;

    if (n <= 0) {
        return 0;
    }

    if (inc_x != 1 || inc_y != 1) {
        /* Strided operands: plain scalar loop with separate offsets. */
        BLASLONG px = 0;
        BLASLONG py = 0;

        for (; k < n; k++) {
            rotated = c*x[px] + s*y[py];
            y[py] = c*y[py] - s*x[px];
            x[px] = rotated;
            px += inc_x;
            py += inc_y;
        }
        return 0;
    }

    /* Contiguous operands: bulk through the assembly kernel ... */
    BLASLONG bulk = n & -32;
    if (bulk > 0) {
        /* The kernel takes c and s by address, so hand it local copies. */
        FLOAT cosa = c;
        FLOAT sina = s;

        drot_kernel_32(bulk, x, y, &cosa, &sina);
        k = bulk;
    }

    /* ... then the remaining (fewer than 32) elements. */
    for (; k < n; k++) {
        rotated = c*x[k] + s*y[k];
        y[k] = c*y[k] - s*x[k];
        x[k] = rotated;
    }

    return 0;
}

210
kernel/zarch/dscal.c Normal file
View File

@ -0,0 +1,210 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(Z13)
/*
 * Scale a contiguous run of doubles in place:  x[k] = da * x[k], k in [0, n).
 *
 * %r0 holds the end offset n*8 in bytes and %r1 the running byte offset.  One
 * pass handles 128 bytes (16 doubles) and the bound is only tested at the
 * bottom of the loop, so n must be a positive multiple of 16; CNAME below
 * passes (n & -16) and skips the call when that is zero.
 *
 * da is broadcast into both lanes of %v0 with an explicit load (vlrepg)
 * through an address operand.  It must not be assumed to still sit in %f0
 * from the function call: the compiler gives no such guarantee for inline
 * asm, and because v0 is in the clobber list no input operand can ever be
 * allocated there.
 */
static void __attribute__ ((noinline)) dscal_kernel_8( BLASLONG n, FLOAT da , FLOAT *x )
{
    __asm__ ("pfd 2, 0(%1) \n\t"
             /* v0 = {da, da} */
             "vlrepg %%v0,0(%2) \n\t"
             /* r0 = n * 8 (end offset in bytes), r1 = 0 */
             "sllg %%r0,%0,3 \n\t"
             "xgr %%r1,%%r1 \n\t"
             ".align 16 \n\t"
             "1: \n\t"
             "pfd 2, 256(%%r1,%1) \n\t"
             "vl %%v24, 0(%%r1,%1) \n\t"
             "vfmdb %%v24,%%v24,%%v0 \n\t"
             "vst %%v24, 0(%%r1,%1) \n\t"
             "vl %%v25, 16(%%r1,%1) \n\t"
             "vfmdb %%v25,%%v25,%%v0 \n\t"
             "vst %%v25, 16(%%r1,%1) \n\t"
             "vl %%v26, 32(%%r1,%1) \n\t"
             "vfmdb %%v26,%%v26,%%v0 \n\t"
             "vst %%v26, 32(%%r1,%1) \n\t"
             "vl %%v27, 48(%%r1,%1) \n\t"
             "vfmdb %%v27,%%v27,%%v0 \n\t"
             "vst %%v27, 48(%%r1,%1) \n\t"
             "vl %%v24, 64(%%r1,%1) \n\t"
             "vfmdb %%v24,%%v24,%%v0 \n\t"
             "vst %%v24, 64(%%r1,%1) \n\t"
             "vl %%v25, 80(%%r1,%1) \n\t"
             "vfmdb %%v25,%%v25,%%v0 \n\t"
             "vst %%v25, 80(%%r1,%1) \n\t"
             "vl %%v26, 96(%%r1,%1) \n\t"
             "vfmdb %%v26,%%v26,%%v0 \n\t"
             "vst %%v26, 96(%%r1,%1) \n\t"
             "vl %%v27, 112(%%r1,%1) \n\t"
             "vfmdb %%v27,%%v27,%%v0 \n\t"
             "vst %%v27, 112(%%r1,%1) \n\t"
             /* next 128-byte chunk; loop while offset < end */
             "la %%r1,128(%%r1) \n\t"
             "clgrjl %%r1,%%r0,1b \n\t"
             :
             : "r"(n), "a"(x), "a"(&da)
             : "cc", "memory", "r0", "r1", "v0", "v24", "v25", "v26", "v27"
            );
}
/*
 * Fill a contiguous run of doubles with zero:  x[k] = 0, k in [0, n).
 *
 * %r0 holds the end offset n*8 in bytes and %r1 the running byte offset.  One
 * pass stores 128 bytes (16 doubles) and the bound is only tested at the
 * bottom of the loop, so n must be a positive multiple of 16; CNAME below
 * passes (n & -16) and skips the call when that is zero.
 *
 * da is listed as an input operand but is never referenced by the assembly.
 */
static void __attribute__ ((noinline)) dscal_kernel_8_zero( BLASLONG n, FLOAT da , FLOAT *x )
{
__asm__ ("pfd 2, 0(%1) \n\t"
/* v0 = all zero bits */
"vzero %%v0 \n\t"
/* r0 = n * 8 (end offset in bytes), r1 = 0 */
"sllg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v0, 16(%%r1,%1) \n\t"
"vst %%v0, 32(%%r1,%1) \n\t"
"vst %%v0, 48(%%r1,%1) \n\t"
"vst %%v0, 64(%%r1,%1) \n\t"
"vst %%v0, 80(%%r1,%1) \n\t"
"vst %%v0, 96(%%r1,%1) \n\t"
"vst %%v0, 112(%%r1,%1) \n\t"
/* next 128-byte chunk; loop while offset < end */
"la %%r1,128(%%r1) \n\t"
"clgrjl %%r1,%%r0,1b \n\t"
:
:"r"(n),"a"(x),"f"(da)
:"cc" , "memory" ,"r0","r1","v0"
);
}
#endif
/*
 * Scale a vector in place:  x[k * inc_x] = da * x[k * inc_x], k in [0, n).
 *
 * Returns immediately when n <= 0 or inc_x <= 0.  When da compares equal to
 * 0.0 the elements are overwritten with 0.0 rather than multiplied.  For
 * inc_x == 1 the leading (n & -16) elements go through the assembly kernels;
 * strided vectors are handled four elements per pass in C.
 *
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are not used.  Returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG pos = 0;    /* element offset into x */
    BLASLONG done = 0;   /* number of elements already processed */

    if (n <= 0 || inc_x <= 0) {
        return 0;
    }

    if (inc_x == 1) {
        /* Contiguous vector: bulk via the kernels, then a scalar tail. */
        BLASLONG bulk = n & -16;

        if (da == 0.0) {
            if (bulk > 0) {
                dscal_kernel_8_zero(bulk, da, x);
                done = bulk;
            }
            for (; done < n; done++) {
                x[done] = 0.0;
            }
        } else {
            if (bulk > 0) {
                dscal_kernel_8(bulk, da, x);
                done = bulk;
            }
            for (; done < n; done++) {
                x[done] = da * x[done];
            }
        }
        return 0;
    }

    /* Strided vector: groups of four, then the remainder one by one. */
    {
        BLASLONG quads = n & -4;

        if (da == 0.0) {
            for (; done < quads; done += 4) {
                x[pos] = 0.0;
                x[pos + inc_x] = 0.0;
                x[pos + 2 * inc_x] = 0.0;
                x[pos + 3 * inc_x] = 0.0;
                pos += inc_x * 4;
            }
            for (; done < n; done++) {
                x[pos] = 0.0;
                pos += inc_x;
            }
        } else {
            for (; done < quads; done += 4) {
                x[pos] = da * x[pos];
                x[pos + inc_x] = da * x[pos + inc_x];
                x[pos + 2 * inc_x] = da * x[pos + 2 * inc_x];
                x[pos + 3 * inc_x] = da * x[pos + 3 * inc_x];
                pos += inc_x * 4;
            }
            for (; done < n; done++) {
                x[pos] = da * x[pos];
                pos += inc_x;
            }
        }
    }

    return 0;
}

382
kernel/zarch/dswap.c Normal file
View File

@ -0,0 +1,382 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/* Chooses which dswap_kernel_32 implementation is compiled: the #if chain
   below tests Z13_SWAP_A, then Z13_SWAP_B, then Z13_SWAP_C.  The variants
   differ only in how loads and stores are ordered. */
#define Z13_SWAP_C 1
#if defined(Z13_SWAP_A)
/*
 * Variant A: exchange the contents of two contiguous vectors, 32 doubles
 * (256 bytes of each) per loop pass.
 *
 * %r0 holds the trip count n >> 5 and %r1 the running byte offset.  The loop
 * is bottom-tested with brctg, so n must be a non-zero multiple of 32 for
 * exactly n elements to be swapped.
 *
 * Ordering: each 16-byte vector is swapped on its own -- load from x, load
 * from y, store to y, store to x -- before moving to the next offset.
 */
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
/* r0 = n / 32 (loop count), r1 = 0 (byte offset) */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch the data one pass (256 bytes) ahead */
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
/* bytes 0..127 */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vst %%v16, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%1) \n\t"
"vl %%v28, 64(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%1) \n\t"
"vl %%v29, 80(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vst %%v29, 80(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%1) \n\t"
"vl %%v30, 96(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vst %%v30, 96(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%1) \n\t"
"vl %%v31, 112(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"vst %%v31, 112(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%1) \n\t"
/* bytes 128..255 */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v16, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v17, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v18, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v19, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v20, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%1) \n\t"
/* advance the byte offset by one pass and loop while the count is non-zero */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory" ,"r0","r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#elif defined(Z13_SWAP_B)
/*
 * Variant B: exchange the contents of two contiguous vectors, 32 doubles
 * (256 bytes of each) per loop pass.
 *
 * %r0 holds the trip count n >> 5 and %r1 the running byte offset.  The loop
 * is bottom-tested with brctg, so n must be a non-zero multiple of 32 for
 * exactly n elements to be swapped.
 *
 * Ordering: the pass is split into two 128-byte halves.  In each half, eight
 * vectors of x (v24-v31) and eight of y (v16-v23) are loaded first, then the
 * x data is stored to y and the y data to x.
 */
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
/* r0 = n / 32 (loop count), r1 = 0 (byte offset) */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch the data one pass (256 bytes) ahead */
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
/* first half: load x[0..127] */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v28, 64(%%r1,%1) \n\t"
"vl %%v29, 80(%%r1,%1) \n\t"
"vl %%v30, 96(%%r1,%1) \n\t"
"vl %%v31, 112(%%r1,%1) \n\t"
/* first half: load y[0..127] */
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
/* first half: old x -> y */
"vst %%v24, 0(%%r1,%2) \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%2) \n\t"
"vst %%v29, 80(%%r1,%2) \n\t"
"vst %%v30, 96(%%r1,%2) \n\t"
"vst %%v31, 112(%%r1,%2)\n\t"
/* first half: old y -> x */
"vst %%v16, 0(%%r1,%1) \n\t"
"vst %%v17, 16(%%r1,%1) \n\t"
"vst %%v18, 32(%%r1,%1) \n\t"
"vst %%v19, 48(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%1) \n\t"
"vst %%v21, 80(%%r1,%1) \n\t"
"vst %%v22, 96(%%r1,%1) \n\t"
"vst %%v23, 112(%%r1,%1)\n\t"
/* second half: load x[128..255] */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
/* second half: load y[128..255] */
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
/* second half: old x -> y */
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
/* second half: old y -> x */
"vst %%v16, 128(%%r1,%1) \n\t"
"vst %%v17, 144(%%r1,%1) \n\t"
"vst %%v18, 160(%%r1,%1) \n\t"
"vst %%v19, 176(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%1) \n\t"
"vst %%v21, 208(%%r1,%1) \n\t"
"vst %%v22, 224(%%r1,%1) \n\t"
"vst %%v23, 240(%%r1,%1) \n\t"
/* advance the byte offset by one pass and loop while the count is non-zero */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0","r1", "v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#elif defined(Z13_SWAP_C)
/*
 * Vector swap kernel (variant selected by Z13_SWAP_C).
 *
 * Exchanges the leading part of x and y in chunks of 256 bytes per array
 * (32 eight-byte elements per loop pass). The pass count is n >> 5, so any
 * remainder of n modulo 32 is left untouched for the caller.
 *
 * n must be at least 32: the loop is closed by brctg, which decrements the
 * counter before testing it, so a starting count of zero would not terminate
 * as intended.
 *
 * x, y: base addresses; both are read and written through the "memory"
 * clobber rather than through explicit memory operands.
 */
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
/* prefetch both arrays for store access */
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
/* r0 = n / 32 (loop counter), r1 = 0 (running byte offset) */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
/* hold the whole 256-byte block of x in v16-v31 */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
/* first 128 bytes of y pass through v0-v7 straight into x */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
/* second 128 bytes of y, same route */
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
/* finally write the saved x block (v16-v31) into y */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
/* advance the byte offset by one block; loop while --r0 != 0 */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#endif
/*
 * SWAP entry point: exchanges n elements of x (stride inc_x) with n elements
 * of y (stride inc_y). The dummy* parameters are not used here.
 *
 * When both strides are 1, the largest multiple-of-32 prefix is handed to
 * dswap_kernel_32 and the remainder is swapped element by element; any other
 * stride combination is handled by a plain scalar loop.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG bulk;
    BLASLONG px = 0;
    BLASLONG py = 0;
    FLOAT held;

    if (n <= 0) {
        return(0);
    }

    if (inc_x != 1 || inc_y != 1) {
        /* general strides: walk both vectors with independent offsets */
        for (k = 0; k < n; k++) {
            held = y[py];
            y[py] = x[px];
            x[px] = held;
            px += inc_x;
            py += inc_y;
        }
        return(0);
    }

    /* contiguous case: vector kernel on whole blocks of 32, scalar tail after */
    bulk = n & -32;
    if (bulk > 0) {
        dswap_kernel_32(bulk, x, y);
    }
    for (k = bulk; k < n; k++) {
        held = y[k];
        y[k] = x[k];
        x[k] = held;
    }
    return(0);
}

249
kernel/zarch/idamax.c Normal file
View File

@ -0,0 +1,249 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine matching the build precision:
   fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel for the index of the largest absolute value.
 *
 * Scans n doubles starting at x, 256 bytes (32 doubles) per loop pass, until
 * the read pointer reaches x + n*8 bytes. The loop body runs before the bound
 * is tested, so n is expected to be a positive multiple of 32.
 *
 * On exit the winning |value| is stored through maxf and its 0-based index is
 * left in r2.
 *
 * NOTE(review): there is no C return statement; the result is handed back by
 * writing r2 directly (presumably the ABI return register), and the
 * equal-lanes path leaves the function with "br %r14" from inside the asm.
 * This relies on the function being noinline and having no epilogue work.
 * NOTE(review): operand %1 (x) is advanced with "la" although it is declared
 * as an input-only operand.
 */
static BLASLONG __attribute__((noinline)) diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
__asm__(
"pfd 1, 0(%1) \n\t"
/* r0 = x + n*8 : end-of-data address */
"sllg %%r0,%0,3 \n\t"
"agr %%r0,%1 \n\t"
/* v20..v23 = lane indices {0,1} {2,3} {4,5} {6,7} */
"VLEIG %%v20,0,0 \n\t"
"VLEIG %%v20,1,1 \n\t"
"VLEIG %%v21,2,0 \n\t"
"VLEIG %%v21,3,1 \n\t"
"VLEIG %%v22,4,0 \n\t"
"VLEIG %%v22,5,1 \n\t"
"VLEIG %%v23,6,0 \n\t"
"VLEIG %%v23,7,1 \n\t"
/* v4 = {8,8} index step, v5 = running base index,
   v18 = running maximum (starts at 0), v19 = index of running maximum */
"VREPIG %%v4,8 \n\t"
"vzero %%v5 \n\t"
"vzero %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%1 ) \n\t"
/* ---- first 16 doubles of the block: load and take absolute values ---- */
"vlm %%v24,%%v31, 0(%1 ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
/* pairwise reduction: keep the greater value and its lane index */
"vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v29,%%v28 \n\t "
"vfchdb %%v17,%%v31,%%v30 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v28, %%v3,%%v0 \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
/* turn lane indices into absolute ones: add the base, and a further 8 for
   the candidates that came from the upper eight values */
"VAG %%v1,%%v1,%%v5 \n\t"
"VAG %%v24,%%v24,%%v5 \n\t"
"VAG %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
/* fold into the running maximum; strictly-greater keeps the earlier entry */
"vfchdb %%v17, %%v29,%%v18 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
/* ---- second 16 doubles of the block: same sequence ---- */
"vlm %%v24,%%v31,128(%1 ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v29,%%v28 \n\t "
"vfchdb %%v17,%%v31,%%v30 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v28, %%v3,%%v0 \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"VAG %%v1,%%v1,%%v5 \n\t"
"VAG %%v24,%%v24,%%v5 \n\t"
/* advance the read pointer to the next 256-byte block */
"la %1,256(%1) \n\t"
"VAG %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
/* loop while the read pointer is below the end address */
"clgrjl %1,%%r0,1b \n\t"
/* ---- horizontal reduction of the two lanes of v18 / v19 ---- */
"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
/* lanes hold equal values: store the value, return the smaller index */
"VSTEG %%v18,0(%2),0 \n\t"
"VMNLG %%v1,%%v5,%%v19 \n\t"
"VLGVG %%r2,%%v1,0 \n\t"
"br %%r14 \n\t"
"2: \n\t"
/* lanes differ: pick the lane with the greater value and its index */
"wfchdb %%v16,%%v26,%%v18 \n\t"
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"VLGVG %%r2,%%v1,0 \n\t"
"std %%f0,0(%2) \n\t"
:
: "r"(n), "a"(x), "a"(maxf)
: "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * IAMAX entry point: 1-based position of an element of x with the largest
 * absolute value, scanning n elements with stride inc_x.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the largest multiple-of-32 prefix goes through
 * diamax_kernel_32_TUNED and the remaining elements are compared in a scalar
 * loop. Other strides are scanned in groups of four followed by a scalar
 * remainder. Every scalar comparison is strictly-greater, so an earlier
 * candidate is kept on ties.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;       /* largest |x| seen so far */
    BLASLONG best_at = 0;   /* 0-based logical index of that element */
    BLASLONG pos = 0;       /* offset into x */
    BLASLONG k = 0;         /* logical element counter (strided path) */
    BLASLONG u;
    BLASLONG quads;

    if (n <= 0 || inc_x <= 0) {
        return (best_at);
    }

    if (inc_x == 1) {
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best_at = diamax_kernel_32_TUNED(bulk, x, &best);
            pos = bulk;
        }
        for (; pos < n; pos++) {
            if (ABS(x[pos]) > best) {
                best_at = pos;
                best = ABS(x[pos]);
            }
        }
        return (best_at + 1);
    }

    /* strided path: four candidates per outer pass, then the leftovers */
    quads = n & -4;
    while (k < quads) {
        for (u = 0; u < 4; u++) {
            if (ABS(x[pos + u * inc_x]) > best) {
                best_at = k + u;
                best = ABS(x[pos + u * inc_x]);
            }
        }
        pos += inc_x * 4;
        k += 4;
    }
    for (; k < n; k++) {
        if (ABS(x[pos]) > best) {
            best_at = k;
            best = ABS(x[pos]);
        }
        pos += inc_x;
    }
    return (best_at + 1);
}

249
kernel/zarch/idamin.c Normal file
View File

@ -0,0 +1,249 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine matching the build precision:
   fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel for the index of the smallest absolute value.
 *
 * Scans n doubles starting at x, 256 bytes (32 doubles) per loop pass, until
 * the read pointer reaches x + n*8 bytes. The loop body runs before the bound
 * is tested, so n is expected to be a positive multiple of 32.
 *
 * On exit the winning |value| is stored through maxf (the parameter name is
 * historical; it receives the minimum) and its 0-based index is left in r2.
 *
 * The running minimum is seeded with |x[0]|. The absolute value matters: every
 * candidate compared against the seed is an absolute value, so a raw negative
 * x[0] could never be displaced and index 0 would be reported regardless of
 * the data.
 *
 * NOTE(review): there is no C return statement; the result is handed back by
 * writing r2 directly (presumably the ABI return register), and the
 * equal-lanes path leaves the function with "br %r14" from inside the asm.
 * NOTE(review): operand %1 (x) is advanced with "la" although it is declared
 * as an input-only operand.
 */
static BLASLONG __attribute__((noinline)) diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
__asm__(
"pfd 1, 0(%1) \n\t"
/* r0 = x + n*8 : end-of-data address */
"sllg %%r0,%0,3 \n\t"
"agr %%r0,%1 \n\t"
/* v20..v23 = lane indices {0,1} {2,3} {4,5} {6,7} */
"VLEIG %%v20,0,0 \n\t"
"VLEIG %%v20,1,1 \n\t"
"VLEIG %%v21,2,0 \n\t"
"VLEIG %%v21,3,1 \n\t"
"VLEIG %%v22,4,0 \n\t"
"VLEIG %%v22,5,1 \n\t"
"VLEIG %%v23,6,0 \n\t"
"VLEIG %%v23,7,1 \n\t"
/* v4 = {8,8} index step, v5 = running base index */
"VREPIG %%v4,8 \n\t"
"vzero %%v5 \n\t"
/* v18 = running minimum, seeded with |x[0]| in both lanes; v19 = its index */
"vlrepg %%v18,0(%1) \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%1 ) \n\t"
/* ---- first 16 doubles of the block: load and take absolute values ---- */
"vlm %%v24,%%v31, 0(%1 ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
/* pairwise reduction: keep the smaller value and its lane index */
"vfchdb %%v16,%%v24,%%v25 \n\t "
"vfchdb %%v17,%%v26 ,%%v27 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v28, %%v29 \n\t "
"vfchdb %%v17,%%v30,%%v31 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v28,%%v0 , %%v3 \n\t"
"vfchdb %%v29, %%v25,%%v27 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
/* turn lane indices into absolute ones: add the base, and a further 8 for
   the candidates that came from the upper eight values */
"VAG %%v1,%%v1,%%v5 \n\t"
"VAG %%v24,%%v24,%%v5 \n\t"
"VAG %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
/* fold into the running minimum; strictly-greater keeps the earlier entry */
"vfchdb %%v17,%%v18, %%v29 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
/* ---- second 16 doubles of the block: same sequence ---- */
"vlm %%v24,%%v31,128(%1 ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v24,%%v25 \n\t"
"vfchdb %%v17,%%v26 ,%%v27 \n\t"
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v28 ,%%v29 \n\t"
"vfchdb %%v17,%%v30,%%v31 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v28,%%v0 , %%v3 \n\t"
"vfchdb %%v29, %%v25,%%v27 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"VAG %%v1,%%v1,%%v5 \n\t"
"VAG %%v24,%%v24,%%v5 \n\t"
/* advance the read pointer to the next 256-byte block */
"la %1,256(%1) \n\t"
"VAG %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17,%%v18, %%v29 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
/* loop while the read pointer is below the end address */
"clgrjl %1,%%r0,1b \n\t"
/* ---- horizontal reduction of the two lanes of v18 / v19 ---- */
"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
/* lanes hold equal values: store the value, return the smaller index */
"VSTEG %%v18,0(%2),0 \n\t"
"VMNLG %%v1,%%v5,%%v19 \n\t"
"VLGVG %%r2,%%v1,0 \n\t"
"br %%r14 \n\t"
"2: \n\t"
/* lanes differ: pick the lane with the smaller value and its index */
"wfchdb %%v16,%%v18 ,%%v26 \n\t "
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"VLGVG %%r2,%%v1,0 \n\t"
"std %%f0,0(%2) \n\t"
:
: "r"(n), "a"(x), "a"(maxf)
: "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * IAMIN entry point: 1-based position of an element of x with the smallest
 * absolute value, scanning n elements with stride inc_x.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the largest multiple-of-32 prefix goes through
 * diamin_kernel_32 (which overwrites minf with its own result) and the
 * remaining elements are compared in a scalar loop. Other strides are scanned
 * four at a time followed by a scalar remainder.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;     /* offset into x */
    BLASLONG j = 0;     /* logical element counter (strided path) */
    FLOAT minf = 0.0;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    /* Seed the running minimum with the first element. Starting from 0.0
       would make every "ABS(..) < minf" test fail, so any path that skips
       the kernel would always report element 1. The index stays at 0. */
    minf = ABS(x[0]);

    if (inc_x == 1) {
        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            min = diamin_kernel_32(n1, x, &minf);
            i = n1;
        }
        /* scalar tail; strictly-less keeps the earlier candidate on ties */
        while (i < n) {
            if (ABS(x[i]) < minf) {
                min = i;
                minf = ABS(x[i]);
            }
            i++;
        }
        return (min + 1);
    } else {
        BLASLONG n1 = n & -4;
        /* four strided candidates per pass */
        while (j < n1) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            if (ABS(x[i + inc_x]) < minf) {
                min = j + 1;
                minf = ABS(x[i + inc_x]);
            }
            if (ABS(x[i + 2 * inc_x]) < minf) {
                min = j + 2;
                minf = ABS(x[i + 2 * inc_x]);
            }
            if (ABS(x[i + 3 * inc_x]) < minf) {
                min = j + 3;
                minf = ABS(x[i + 3 * inc_x]);
            }
            i += inc_x * 4;
            j += 4;
        }
        /* leftovers */
        while (j < n) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}

257
kernel/zarch/izamax.c Normal file
View File

@ -0,0 +1,257 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute value of a double. */
#define ABS fabs
/* |re| + |im| of the complex value whose real part is x[i] and whose imaginary
   part is x[i+1]. Arguments and result are parenthesised so the macro can be
   used safely inside larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1]))
/*
 * Vector kernel for the index of the complex element with the largest
 * |re| + |im|.
 *
 * Scans interleaved (re, im) doubles starting at x until the read pointer
 * reaches x + n*16 bytes. Each loop pass consumes 256 bytes, i.e. 16 complex
 * values (the index vectors cover 0..15 and the base index steps by 16),
 * despite the "_8" in the name. The loop body runs before the bound is
 * tested, so n has to be a positive multiple of 16 for the reads to stay
 * within n elements.
 *
 * On exit the winning magnitude is stored through maxf and its 0-based
 * element index is left in r2.
 *
 * NOTE(review): there is no C return statement; the result is handed back by
 * writing r2 directly (presumably the ABI return register), and the
 * equal-lanes path leaves the function with "br %r14" from inside the asm.
 * NOTE(review): operand %1 (x) is advanced with "la" although it is declared
 * as an input-only operand.
 */
static BLASLONG __attribute__((noinline)) ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
__asm__(
"pfd 1, 0(%1) \n\t"
/* v16..v23 = element indices {0,1} {2,3} ... {14,15} */
"VLEIG %%v16,0,0 \n\t"
"VLEIG %%v16,1,1 \n\t"
"VLEIG %%v17,2,0 \n\t"
"VLEIG %%v17,3,1 \n\t"
"VLEIG %%v18,4,0 \n\t"
"VLEIG %%v18,5,1 \n\t"
"VLEIG %%v19,6,0 \n\t"
"VLEIG %%v19,7,1 \n\t"
"VLEIG %%v20,8,0 \n\t"
"VLEIG %%v20,9,1 \n\t"
"VLEIG %%v21,10,0 \n\t"
"VLEIG %%v21,11,1 \n\t"
"VLEIG %%v22,12,0 \n\t"
"VLEIG %%v22,13,1 \n\t"
"VLEIG %%v23,14,0 \n\t"
"VLEIG %%v23,15,1 \n\t"
/* r0 = x + n*16 : end-of-data address */
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%1 \n\t"
/* v6 = running maximum (starts at 0), v7 = its index,
   v4 = {16,16} index step, v5 = running base index */
"vzero %%v6 \n\t"
"vzero %%v7 \n\t"
"VREPIG %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%1 ) \n\t"
/* de-interleave elements 0..7: real parts into v24/v26/v28/v30,
   imaginary parts into v25/v27/v29/v31 */
"vleg %%v24 , 0( %1),0 \n\t"
"vleg %%v25 , 8( %1),0 \n\t"
"vleg %%v24 , 16( %1),1 \n\t"
"vleg %%v25 , 24( %1),1 \n\t"
"vleg %%v26 , 32( %1),0 \n\t"
"vleg %%v27 , 40( %1),0 \n\t"
"vleg %%v26 , 48( %1),1 \n\t"
"vleg %%v27 , 56( %1),1 \n\t"
"vleg %%v28 , 64( %1),0 \n\t"
"vleg %%v29 , 72( %1),0 \n\t"
"vleg %%v28 , 80( %1),1 \n\t"
"vleg %%v29 , 88( %1),1 \n\t"
"vleg %%v30 , 96( %1),0 \n\t"
"vleg %%v31 ,104( %1),0 \n\t"
"vleg %%v30 ,112( %1),1 \n\t"
"vleg %%v31 ,120( %1),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
/* v0..v3 = |re| + |im| for elements 0..7 */
"vfadb %%v0,%%v24,%%v25 \n\t"
"vfadb %%v1,%%v26,%%v27 \n\t"
"vfadb %%v2,%%v28,%%v29 \n\t"
"vfadb %%v3,%%v30,%%v31 \n\t"
/* same for elements 8..15 */
"vleg %%v24 , 128( %1),0 \n\t"
"vleg %%v25 , 136( %1),0 \n\t"
"vleg %%v24 , 144( %1),1 \n\t"
"vleg %%v25 , 152( %1),1 \n\t"
"vleg %%v26 , 160( %1),0 \n\t"
"vleg %%v27 , 168( %1),0 \n\t"
"vleg %%v26 , 176( %1),1 \n\t"
"vleg %%v27 , 184( %1),1 \n\t"
"vleg %%v28 , 192( %1),0 \n\t"
"vleg %%v29 , 200( %1),0 \n\t"
"vleg %%v28 , 208( %1),1 \n\t"
"vleg %%v29 , 216( %1),1 \n\t"
"vleg %%v30 , 224( %1),0 \n\t"
"vleg %%v31 , 232( %1),0 \n\t"
"vleg %%v30 , 240( %1),1 \n\t"
"vleg %%v31 , 248( %1),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
/* v24/v26/v28/v30 = |re| + |im| for elements 8..15 */
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
/* reduction tree over the eight magnitude vectors: at every step keep the
   greater value together with its element index */
"vfchdb %%v25,%%v1,%%v0 \n\t"
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
"vfchdb %%v27,%%v3,%%v2 \n\t "
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
"vfchdb %%v25,%%v26,%%v24 \n\t "
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
"vfchdb %%v27,%%v30,%%v28 \n\t "
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
"vfchdb %%v24, %%v1,%%v31 \n\t"
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
"vfchdb %%v30, %%v27,%%v3 \n\t"
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
/* advance the read pointer to the next 256-byte block */
"la %1,256(%1) \n\t"
"vfchdb %%v0, %%v31,%%v28 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
/* make the block-local index absolute */
"VAG %%v25,%%v25,%%v5 \n\t"
/* compare the block winner with the running maximum */
"vfchdb %%v30, %%v27,%%v6 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
/* loop while the read pointer is below the end address */
"clgrjl %1,%%r0,1b \n\t"
/* extract the final index from the two lanes of v6 / v7 */
"vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t"
/* lanes hold equal values: store the value, return the smaller index */
"VSTEG %%v6,0(%2),0 \n\t"
"VMNLG %%v1,%%v5,%%v7 \n\t"
"VLGVG %%r2,%%v1,0 \n\t"
"br %%r14 \n\t"
"2: \n\t"
/* lanes differ: pick the lane with the greater value and its index */
"wfchdb %%v16,%%v26,%%v6 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"VLGVG %%r2,%%v1,0 \n\t"
"std %%f0,0(%2) \n\t"
:
: "r"(n), "a"(x), "a"(maxf)
: "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * IZAMAX entry point: 1-based position of a complex element of x with the
 * largest |re| + |im|, scanning n elements with stride inc_x (in complex
 * elements; x holds interleaved re/im pairs).
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;     /* logical element index */
    BLASLONG ix = 0;    /* offset of the current real part inside x */
    FLOAT maxf = 0;
    BLASLONG max = 0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(max);

    if (inc_x == 1) {
        /* The vector kernel consumes 256 bytes = 16 complex values per loop
           pass, so only a multiple of 16 may be handed to it; a multiple of
           8 would let its last pass read beyond element n. */
        BLASLONG n1 = n & -16;
        if (n1 > 0) {
            max = ziamax_kernel_8_TUNED(n1, x, &maxf);
            i = n1;
            /* continue after the part the kernel already covered instead of
               rescanning from x[0] */
            ix = 2 * n1;
        }
        while(i < n)
        {
            if( CABS1(x,ix) > maxf )
            {
                max = i;
                maxf = CABS1(x,ix);
            }
            ix += 2;
            i++;
        }
        return (max + 1);
    } else {
        inc_x2 = 2 * inc_x;
        /* seed with the first element, then scan the rest */
        maxf = CABS1(x,0);
        ix += inc_x2;
        i++;
        while(i < n)
        {
            if( CABS1(x,ix) > maxf )
            {
                max = i;
                maxf = CABS1(x,ix);
            }
            ix += inc_x2;
            i++;
        }
        return (max + 1);
    }
}

259
kernel/zarch/izamin.c Normal file
View File

@ -0,0 +1,259 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute value of a double. */
#define ABS fabs
/* |re| + |im| of the complex value whose real part is x[i] and whose imaginary
   part is x[i+1]. Arguments and result are parenthesised so the macro can be
   used safely inside larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1]))
/*
 * Vector kernel for the index of the complex element with the smallest
 * |re| + |im|.
 *
 * Scans interleaved (re, im) doubles starting at x until the read pointer
 * reaches x + n*16 bytes. Each loop pass consumes 256 bytes, i.e. 16 complex
 * values (the index vectors cover 0..15 and the base index steps by 16),
 * despite the "_8" in the name. The loop body runs before the bound is
 * tested, so n has to be a positive multiple of 16 for the reads to stay
 * within n elements.
 *
 * The running minimum is seeded with |x[0]| + |x[1]|, the magnitude of the
 * first element. On exit the winning magnitude is stored through minf and
 * its 0-based element index is left in r2.
 *
 * NOTE(review): there is no C return statement; the result is handed back by
 * writing r2 directly (presumably the ABI return register), and the
 * equal-lanes path leaves the function with "br %r14" from inside the asm.
 * NOTE(review): operand %1 (x) is advanced with "la" although it is declared
 * as an input-only operand.
 */
static BLASLONG __attribute__((noinline)) ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
__asm__(
"pfd 1, 0(%1) \n\t"
/* v16..v23 = element indices {0,1} {2,3} ... {14,15} */
"VLEIG %%v16,0,0 \n\t"
"VLEIG %%v16,1,1 \n\t"
"VLEIG %%v17,2,0 \n\t"
"VLEIG %%v17,3,1 \n\t"
"VLEIG %%v18,4,0 \n\t"
"VLEIG %%v18,5,1 \n\t"
"VLEIG %%v19,6,0 \n\t"
"VLEIG %%v19,7,1 \n\t"
"VLEIG %%v20,8,0 \n\t"
"VLEIG %%v20,9,1 \n\t"
"VLEIG %%v21,10,0 \n\t"
"VLEIG %%v21,11,1 \n\t"
"VLEIG %%v22,12,0 \n\t"
"VLEIG %%v22,13,1 \n\t"
"VLEIG %%v23,14,0 \n\t"
"VLEIG %%v23,15,1 \n\t"
/* f6 = |x[0]| + |x[1]| : magnitude of the first element, used as seed */
"ld %%f6,0(%1) \n\t"
"lpdbr %%f6,%%f6 \n\t"
"ld %%f7,8(%1) \n\t"
"lpdbr %%f7,%%f7 \n\t"
"adbr %%f6,%%f7 \n\t"
/* r0 = x + n*16 : end-of-data address */
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%1 \n\t"
/* v6 = running minimum (seed replicated to both lanes), v7 = its index,
   v4 = {16,16} index step, v5 = running base index */
"vrepg %%v6,%%v6,0 \n\t"
"vzero %%v7 \n\t"
"VREPIG %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%1 ) \n\t"
/* de-interleave elements 0..7: real parts into v24/v26/v28/v30,
   imaginary parts into v25/v27/v29/v31 */
"vleg %%v24 , 0( %1),0 \n\t"
"vleg %%v25 , 8( %1),0 \n\t"
"vleg %%v24 , 16( %1),1 \n\t"
"vleg %%v25 , 24( %1),1 \n\t"
"vleg %%v26 , 32( %1),0 \n\t"
"vleg %%v27 , 40( %1),0 \n\t"
"vleg %%v26 , 48( %1),1 \n\t"
"vleg %%v27 , 56( %1),1 \n\t"
"vleg %%v28 , 64( %1),0 \n\t"
"vleg %%v29 , 72( %1),0 \n\t"
"vleg %%v28 , 80( %1),1 \n\t"
"vleg %%v29 , 88( %1),1 \n\t"
"vleg %%v30 , 96( %1),0 \n\t"
"vleg %%v31 ,104( %1),0 \n\t"
"vleg %%v30 ,112( %1),1 \n\t"
"vleg %%v31 ,120( %1),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
/* v0..v3 = |re| + |im| for elements 0..7 */
"vfadb %%v0,%%v24,%%v25 \n\t"
"vfadb %%v1,%%v26,%%v27 \n\t"
"vfadb %%v2,%%v28,%%v29 \n\t"
"vfadb %%v3,%%v30,%%v31 \n\t"
/* same for elements 8..15 */
"vleg %%v24 ,128( %1),0 \n\t"
"vleg %%v25 ,136( %1),0 \n\t"
"vleg %%v24 ,144( %1),1 \n\t"
"vleg %%v25 ,152( %1),1 \n\t"
"vleg %%v26 ,160( %1),0 \n\t"
"vleg %%v27 ,168( %1),0 \n\t"
"vleg %%v26 ,176( %1),1 \n\t"
"vleg %%v27 ,184( %1),1 \n\t"
"vleg %%v28 ,192( %1),0 \n\t"
"vleg %%v29 ,200( %1),0 \n\t"
"vleg %%v28 ,208( %1),1 \n\t"
"vleg %%v29 ,216( %1),1 \n\t"
"vleg %%v30 ,224( %1),0 \n\t"
"vleg %%v31 ,232( %1),0 \n\t"
"vleg %%v30 ,240( %1),1 \n\t"
"vleg %%v31 ,248( %1),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
/* v24/v26/v28/v30 = |re| + |im| for elements 8..15 */
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
/* reduction tree over the eight magnitude vectors: at every step keep the
   smaller value together with its element index */
"vfchdb %%v25,%%v0 ,%%v1 \n\t"
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
"vfchdb %%v27,%%v2,%%v3 \n\t"
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
"vfchdb %%v25,%%v24,%%v26 \n\t"
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
"vfchdb %%v27,%%v28,%%v30 \n\t"
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
"vfchdb %%v24,%%v31, %%v1 \n\t"
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
"vfchdb %%v30,%%v3, %%v27 \n\t"
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
/* advance the read pointer to the next 256-byte block */
"la %1,256(%1) \n\t"
"vfchdb %%v0,%%v28, %%v31 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
/* make the block-local index absolute */
"VAG %%v25,%%v25,%%v5 \n\t"
/* compare the block winner with the running minimum */
"vfchdb %%v30,%%v6 , %%v27 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t"
/* loop while the read pointer is below the end address */
"clgrjl %1,%%r0,1b \n\t"
/* extract the final index from the two lanes of v6 / v7 */
"vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t"
/* lanes hold equal values: store the value, return the smaller index */
"VSTEG %%v6,0(%2),0 \n\t"
"VMNLG %%v1,%%v5,%%v7 \n\t"
"VLGVG %%r2,%%v1,0 \n\t"
"br %%r14 \n\t"
"2: \n\t"
/* lanes differ: pick the lane with the smaller value and its index */
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"VLGVG %%r2,%%v1,0 \n\t"
"std %%f0,0(%2) \n\t"
:
: "r"(n), "a"(x), "a"(minf)
: "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * IZAMIN entry point: 1-based position of a complex element of x with the
 * smallest |re| + |im|, scanning n elements with stride inc_x (in complex
 * elements; x holds interleaved re/im pairs).
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i=0;       /* logical element index */
    BLASLONG ix=0;      /* offset of the current real part inside x */
    FLOAT minf;
    BLASLONG min=0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(min);

    if (inc_x == 1) {
        /* The vector kernel consumes 256 bytes = 16 complex values per loop
           pass, so only a multiple of 16 may be handed to it; a multiple of
           8 would let its last pass read beyond element n. */
        BLASLONG n1 = n & -16;
        if (n1 > 0) {
            min = ziamin_kernel_8_TUNED(n1, x, &minf);
            i = n1;
            /* continue after the part the kernel already covered instead of
               rescanning from x[0] */
            ix = 2 * n1;
        } else {
            /* no kernel call: seed the minimum from the first element so
               minf is never read uninitialised */
            minf = CABS1(x,0);
            ix = 2;
            i = 1;
        }
        while(i < n)
        {
            if( CABS1(x,ix) < minf )
            {
                min = i;
                minf = CABS1(x,ix);
            }
            ix += 2;
            i++;
        }
        return (min + 1);
    } else {
        inc_x2 = 2 * inc_x;
        /* seed with the first element, then scan the rest */
        minf = CABS1(x,0);
        ix += inc_x2;
        i++;
        while(i < n)
        {
            if( CABS1(x,ix) < minf )
            {
                min = i;
                minf = CABS1(x,ix);
            }
            ix += inc_x2;
            i++;
        }
        return (min + 1);
    }
}

156
kernel/zarch/zasum.c Normal file
View File

@ -0,0 +1,156 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine matching the build precision:
   fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel summing the absolute values of all doubles in the range
 * [x, x + n*16 bytes), i.e. the real and imaginary parts of n complex values.
 *
 * Each loop pass consumes 256 bytes (16 complex values); the loop body runs
 * before the bound is tested, so n is expected to be a positive multiple
 * of 16. Four independent vector accumulators (v0, v1, v22, v23) are used
 * and combined after the loop.
 *
 * NOTE(review): there is no C return statement; the sum is left in f0
 * (presumably the ABI floating-point return register) by the final adbr.
 * NOTE(review): operand %1 (x) is advanced with "la" although it is declared
 * as an input-only operand.
 */
static FLOAT __attribute__ ((noinline)) zasum_kernel_16(BLASLONG n, FLOAT *x) {
__asm__ (
"pfd 1, 0(%1) \n\t"
/* r0 = x + n*16 : end-of-data address */
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%1 \n\t"
/* clear the four partial-sum accumulators */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v22 \n\t"
"vzero %%v23 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%1 ) \n\t"
/* first 128 bytes: load, take absolute values, accumulate */
"vlm %%v24,%%v31,0(%1) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t"
"vfadb %%v22,%%v22,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v23,%%v23,%%v30 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t"
/* second 128 bytes: same sequence */
"vlm %%v24,%%v31, 128(%1 ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
/* advance the read pointer to the next 256-byte block */
"la %1,256(%1) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t"
"vfadb %%v22,%%v22,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v23,%%v23,%%v30 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t"
/* loop while the read pointer is below the end address */
"clgrjl %1,%%r0,1b \n\t"
/* combine the four accumulators, then add the two lanes into f0 */
"vfadb %%v24,%%v0,%%v1 \n\t"
"vfadb %%v25,%%v23,%%v22 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
:
: "r"(n), "a"(x)
: "cc", "memory","r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * ZASUM entry point: sum of |re| + |im| over n complex elements of x taken
 * with stride inc_x (in complex elements; x holds interleaved re/im pairs).
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the largest multiple-of-16 prefix is summed by
 * zasum_kernel_16 and the rest is added in a scalar loop; for any other
 * stride the whole vector is summed by the scalar loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG done = 0;      /* complex elements already accounted for */
    BLASLONG off = 0;       /* offset of the next real part inside x */
    BLASLONG step;          /* distance between consecutive real parts */

    if (n <= 0 || inc_x <= 0) {
        return(total);
    }

    if (inc_x == 1) {
        BLASLONG bulk = n & -16;

        step = 2;
        if (bulk > 0) {
            total = zasum_kernel_16(bulk, x);
            done = bulk;
            off = 2 * bulk;
        }
    } else {
        step = 2 * inc_x;
    }

    /* scalar pass over whatever the kernel did not cover */
    for (; done < n; done++) {
        total += ABS(x[off]) + ABS(x[off + 1]);
        off += step;
    }
    return(total);
}

207
kernel/zarch/zaxpy.c Normal file
View File

@ -0,0 +1,207 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector core of zaxpy for unit-stride data: y += alpha * x, or
 * y += alpha * conj(x) when CONJ is defined, over n complex elements.
 *
 *   n      element count; must be a positive multiple of 8 (each loop pass
 *          handles 128 bytes of x and of y, and the loop is bottom-tested)
 *   x, y   interleaved (real, imag) data
 *   alpha  pointer to { real, imag } of the scalar
 *
 * The loop counter lives in r0 (declared as a clobber) so that none of the
 * input-only operands is written by the asm.
 */
static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
    __asm__ volatile (
        "pfd 1, 0(%1) \n\t"
        "pfd 2, 0(%2) \n\t"
        /* v28 = { alpha_r, alpha_r }, v29 = { alpha_i, alpha_i } */
        "vlrepg %%v28 , 0(%3) \n\t"
        "vlrepg %%v29, 8(%3) \n\t"
        /* r0 = n / 8 loop passes, r1 = running byte offset */
        "srlg %%r0,%0,3 \n\t"
        "xgr %%r1,%%r1 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 1, 256(%%r1,%1) \n\t"
        "pfd 2, 256(%%r1,%2) \n\t"
        /* gather y: real parts into v16/v18, imaginary parts into v17/v19 */
        "vleg %%v16 , 0(%%r1,%2),0 \n\t"
        "vleg %%v17 , 8(%%r1,%2),0 \n\t"
        "vleg %%v16 , 16(%%r1,%2),1 \n\t"
        "vleg %%v17 , 24(%%r1,%2),1 \n\t"
        "vleg %%v18 , 32(%%r1,%2),0 \n\t"
        "vleg %%v19 , 40(%%r1,%2),0 \n\t"
        "vleg %%v18 , 48(%%r1,%2),1 \n\t"
        "vleg %%v19 , 56(%%r1,%2),1 \n\t"
        /* gather x: real parts into v24/v26, imaginary parts into v25/v27 */
        "vleg %%v24 , 0(%%r1,%1),0 \n\t"
        "vleg %%v25 , 8(%%r1,%1),0 \n\t"
        "vleg %%v24 , 16(%%r1,%1),1 \n\t"
        "vleg %%v25 , 24(%%r1,%1),1 \n\t"
        "vleg %%v26 , 32(%%r1,%1),0 \n\t"
        "vleg %%v27 , 40(%%r1,%1),0 \n\t"
        "vleg %%v26 , 48(%%r1,%1),1 \n\t"
        "vleg %%v27 , 56(%%r1,%1),1 \n\t"
#if !defined(CONJ)
        /* y_r = x_r*a_r - (x_i*a_i - y_r);  y_i = y_i + x_r*a_i + x_i*a_r */
        "vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t"
        "vfmadb %%v17, %%v24, %%v29, %%v17 \n\t"
        "vfmsdb %%v18, %%v27, %%v29, %%v18 \n\t"
        "vfmadb %%v19, %%v26, %%v29, %%v19 \n\t"
        "vfmsdb %%v16, %%v24, %%v28 ,%%v16 \n\t"
        "vfmadb %%v17, %%v25, %%v28, %%v17 \n\t"
        "vfmsdb %%v18, %%v26, %%v28, %%v18 \n\t"
        "vfmadb %%v19, %%v27, %%v28, %%v19 \n\t"
#else
        /* y_r = y_r + x_i*a_i + x_r*a_r;  y_i = x_r*a_i - (x_i*a_r - y_i) */
        "vfmadb %%v16, %%v25, %%v29, %%v16 \n\t"
        "vfmsdb %%v17, %%v25, %%v28, %%v17 \n\t"
        "vfmadb %%v18, %%v27, %%v29, %%v18 \n\t"
        "vfmsdb %%v19, %%v27, %%v28, %%v19 \n\t"
        "vfmadb %%v16, %%v24, %%v28, %%v16 \n\t"
        "vfmsdb %%v17, %%v24, %%v29, %%v17 \n\t"
        "vfmadb %%v18, %%v26, %%v28, %%v18 \n\t"
        "vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t"
#endif
        /* scatter the first four results back into y */
        "vsteg %%v16 , 0(%%r1,%2),0 \n\t"
        "vsteg %%v17 , 8(%%r1,%2),0 \n\t"
        "vsteg %%v16 , 16(%%r1,%2),1 \n\t"
        "vsteg %%v17 , 24(%%r1,%2),1 \n\t"
        "vsteg %%v18 , 32(%%r1,%2),0 \n\t"
        "vsteg %%v19 , 40(%%r1,%2),0 \n\t"
        "vsteg %%v18 , 48(%%r1,%2),1 \n\t"
        "vsteg %%v19 , 56(%%r1,%2),1 \n\t"
        /* second group of four elements, same scheme with v20..v23 for y */
        "vleg %%v20 , 64(%%r1,%2),0 \n\t"
        "vleg %%v21 , 72(%%r1,%2),0 \n\t"
        "vleg %%v20 , 80(%%r1,%2),1 \n\t"
        "vleg %%v21 , 88(%%r1,%2),1 \n\t"
        "vleg %%v22 , 96(%%r1,%2),0 \n\t"
        "vleg %%v23 , 104(%%r1,%2),0 \n\t"
        "vleg %%v22 , 112(%%r1,%2),1 \n\t"
        "vleg %%v23 , 120(%%r1,%2),1 \n\t"
        "vleg %%v24 , 64(%%r1,%1),0 \n\t"
        "vleg %%v25 , 72(%%r1,%1),0 \n\t"
        "vleg %%v24 , 80(%%r1,%1),1 \n\t"
        "vleg %%v25 , 88(%%r1,%1),1 \n\t"
        "vleg %%v26 , 96(%%r1,%1),0 \n\t"
        "vleg %%v27 , 104(%%r1,%1),0 \n\t"
        "vleg %%v26 , 112(%%r1,%1),1 \n\t"
        "vleg %%v27 , 120(%%r1,%1),1 \n\t"
#if !defined(CONJ)
        "vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t"
        "vfmadb %%v21, %%v24, %%v29, %%v21 \n\t"
        "vfmsdb %%v22, %%v27, %%v29, %%v22 \n\t"
        "vfmadb %%v23, %%v26, %%v29, %%v23 \n\t"
        "vfmsdb %%v20, %%v24, %%v28 ,%%v20 \n\t"
        "vfmadb %%v21, %%v25, %%v28, %%v21 \n\t"
        "vfmsdb %%v22, %%v26, %%v28, %%v22 \n\t"
        "vfmadb %%v23, %%v27, %%v28, %%v23 \n\t"
#else
        "vfmadb %%v20, %%v25, %%v29, %%v20 \n\t"
        "vfmsdb %%v21, %%v25, %%v28, %%v21 \n\t"
        "vfmadb %%v22, %%v27, %%v29, %%v22 \n\t"
        "vfmsdb %%v23, %%v27, %%v28, %%v23 \n\t"
        "vfmadb %%v20, %%v24, %%v28, %%v20 \n\t"
        "vfmsdb %%v21, %%v24, %%v29, %%v21 \n\t"
        "vfmadb %%v22, %%v26, %%v28, %%v22 \n\t"
        "vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t"
#endif
        "vsteg %%v20 , 64(%%r1,%2),0 \n\t"
        "vsteg %%v21 , 72(%%r1,%2),0 \n\t"
        "vsteg %%v20 , 80(%%r1,%2),1 \n\t"
        "vsteg %%v21 , 88(%%r1,%2),1 \n\t"
        "vsteg %%v22 , 96(%%r1,%2),0 \n\t"
        "vsteg %%v23 , 104(%%r1,%2),0 \n\t"
        "vsteg %%v22 , 112(%%r1,%2),1 \n\t"
        "vsteg %%v23 , 120(%%r1,%2),1 \n\t"
        /* advance by 128 bytes and count the pass down in r0 */
        "la %%r1,128(%%r1) \n\t"
        "brctg %%r0,1b"
        :
        : "r"(n), "a"(x), "a"(y), "a"(alpha)
        : "cc", "memory", "r0", "r1","v16",
          "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29"
    );
}
/*
 * zaxpy entry point: y += alpha * x (y += alpha * conj(x) when CONJ is
 * defined) over n complex elements, with alpha = da_r + i*da_i.
 *
 * inc_x / inc_y are strides in complex elements.  When both are 1 the
 * largest multiple-of-8 prefix goes through zaxpy_kernel_8 and the scalar
 * loop finishes the remainder; otherwise the scalar loop does everything.
 * dummy0, dummy1, dummy and dummy2 are not used.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG done = 0;          /* elements already processed          */
    BLASLONG xpos, ypos;        /* FLOAT indices of the current element */
    BLASLONG xstep, ystep;      /* strides expressed in FLOATs          */

    if (n <= 0)
        return (0);

    if ((inc_x == 1) && (inc_y == 1)) {
        BLASLONG bulk = n & -8;

        if (bulk) {
            FLOAT scalar[2];

            scalar[0] = da_r;
            scalar[1] = da_i;
            zaxpy_kernel_8(bulk, x, y, scalar);
        }
        done = bulk;
    }

    /* done is non-zero only on the unit-stride path, where both steps are 2 */
    xstep = 2 * inc_x;
    ystep = 2 * inc_y;
    xpos = 2 * done;
    ypos = 2 * done;

    for (; done < n; done++) {
#if !defined(CONJ)
        y[ypos] += (da_r * x[xpos] - da_i * x[xpos + 1]);
        y[ypos + 1] += (da_r * x[xpos + 1] + da_i * x[xpos]);
#else
        y[ypos] += (da_r * x[xpos] + da_i * x[xpos + 1]);
        y[ypos + 1] -= (da_r * x[xpos + 1] - da_i * x[xpos]);
#endif
        xpos += xstep;
        ypos += ystep;
    }

    return (0);
}

145
kernel/zarch/zcopy.c Normal file
View File

@ -0,0 +1,145 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector core of zcopy for unit-stride data: copies 16-byte vectors from x
 * to y, 256 bytes per loop pass.
 *
 * The pass count is n >> 4, i.e. one pass per 16 complex elements (assuming
 * FLOAT is an 8-byte double, as elsewhere in this z* kernel).  The loop is
 * bottom-tested with a decrement-and-branch, so n must be a non-zero multiple
 * of 16 -- the caller below guarantees that.
 */
static void __attribute__ ((noinline)) zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
/* prefetch the start of the source (code 1) and destination (code 2) */
"pfd 1, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
/* r0 = n / 16 loop passes, r1 = running byte offset (starts at 0) */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch 256 bytes ahead of the current position */
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
/* sixteen load/store pairs of 16 bytes each: offsets 0..240 */
"vl %%v24, 0(%%r1,%1) \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v28, 64(%%r1,%1) \n\t"
"vst %%v28, 64(%%r1,%2) \n\t"
"vl %%v29, 80(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%2) \n\t"
"vl %%v30, 96(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%1) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%1) \n\t"
"vst %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%1) \n\t"
"vst %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%1) \n\t"
"vst %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%1) \n\t"
"vst %%v27,176(%%r1,%2) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
/* advance the offset by 256 bytes; decrement r0 and loop while non-zero */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
: "cc", "memory","r0","r1","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
/*
 * zcopy entry point: copies n complex elements from x to y.
 *
 * inc_x / inc_y are strides in complex elements.  When both are 1 the
 * largest multiple-of-16 prefix is copied by zcopy_kernel_16 and the scalar
 * loop finishes the remainder; otherwise every element is copied by the
 * strided scalar loop.  Returns 0; nothing is done when n <= 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;

    if (n <= 0)
        return (0);

    if ((inc_x == 1) && (inc_y == 1)) {
        BLASLONG n1 = n & -16;

        if (n1 > 0) {
            zcopy_kernel_16(n1, x, y);
            i = n1;
            ix = n1 * 2;
            iy = n1 * 2;
        }

        while (i < n) {
            /* source is indexed with ix and destination with iy throughout */
            y[iy] = x[ix];
            y[iy + 1] = x[ix + 1];
            ix += 2;
            iy += 2;
            i++;
        }
    } else {
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;

        while (i < n) {
            y[iy] = x[ix];
            y[iy + 1] = x[ix + 1];
            ix += inc_x2;
            iy += inc_y2;
            i++;
        }
    }

    return (0);
}

216
kernel/zarch/zdot.c Normal file
View File

@ -0,0 +1,216 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector core of the complex dot product for unit-stride data.
 *
 * Writes four partial sums over the processed elements to d (d is
 * overwritten, not accumulated into):
 *   d[0] = sum x_re*y_re      d[1] = sum x_im*y_im
 *   d[2] = sum x_re*y_im      d[3] = sum x_im*y_re
 * The caller combines them into the plain or conjugated result.
 *
 * The pass count is n >> 3 and each pass reads 128 bytes from x and from y
 * (8 complex elements, assuming 8-byte FLOAT).  The loop is bottom-tested
 * with a decrement-and-branch, so n must be a non-zero multiple of 8.
 */
static void __attribute__ ((noinline)) zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
__asm__ volatile(
"pfd 1, 0(%1) \n\t"
"pfd 1, 0(%2) \n\t"
/* accumulators: v24/v26 take x*y, v25/v27 take swapped(x)*y */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = n / 8 loop passes, r1 = running byte offset */
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 1, 256(%%r1,%2) \n\t"
/* four elements of x into v16..v19 and of y into v28..v31, {re, im} each */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v28, 0(%%r1,%2) \n\t"
"vl %%v29, 16(%%r1,%2) \n\t"
"vl %%v30, 32(%%r1,%2) \n\t"
"vl %%v31, 48(%%r1,%2) \n\t"
/* v20..v23 = x with its two doublewords exchanged: {im, re} */
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
/* x*y adds {re*re, im*im}; swapped(x)*y adds {im*re, re*im} */
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
/* same again for the next four elements (byte offsets 64..127) */
"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v28, 64(%%r1,%2) \n\t"
"vl %%v29, 80(%%r1,%2) \n\t"
"vl %%v30, 96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
/* advance by 128 bytes; decrement r0 and loop while non-zero */
"la %%r1,128(%%r1) \n\t"
"brctg %%r0,1b \n\t"
/* merge the accumulator pairs */
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t"
/* v24 = {re*re, im*im} -> d[0], d[1]; v25 = {im*re, re*im} -> d[3], d[2] */
"vsteg %%v24,0(%3),0 \n\t"
"vsteg %%v24,8(%3),1 \n\t"
"vsteg %%v25,16(%3),1 \n\t"
"vsteg %%v25,24(%3),0 \n\t"
:
: "r"(n), "a"(x), "a"(y), "a"(d)
: "cc", "memory","r0","r1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Plain-C counterpart of the vector dot kernel.
 *
 * Walks x and y four complex elements at a time (so ceil(n/4)*4 elements are
 * read) and stores four partial sums to d:
 *   d[0] = sum x_re*y_re      d[1] = sum x_im*y_im
 *   d[2] = sum x_re*y_im      d[3] = sum x_im*y_re
 * d is overwritten, not accumulated into.
 */
static __attribute__ ((noinline)) void zdot_kernel_8n(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
    FLOAT acc[4] = {0.0, 0.0, 0.0, 0.0};
    BLASLONG done;          /* complex elements consumed so far      */
    BLASLONG base = 0;      /* FLOAT index of the current group of 4 */
    BLASLONG k;

    for (done = 0; done < n; done += 4) {
        /* k steps over the real-part offsets 0, 2, 4, 6 inside the group */
        for (k = 0; k < 8; k += 2) {
            acc[0] += x[base + k] * y[base + k];
            acc[1] += x[base + k + 1] * y[base + k + 1];
            acc[2] += x[base + k] * y[base + k + 1];
            acc[3] += x[base + k + 1] * y[base + k];
        }
        base += 8;
    }

    for (k = 0; k < 4; k++)
        d[k] = acc[k];
}
/*
 * zdot entry point: complex dot product of n elements of x and y.
 *
 * Four real partial sums are collected in dot[]:
 *   dot[0] = sum x_re*y_re      dot[1] = sum x_im*y_im
 *   dot[2] = sum x_re*y_im      dot[3] = sum x_im*y_re
 * and combined at the end: (dot[0]-dot[1]) + i(dot[2]+dot[3]) normally,
 * (dot[0]+dot[1]) + i(dot[2]-dot[3]) when CONJ is defined.
 *
 * inc_x / inc_y are strides in complex elements.  Returns 0 + 0i when
 * n <= 0.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG i;
    BLASLONG ix, iy;
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);
    }

    if ((inc_x == 1) && (inc_y == 1)) {
        /*
         * zdot_kernel_8 consumes 8 elements per loop pass, so hand it the
         * largest multiple-of-8 prefix; the scalar loop then has at most
         * seven elements left.
         */
        BLASLONG n1 = n & -8;

        if (n1)
            zdot_kernel_8(n1, x, y, dot);   /* overwrites dot[0..3] */

        i = n1;
        BLASLONG j = i * 2;

        while (i < n) {
            dot[0] += x[j] * y[j];
            dot[1] += x[j + 1] * y[j + 1];
            dot[2] += x[j] * y[j + 1];
            dot[3] += x[j + 1] * y[j];
            j += 2;
            i++;
        }
    } else {
        i = 0;
        ix = 0;
        iy = 0;
        /* convert the strides from complex elements to FLOATs */
        inc_x <<= 1;
        inc_y <<= 1;

        while (i < n) {
            dot[0] += x[ix] * y[iy];
            dot[1] += x[ix + 1] * y[iy + 1];
            dot[2] += x[ix] * y[iy + 1];
            dot[3] += x[ix + 1] * y[iy];
            ix += inc_x;
            iy += inc_y;
            i++;
        }
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];
#endif

    return (result);
}

919
kernel/zarch/zgemv_n_4.c Normal file
View File

@ -0,0 +1,919 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#define HAVE_KERNEL_ADDY 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
/**
* If IGNORE_TEMP_PERM is defined, ybuffer is stored and used in split form,
* as vector pairs {real,real} {imag,imag}.
* If it is not defined, ybuffer is read and written in the normal
* interleaved {real,imag} layout.
*/
#if (defined(HAVE_KERNEL_4x4_VEC_ASM) || defined(HAVE_KERNEL_4x4_VEC) ) && defined(HAVE_KERNEL_4x2_VEC) && defined(HAVE_KERNEL_4x1_VEC) && defined(HAVE_KERNEL_ADDY)
// #define IGNORE_TEMP_PERM 1
#endif
#define NBMAX 1024
#ifdef HAVE_KERNEL_4x4_VEC_ASM
#elif HAVE_KERNEL_4x4_VEC
/*
 * Vector 4-column update: for each of the n complex rows,
 *   y[row] += sum over c = 0..3 of a_c[row] * x[c]
 * where a_c = ap[c] and x holds four interleaved complex values.  When exactly
 * one of CONJ / XCONJ is defined, conj(a_c[row]) * x[c] is added instead.
 *
 * Two consecutive rows are handled per pass (one per vector lane), so n is
 * expected to be even.  With IGNORE_TEMP_PERM, y is read and written as
 * {re,re} {im,im} vector pairs instead of interleaved scalars.
 */
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];
    FLOAT *col2 = ap[2];
    FLOAT *col3 = ap[3];

    /* every x component broadcast into both lanes */
    __vector double x0_re = {x[0], x[0]};
    __vector double x0_im = {x[1], x[1]};
    __vector double x1_re = {x[2], x[2]};
    __vector double x1_im = {x[3], x[3]};
    __vector double x2_re = {x[4], x[4]};
    __vector double x2_im = {x[5], x[5]};
    __vector double x3_re = {x[6], x[6]};
    __vector double x3_im = {x[7], x[7]};
#ifdef IGNORE_TEMP_PERM
    __vector double *y_vec = (__vector double *)y;
    BLASLONG slot = 0;
#endif
    const BLASLONG len = 2 * n;   /* number of FLOATs per column */
    BLASLONG k = 0;

    while (k < len) {
#ifdef IGNORE_TEMP_PERM
        __vector double acc_re = y_vec[slot];
        __vector double acc_im = y_vec[slot + 1];
#else
        __vector double acc_re = {y[k], y[k + 2]};
        __vector double acc_im = {y[k + 1], y[k + 3]};
#endif
        /* de-interleave two rows of each column into re / im vectors */
        __vector double a0_re = {col0[k], col0[k + 2]};
        __vector double a0_im = {col0[k + 1], col0[k + 3]};
        __vector double a1_re = {col1[k], col1[k + 2]};
        __vector double a1_im = {col1[k + 1], col1[k + 3]};
        __vector double a2_re = {col2[k], col2[k + 2]};
        __vector double a2_im = {col2[k + 1], col2[k + 3]};
        __vector double a3_re = {col3[k], col3[k + 2]};
        __vector double a3_im = {col3[k + 1], col3[k + 3]};

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        acc_re = a0_re * x0_re - (a0_im * x0_im - acc_re);
        acc_im = acc_im + a0_re * x0_im + a0_im * x0_re;
        acc_re = a1_re * x1_re - (a1_im * x1_im - acc_re);
        acc_im = acc_im + a1_re * x1_im + a1_im * x1_re;
        acc_re = a2_re * x2_re - (a2_im * x2_im - acc_re);
        acc_im = acc_im + a2_re * x2_im + a2_im * x2_re;
        acc_re = a3_re * x3_re - (a3_im * x3_im - acc_re);
        acc_im = acc_im + a3_re * x3_im + a3_im * x3_re;
#else
        acc_re = acc_re + a0_re * x0_re + a0_im * x0_im;
        acc_im = a0_re * x0_im - (a0_im * x0_re - acc_im);
        acc_re = acc_re + a1_re * x1_re + a1_im * x1_im;
        acc_im = a1_re * x1_im - (a1_im * x1_re - acc_im);
        acc_re = acc_re + a2_re * x2_re + a2_im * x2_im;
        acc_im = a2_re * x2_im - (a2_im * x2_re - acc_im);
        acc_re = acc_re + a3_re * x3_re + a3_im * x3_im;
        acc_im = a3_re * x3_im - (a3_im * x3_re - acc_im);
#endif

#ifdef IGNORE_TEMP_PERM
        y_vec[slot] = acc_re;
        y_vec[slot + 1] = acc_im;
        slot += 2;
#else
        y[k] = acc_re[0];
        y[k + 1] = acc_im[0];
        y[k + 2] = acc_re[1];
        y[k + 3] = acc_im[1];
#endif
        k += 4;
    }
}
#else
/*
 * Scalar 4-column update: for each of the n complex rows,
 *   y[row] += sum over c = 0..3 of a_c[row] * x[c]
 * where a_c = ap[c] and x holds four interleaved complex values.  When exactly
 * one of CONJ / XCONJ is defined, conj(a_c[row]) * x[c] is added instead.
 */
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *col[4];
    const BLASLONG len = 2 * n;   /* number of FLOATs per column */
    BLASLONG pos, c;

    for (c = 0; c < 4; c++)
        col[c] = ap[c];

    for (pos = 0; pos < len; pos += 2) {
        /* columns are applied in order 0..3, real part before imaginary */
        for (c = 0; c < 4; c++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            y[pos] += col[c][pos]*x[2*c] - col[c][pos+1] * x[2*c+1];
            y[pos+1] += col[c][pos]*x[2*c+1] + col[c][pos+1] * x[2*c];
#else
            y[pos] += col[c][pos]*x[2*c] + col[c][pos+1] * x[2*c+1];
            y[pos+1] += col[c][pos]*x[2*c+1] - col[c][pos+1] * x[2*c];
#endif
        }
    }
}
#endif
#ifdef HAVE_KERNEL_4x2_VEC
/*
 * Vector 2-column update: for each of the n complex rows,
 *   y[row] += a_0[row] * x[0] + a_1[row] * x[1]
 * where a_c = ap[c] and x holds two interleaved complex values.  When exactly
 * one of CONJ / XCONJ is defined, conj(a_c[row]) * x[c] is added instead.
 *
 * Two consecutive rows are handled per pass (one per vector lane), so n is
 * expected to be even.  With IGNORE_TEMP_PERM, y is read and written as
 * {re,re} {im,im} vector pairs instead of interleaved scalars.
 */
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];

    /* every x component broadcast into both lanes */
    __vector double x0_re = {x[0], x[0]};
    __vector double x0_im = {x[1], x[1]};
    __vector double x1_re = {x[2], x[2]};
    __vector double x1_im = {x[3], x[3]};
#ifdef IGNORE_TEMP_PERM
    __vector double *y_vec = (__vector double *)y;
    BLASLONG slot = 0;
#endif
    const BLASLONG len = 2 * n;   /* number of FLOATs per column */
    BLASLONG k = 0;

    while (k < len) {
#ifdef IGNORE_TEMP_PERM
        __vector double acc_re = y_vec[slot];
        __vector double acc_im = y_vec[slot + 1];
#else
        __vector double acc_re = {y[k], y[k + 2]};
        __vector double acc_im = {y[k + 1], y[k + 3]};
#endif
        /* de-interleave two rows of each column into re / im vectors */
        __vector double a0_re = {col0[k], col0[k + 2]};
        __vector double a0_im = {col0[k + 1], col0[k + 3]};
        __vector double a1_re = {col1[k], col1[k + 2]};
        __vector double a1_im = {col1[k + 1], col1[k + 3]};

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        acc_re = a0_re * x0_re - (a0_im * x0_im - acc_re);
        acc_im = acc_im + a0_re * x0_im + a0_im * x0_re;
        acc_re = a1_re * x1_re - (a1_im * x1_im - acc_re);
        acc_im = acc_im + a1_re * x1_im + a1_im * x1_re;
#else
        acc_re = acc_re + a0_re * x0_re + a0_im * x0_im;
        acc_im = a0_re * x0_im - (a0_im * x0_re - acc_im);
        acc_re = acc_re + a1_re * x1_re + a1_im * x1_im;
        acc_im = a1_re * x1_im - (a1_im * x1_re - acc_im);
#endif

#ifdef IGNORE_TEMP_PERM
        y_vec[slot] = acc_re;
        y_vec[slot + 1] = acc_im;
        slot += 2;
#else
        y[k] = acc_re[0];
        y[k + 1] = acc_im[0];
        y[k + 2] = acc_re[1];
        y[k + 3] = acc_im[1];
#endif
        k += 4;
    }
}
#else
/*
 * Scalar 2-column update: for each of the n complex rows,
 *   y[row] += a_0[row] * x[0] + a_1[row] * x[1]
 * where a_c = ap[c] and x holds two interleaved complex values.  When exactly
 * one of CONJ / XCONJ is defined, conj(a_c[row]) * x[c] is added instead.
 */
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    FLOAT *col[2];
    const BLASLONG len = 2 * n;   /* number of FLOATs per column */
    BLASLONG pos, c;

    col[0] = ap[0];
    col[1] = ap[1];

    for (pos = 0; pos < len; pos += 2) {
        /* column 0 first, then column 1; real part before imaginary */
        for (c = 0; c < 2; c++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            y[pos] += col[c][pos]*x[2*c] - col[c][pos+1] * x[2*c+1];
            y[pos+1] += col[c][pos]*x[2*c+1] + col[c][pos+1] * x[2*c];
#else
            y[pos] += col[c][pos]*x[2*c] + col[c][pos+1] * x[2*c+1];
            y[pos+1] += col[c][pos]*x[2*c+1] - col[c][pos+1] * x[2*c];
#endif
        }
    }
}
#endif
#ifdef HAVE_KERNEL_4x1_VEC
/*
 * Vector single-column update: for each of the n complex rows,
 *   y[row] += a[row] * x
 * where a = ap and x is one complex value (x[0] real, x[1] imaginary).  When
 * exactly one of CONJ / XCONJ is defined, conj(a[row]) * x is added instead.
 *
 * Two consecutive rows are handled per pass (one per vector lane), so n is
 * expected to be even.  With IGNORE_TEMP_PERM, y is read and written as
 * {re,re} {im,im} vector pairs instead of interleaved scalars.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
    FLOAT *col = ap;

    /* x broadcast into both lanes */
    __vector double x_re = {x[0], x[0]};
    __vector double x_im = {x[1], x[1]};
#ifdef IGNORE_TEMP_PERM
    __vector double *y_vec = (__vector double *)y;
    BLASLONG slot = 0;
#endif
    const BLASLONG len = 2 * n;   /* number of FLOATs in the column */
    BLASLONG k = 0;

    while (k < len) {
#ifdef IGNORE_TEMP_PERM
        __vector double acc_re = y_vec[slot];
        __vector double acc_im = y_vec[slot + 1];
#else
        __vector double acc_re = {y[k], y[k + 2]};
        __vector double acc_im = {y[k + 1], y[k + 3]};
#endif
        /* de-interleave two rows of the column into re / im vectors */
        __vector double a_re = {col[k], col[k + 2]};
        __vector double a_im = {col[k + 1], col[k + 3]};

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        acc_re = a_re * x_re - (a_im * x_im - acc_re);
        acc_im = acc_im + a_re * x_im + a_im * x_re;
#else
        acc_re = acc_re + a_re * x_re + a_im * x_im;
        acc_im = a_re * x_im - (a_im * x_re - acc_im);
#endif

#ifdef IGNORE_TEMP_PERM
        y_vec[slot] = acc_re;
        y_vec[slot + 1] = acc_im;
        slot += 2;
#else
        y[k] = acc_re[0];
        y[k + 1] = acc_im[0];
        y[k + 2] = acc_re[1];
        y[k + 3] = acc_im[1];
#endif
        k += 4;
    }
}
#else
/*
 * Scalar single-column update: for each of the n complex rows,
 *   y[row] += a[row] * x
 * where a = ap and x is one complex value (x[0] real, x[1] imaginary).  When
 * exactly one of CONJ / XCONJ is defined, conj(a[row]) * x is added instead.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
    const BLASLONG len = 2 * n;   /* number of FLOATs in the column */
    BLASLONG pos = 0;

    while (pos < len) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[pos] += ap[pos]*x[0] - ap[pos+1] * x[1];
        y[pos+1] += ap[pos]*x[1] + ap[pos+1] * x[0];
#else
        y[pos] += ap[pos]*x[0] + ap[pos+1] * x[1];
        y[pos+1] += ap[pos]*x[1] - ap[pos+1] * x[0];
#endif
        pos += 2;
    }
}
#endif
#ifdef HAVE_KERNEL_ADDY
/*
 * Vector accumulate: dest[k] += alpha * src[k] for k = 0 .. n-1, or
 * dest[k] += alpha * conj(src[k]) when XCONJ is defined, with
 * alpha = alpha_r + i*alpha_i.
 *
 *   src       interleaved complex values, or {re,re} {im,im} vector pairs
 *             when IGNORE_TEMP_PERM is defined
 *   dest      destination; consecutive elements are inc_dest FLOATs apart
 *             (inc_dest == 2 means contiguous)
 *
 * Two elements are handled per pass, so n is expected to be even.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
    __vector double scale_re = {alpha_r, alpha_r};
    __vector double scale_im = {alpha_i, alpha_i};
    __vector double out_re;
    __vector double out_im;
    BLASLONG k;
#ifdef IGNORE_TEMP_PERM
    __vector double *packed = (__vector double *)src;
#endif

    /*
     * One loop serves both the contiguous and the strided case: the second
     * element of each pair sits inc_dest FLOATs after the first, which is
     * offset 2 when the destination is contiguous.
     */
    for (k = 0; k < n; k += 2) {
#ifdef IGNORE_TEMP_PERM
        __vector double in_re = packed[k];
        __vector double in_im = packed[k + 1];
#else
        __vector double in_re = {src[0], src[2]};
        __vector double in_im = {src[1], src[3]};
#endif

#if !defined(XCONJ)
        out_re = in_re * scale_re;
        out_re -= in_im * scale_im;
        out_im = in_re * scale_im + in_im * scale_re;
#else
        out_re = in_re * scale_re + in_im * scale_im;
        out_im = in_re * scale_im;
        out_im -= in_im * scale_re;
#endif

        dest[0] += out_re[0];
        dest[1] += out_im[0];
        dest[inc_dest] += out_re[1];
        dest[inc_dest + 1] += out_im[1];

#ifndef IGNORE_TEMP_PERM
        src += 4;
#endif
        dest += 2 * inc_dest;
    }
}
#else
/*
 * Scalar accumulate: dest[k] += alpha * src[k] for k = 0 .. n-1, or
 * dest[k] += alpha * conj(src[k]) when XCONJ is defined, with
 * alpha = alpha_r + i*alpha_i.
 *
 * src is contiguous interleaved complex data; consecutive dest elements are
 * inc_dest FLOATs apart.  The contiguous case (inc_dest == 2) works in groups
 * of four elements, so ceil(n/4)*4 elements are touched on that path.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
    BLASLONG done;

    if ( inc_dest == 2 )
    {
        FLOAT part_re[4];
        FLOAT part_im[4];
        BLASLONG k;

        for ( done = 0; done < n; done += 4 )
        {
            /* scale the whole group first ... */
            for ( k = 0; k < 4; k++ )
            {
#if !defined(XCONJ)
                part_re[k] = alpha_r * src[2*k] - alpha_i * src[2*k+1];
                part_im[k] = alpha_r * src[2*k+1] + alpha_i * src[2*k];
#else
                part_re[k] = alpha_r * src[2*k] + alpha_i * src[2*k+1];
                part_im[k] = -alpha_r * src[2*k+1] + alpha_i * src[2*k];
#endif
            }
            /* ... then add it to the destination */
            for ( k = 0; k < 4; k++ )
            {
                dest[2*k] += part_re[k];
                dest[2*k+1] += part_im[k];
            }
            src += 8;
            dest += 8;
        }
        return;
    }

    /* strided destination: one element at a time */
    for ( done = 0; done < n; done++ )
    {
        FLOAT scaled_re;
        FLOAT scaled_im;

#if !defined(XCONJ)
        scaled_re = alpha_r * src[0] - alpha_i * src[1];
        scaled_im = alpha_r * src[1] + alpha_i * src[0];
#else
        scaled_re = alpha_r * src[0] + alpha_i * src[1];
        scaled_im = -alpha_r * src[1] + alpha_i * src[0];
#endif
        dest[0] += scaled_re;
        dest[1] += scaled_im;
        src += 2;
        dest += inc_dest;
    }
}
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
inc_x *= 2;
inc_y *= 2;
lda *= 2;
lda4 = 4 * lda;
n1 = n / 4 ;
n2 = n % 4 ;
m3 = m % 4;
m1 = m - ( m % 4 );
m2 = (m % NBMAX) - (m % 4) ;
y_ptr = y;
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
x_ptr = x;
//zero_y(NB,ybuffer);
memset(ybuffer,0,NB*16);
if ( inc_x == 2 )
{
for( i = 0; i < n1 ; i++)
{
zgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 8;
}
if ( n2 & 2 )
{
zgemv_kernel_4x2(NB,ap,x_ptr,ybuffer);
x_ptr += 4;
a_ptr += 2 * lda;
}
if ( n2 & 1 )
{
zgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer);
x_ptr += 2;
a_ptr += lda;
}
}
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
xbuffer[3] = x_ptr[1];
x_ptr += inc_x;
xbuffer[4] = x_ptr[0];
xbuffer[5] = x_ptr[1];
x_ptr += inc_x;
xbuffer[6] = x_ptr[0];
xbuffer[7] = x_ptr[1];
x_ptr += inc_x;
zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += 1 * lda;
}
}
add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
a += 2 * NB;
y_ptr += NB * inc_y;
}
if ( m3 == 0 ) return(0);
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
if ( lda == 2 && inc_x == 2 )
{
for( i=0 ; i < (n & -2); i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif
a_ptr += 4;
x_ptr += 4;
}
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += 2;
x_ptr += 2;
}
}
else
{
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
if ( lda == 4 && inc_x == 2 )
{
for( i = 0; i < (n & -2); i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
a_ptr += 4;
x_ptr += 2;
}
}
else
{
for( i=0 ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
return(0);
}
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
FLOAT temp_r2 = 0.0;
FLOAT temp_i2 = 0.0;
if ( lda == 6 && inc_x == 2 )
{
for( i=0 ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
a_ptr += 6;
x_ptr += 2;
}
}
else
{
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
return(0);
}
return(0);
}

788
kernel/zarch/zgemv_t_4.c Normal file
View File

@ -0,0 +1,788 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 1024
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
#ifdef HAVE_KERNEL_4x4_VEC_ASM
#elif HAVE_KERNEL_4x4_VEC
/*
 * Two-lane vector kernel: forms the complex dot products of x with the
 * four columns ap[0..3], scales each by alpha and adds the results to the
 * four consecutive complex numbers stored in y[0..7].
 *
 *   n      number of complex elements taken from x and from each column.
 *          Two complex elements are consumed per iteration, so a full pair
 *          is read even when n is odd.
 *   ap     array of four column pointers (interleaved re,im values)
 *   x      input vector (interleaved re,im values, unit stride)
 *   y      output, updated in place
 *   alpha  alpha[0] is the real part, alpha[1] the imaginary part
 *
 * The CONJ / XCONJ macros select the sign pattern at compile time.
 */
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];
    FLOAT *col2 = ap[2];
    FLOAT *col3 = ap[3];
    BLASLONG left;
    int c;

    /* One real and one imaginary accumulator per column; each lane holds
       the partial sum of every other complex element. */
    __vector double acc0_re = {0.0, 0.0};
    __vector double acc0_im = {0.0, 0.0};
    __vector double acc1_re = {0.0, 0.0};
    __vector double acc1_im = {0.0, 0.0};
    __vector double acc2_re = {0.0, 0.0};
    __vector double acc2_im = {0.0, 0.0};
    __vector double acc3_re = {0.0, 0.0};
    __vector double acc3_im = {0.0, 0.0};

    for (left = n; left > 0; left -= 2)
    {
        /* De-interleave two complex elements into {re,re} / {im,im}. */
        __vector double x_re  = { x[0],    x[2]    };
        __vector double x_im  = { x[1],    x[3]    };
        __vector double c0_re = { col0[0], col0[2] };
        __vector double c0_im = { col0[1], col0[3] };
        __vector double c1_re = { col1[0], col1[2] };
        __vector double c1_im = { col1[1], col1[3] };
        __vector double c2_re = { col2[0], col2[2] };
        __vector double c2_im = { col2[1], col2[3] };
        __vector double c3_re = { col3[0], col3[2] };
        __vector double c3_im = { col3[1], col3[3] };

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        acc0_re = c0_re * x_re - (c0_im * x_im - acc0_re);
        acc0_im = acc0_im + c0_re * x_im + c0_im * x_re;
        acc1_re = c1_re * x_re - (c1_im * x_im - acc1_re);
        acc1_im = acc1_im + c1_re * x_im + c1_im * x_re;
        acc2_re = c2_re * x_re - (c2_im * x_im - acc2_re);
        acc2_im = acc2_im + c2_re * x_im + c2_im * x_re;
        acc3_re = c3_re * x_re - (c3_im * x_im - acc3_re);
        acc3_im = acc3_im + c3_re * x_im + c3_im * x_re;
#else
        acc0_re = acc0_re + c0_re * x_re + c0_im * x_im;
        acc0_im = c0_re * x_im - (c0_im * x_re - acc0_im);
        acc1_re = acc1_re + c1_re * x_re + c1_im * x_im;
        acc1_im = c1_re * x_im - (c1_im * x_re - acc1_im);
        acc2_re = acc2_re + c2_re * x_re + c2_im * x_im;
        acc2_im = c2_re * x_im - (c2_im * x_re - acc2_im);
        acc3_re = acc3_re + c3_re * x_re + c3_im * x_im;
        acc3_im = c3_re * x_im - (c3_im * x_re - acc3_im);
#endif

        x    += 4;
        col0 += 4;
        col1 += 4;
        col2 += 4;
        col3 += 4;
    }

    {
        const FLOAT alpha_r = alpha[0];
        const FLOAT alpha_i = alpha[1];

        /* Horizontal add of the two lanes gives each column's dot product. */
        const FLOAT dot_re[4] = {
            acc0_re[0] + acc0_re[1],
            acc1_re[0] + acc1_re[1],
            acc2_re[0] + acc2_re[1],
            acc3_re[0] + acc3_re[1]
        };
        const FLOAT dot_im[4] = {
            acc0_im[0] + acc0_im[1],
            acc1_im[0] + acc1_im[1],
            acc2_im[0] + acc2_im[1],
            acc3_im[0] + acc3_im[1]
        };

        for (c = 0; c < 4; c++)
        {
            FLOAT *out = y + 2 * c;
#if !defined(XCONJ)
            out[0] += alpha_r * dot_re[c] - alpha_i * dot_im[c];
            out[1] += alpha_r * dot_im[c] + alpha_i * dot_re[c];
#else
            out[0] += alpha_r * dot_re[c] + alpha_i * dot_im[c];
            out[1] -= alpha_r * dot_im[c] - alpha_i * dot_re[c];
#endif
        }
    }
}
#else
/*
 * Scalar fallback of the 4-column kernel: computes the complex dot product
 * of x with each of ap[0..3] over n complex elements, multiplies by alpha
 * and accumulates into the four complex values y[0..7].
 *
 * alpha[0] / alpha[1] are the real / imaginary parts of the scale factor.
 * CONJ / XCONJ select the sign pattern at compile time.
 */
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    FLOAT sum_re[4] = {0.0, 0.0, 0.0, 0.0};
    FLOAT sum_im[4] = {0.0, 0.0, 0.0, 0.0};
    const FLOAT alpha_r = alpha[0];
    const FLOAT alpha_i = alpha[1];
    BLASLONG k;
    int c;

    for (k = 0; k < n; k++)
    {
        const FLOAT xr = x[2 * k];
        const FLOAT xi = x[2 * k + 1];

        for (c = 0; c < 4; c++)
        {
            const FLOAT ar = ap[c][2 * k];
            const FLOAT ai = ap[c][2 * k + 1];
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            sum_re[c] += ar * xr - ai * xi;
            sum_im[c] += ar * xi + ai * xr;
#else
            sum_re[c] += ar * xr + ai * xi;
            sum_im[c] += ar * xi - ai * xr;
#endif
        }
    }

    for (c = 0; c < 4; c++)
    {
        FLOAT *out = y + 2 * c;
#if !defined(XCONJ)
        out[0] += alpha_r * sum_re[c] - alpha_i * sum_im[c];
        out[1] += alpha_r * sum_im[c] + alpha_i * sum_re[c];
#else
        out[0] += alpha_r * sum_re[c] + alpha_i * sum_im[c];
        out[1] -= alpha_r * sum_im[c] - alpha_i * sum_re[c];
#endif
    }
}
#endif
#ifdef HAVE_KERNEL_4x2_VEC
/*
 * Two-lane vector kernel for two columns: forms the complex dot products
 * of x with ap[0] and ap[1], scales them by alpha and adds them to the two
 * complex numbers stored in y[0..3].
 *
 * Two complex elements are consumed per iteration, so a full pair is read
 * even when n is odd.  alpha[0] / alpha[1] are the real / imaginary parts
 * of the scale factor.  CONJ / XCONJ select the sign pattern.
 */
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];
    BLASLONG left;
    int c;

    __vector double acc0_re = {0.0, 0.0};
    __vector double acc0_im = {0.0, 0.0};
    __vector double acc1_re = {0.0, 0.0};
    __vector double acc1_im = {0.0, 0.0};

    for (left = n; left > 0; left -= 2)
    {
        /* De-interleave two complex elements into {re,re} / {im,im}. */
        __vector double x_re  = { x[0],    x[2]    };
        __vector double x_im  = { x[1],    x[3]    };
        __vector double c0_re = { col0[0], col0[2] };
        __vector double c0_im = { col0[1], col0[3] };
        __vector double c1_re = { col1[0], col1[2] };
        __vector double c1_im = { col1[1], col1[3] };

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        acc0_re = c0_re * x_re - (c0_im * x_im - acc0_re);
        acc0_im = acc0_im + c0_re * x_im + c0_im * x_re;
        acc1_re = c1_re * x_re - (c1_im * x_im - acc1_re);
        acc1_im = acc1_im + c1_re * x_im + c1_im * x_re;
#else
        acc0_re = acc0_re + c0_re * x_re + c0_im * x_im;
        acc0_im = c0_re * x_im - (c0_im * x_re - acc0_im);
        acc1_re = acc1_re + c1_re * x_re + c1_im * x_im;
        acc1_im = c1_re * x_im - (c1_im * x_re - acc1_im);
#endif

        x    += 4;
        col0 += 4;
        col1 += 4;
    }

    {
        /* Horizontal add of the two lanes gives each column's dot product. */
        const FLOAT dot_re[2] = { acc0_re[0] + acc0_re[1], acc1_re[0] + acc1_re[1] };
        const FLOAT dot_im[2] = { acc0_im[0] + acc0_im[1], acc1_im[0] + acc1_im[1] };
        const FLOAT alpha_r = alpha[0];
        const FLOAT alpha_i = alpha[1];

        for (c = 0; c < 2; c++)
        {
            FLOAT *out = y + 2 * c;
#if !defined(XCONJ)
            out[0] += alpha_r * dot_re[c] - alpha_i * dot_im[c];
            out[1] += alpha_r * dot_im[c] + alpha_i * dot_re[c];
#else
            out[0] += alpha_r * dot_re[c] + alpha_i * dot_im[c];
            out[1] -= alpha_r * dot_im[c] - alpha_i * dot_re[c];
#endif
        }
    }
}
#else
/*
 * Scalar fallback of the 2-column kernel: complex dot products of x with
 * ap[0] and ap[1] over n complex elements, scaled by alpha and accumulated
 * into the two complex values y[0..3].
 *
 * alpha[0] / alpha[1] are the real / imaginary parts of the scale factor.
 * CONJ / XCONJ select the sign pattern at compile time.
 */
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    FLOAT sum_re[2] = {0.0, 0.0};
    FLOAT sum_im[2] = {0.0, 0.0};
    const FLOAT alpha_r = alpha[0];
    const FLOAT alpha_i = alpha[1];
    BLASLONG k;
    int c;

    for (k = 0; k < n; k++)
    {
        const FLOAT xr = x[2 * k];
        const FLOAT xi = x[2 * k + 1];

        for (c = 0; c < 2; c++)
        {
            const FLOAT ar = ap[c][2 * k];
            const FLOAT ai = ap[c][2 * k + 1];
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            sum_re[c] += ar * xr - ai * xi;
            sum_im[c] += ar * xi + ai * xr;
#else
            sum_re[c] += ar * xr + ai * xi;
            sum_im[c] += ar * xi - ai * xr;
#endif
        }
    }

    for (c = 0; c < 2; c++)
    {
        FLOAT *out = y + 2 * c;
#if !defined(XCONJ)
        out[0] += alpha_r * sum_re[c] - alpha_i * sum_im[c];
        out[1] += alpha_r * sum_im[c] + alpha_i * sum_re[c];
#else
        out[0] += alpha_r * sum_re[c] + alpha_i * sum_im[c];
        out[1] -= alpha_r * sum_im[c] - alpha_i * sum_re[c];
#endif
    }
}
#endif
#ifdef HAVE_KERNEL_4x1_VEC
/*
 * Two-lane vector kernel for a single column: forms the complex dot
 * product of x with the column at ap, scales it by alpha and adds it to
 * the complex number stored in y[0..1].
 *
 * Two complex elements are consumed per iteration, so a full pair is read
 * even when n is odd.  alpha[0] / alpha[1] are the real / imaginary parts
 * of the scale factor.  CONJ / XCONJ select the sign pattern.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    FLOAT *col = ap;
    BLASLONG left;

    __vector double acc_re = {0.0, 0.0};
    __vector double acc_im = {0.0, 0.0};

    for (left = n; left > 0; left -= 2)
    {
        /* De-interleave two complex elements into {re,re} / {im,im}. */
        __vector double c_re = { col[0], col[2] };
        __vector double c_im = { col[1], col[3] };
        __vector double x_re = { x[0],   x[2]   };
        __vector double x_im = { x[1],   x[3]   };

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        acc_re = c_re * x_re - (c_im * x_im - acc_re);
        acc_im = acc_im + c_re * x_im + c_im * x_re;
#else
        acc_re = acc_re + c_re * x_re + c_im * x_im;
        acc_im = c_re * x_im - (c_im * x_re - acc_im);
#endif

        col += 4;
        x   += 4;
    }

    {
        /* Horizontal add of the two lanes gives the dot product. */
        const FLOAT dot_re  = acc_re[0] + acc_re[1];
        const FLOAT dot_im  = acc_im[0] + acc_im[1];
        const FLOAT alpha_r = alpha[0];
        const FLOAT alpha_i = alpha[1];

#if !defined(XCONJ)
        y[0] += alpha_r * dot_re - alpha_i * dot_im;
        y[1] += alpha_r * dot_im + alpha_i * dot_re;
#else
        y[0] += alpha_r * dot_re + alpha_i * dot_im;
        y[1] -= alpha_r * dot_im - alpha_i * dot_re;
#endif
    }
}
#else
/*
 * Scalar fallback of the single-column kernel: complex dot product of x
 * with the column at ap over n complex elements, scaled by alpha and
 * accumulated into the complex value y[0..1].
 *
 * alpha[0] / alpha[1] are the real / imaginary parts of the scale factor.
 * CONJ / XCONJ select the sign pattern at compile time.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    const FLOAT alpha_r = alpha[0];
    const FLOAT alpha_i = alpha[1];
    FLOAT sum_re = 0.0;
    FLOAT sum_im = 0.0;
    BLASLONG k;

    for (k = 0; k < n; k++)
    {
        const FLOAT ar = ap[2 * k];
        const FLOAT ai = ap[2 * k + 1];
        const FLOAT xr = x[2 * k];
        const FLOAT xi = x[2 * k + 1];
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        sum_re += ar * xr - ai * xi;
        sum_im += ar * xi + ai * xr;
#else
        sum_re += ar * xr + ai * xi;
        sum_im += ar * xi - ai * xr;
#endif
    }

#if !defined(XCONJ)
    y[0] += alpha_r * sum_re - alpha_i * sum_im;
    y[1] += alpha_r * sum_im + alpha_i * sum_re;
#else
    y[0] += alpha_r * sum_re + alpha_i * sum_im;
    y[1] -= alpha_r * sum_im - alpha_i * sum_re;
#endif
}
#endif
/*
 * Gather n complex values from a strided vector into a packed one.
 * src advances by inc_src FLOATs per element, dest by two FLOATs (re,im).
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG remaining = n;

    while (remaining > 0)
    {
        dest[0] = src[0];   /* real part      */
        dest[1] = src[1];   /* imaginary part */
        dest += 2;
        src  += inc_src;
        remaining--;
    }
}
/*
 * Transposed complex matrix-vector product driver.
 *
 * For every column j of A (0 <= j < n) the complex dot product of that
 * column with x (m elements) is formed, multiplied by alpha and added to
 * y[j].  Whether the plain or a conjugated product is used is chosen at
 * compile time through the CONJ / XCONJ macros.
 *
 *   m, n      length of x / number of y elements (columns of A)
 *   dummy1    unused
 *   alpha_r/i real and imaginary part of the scale factor
 *   a, lda    matrix data; lda is the distance between columns, in
 *             complex elements
 *   x, inc_x  input vector and its stride in complex elements
 *   y, inc_y  output vector and its stride in complex elements
 *   buffer    scratch area used to pack x when it is not contiguous; it
 *             receives up to 2*NBMAX FLOATs
 *
 * Always returns 0.
 *
 * The rows are processed in panels of at most NBMAX (each a multiple of
 * four rows) through the zgemv_kernel_4x{4,2,1} helpers; the final
 * m & 3 rows are handled by the scalar code at the end.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];          /* base pointers of the four columns in flight */
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4;
    FLOAT ybuffer[8],*xbuffer;
    FLOAT alpha[2];

    if ( m < 1 ) return(0);
    if ( n < 1 ) return(0);

    /* Convert strides from complex elements to FLOAT units.  Multiplication
       is used instead of a left shift because a stride may be negative and
       shifting a negative value is undefined behaviour in C. */
    inc_x *= 2;
    inc_y *= 2;
    lda   *= 2;
    lda4   = lda * 4;

    xbuffer = buffer;

    n1 = n >> 2 ;          /* number of complete 4-column groups */
    n2 = n &  3 ;          /* leftover columns                   */

    m3 = m & 3 ;                       /* rows left for the scalar tail      */
    m1 = m - m3;                       /* rows handled by the panel loop     */
    m2 = (m & (NBMAX-1)) - m3 ;        /* size of the last, partial panel    */

    alpha[0] = alpha_r;
    alpha[1] = alpha_i;

    BLASLONG NB = NBMAX;

    /* Panel loop: runs full NBMAX panels, then one partial panel of m2
       rows (if any); setting NB = m2 terminates the loop afterwards. */
    while ( NB == NBMAX )
    {
        m1 -= NB;
        if ( m1 < 0)
        {
            if ( m2 == 0 ) break;
            NB = m2;
        }

        y_ptr = y;
        a_ptr = a;
        x_ptr = x;

        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        /* The kernels need x contiguous: pack it unless it already is. */
        if ( inc_x != 2 )
            copy_x(NB,x_ptr,xbuffer,inc_x);
        else
            xbuffer = x_ptr;

        if ( inc_y == 2 )
        {
            /* Contiguous y: kernels accumulate straight into y. */
            for( i = 0; i < n1 ; i++)
            {
                zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
                y_ptr += 8;
            }

            if ( n2 & 2 )
            {
                zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha);
                a_ptr += lda * 2;
                y_ptr += 4;
            }

            if ( n2 & 1 )
            {
                zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha);
                a_ptr += lda;
                y_ptr += 2;
            }
        }
        else
        {
            /* Strided y: kernels accumulate into a zeroed scratch block
               which is then scattered into y. */
            for( i = 0; i < n1 ; i++)
            {
                memset(ybuffer,0,sizeof(ybuffer));
                zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;

                y_ptr[0] += ybuffer[0];
                y_ptr[1] += ybuffer[1];
                y_ptr += inc_y;
                y_ptr[0] += ybuffer[2];
                y_ptr[1] += ybuffer[3];
                y_ptr += inc_y;
                y_ptr[0] += ybuffer[4];
                y_ptr[1] += ybuffer[5];
                y_ptr += inc_y;
                y_ptr[0] += ybuffer[6];
                y_ptr[1] += ybuffer[7];
                y_ptr += inc_y;
            }

            for( i = 0; i < n2 ; i++)
            {
                memset(ybuffer,0,sizeof(ybuffer));
                zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
                a_ptr += lda;
                y_ptr[0] += ybuffer[0];
                y_ptr[1] += ybuffer[1];
                y_ptr += inc_y;
            }
        }

        /* Advance to the next row panel. */
        a += 2 * NB;
        x += NB * inc_x;
    }

    if ( m3 == 0 ) return(0);

    /* Scalar tail: the remaining m3 (1..3) rows, for every column. */
    x_ptr = x;
    j=0;
    a_ptr = a;
    y_ptr = y;

    if ( m3 == 3 )
    {
        FLOAT temp_r ;
        FLOAT temp_i ;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x2 = x_ptr[0];
        FLOAT x3 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x4 = x_ptr[0];
        FLOAT x5 = x_ptr[1];

        while ( j < n)
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
            temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
            temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else
            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
            temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
            temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif

#if !defined(XCONJ)
            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }
        return(0);
    }

    if ( m3 == 2 )
    {
        FLOAT temp_r ;
        FLOAT temp_i ;
        FLOAT temp_r1 ;
        FLOAT temp_i1 ;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x2 = x_ptr[0];
        FLOAT x3 = x_ptr[1];
        FLOAT ar = alpha[0];
        FLOAT ai = alpha[1];

        /* Two columns per iteration. */
        while ( j < ( n & -2 ))
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
            a_ptr += lda;
            temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
            a_ptr += lda;
            temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
            y_ptr[0] += ar * temp_r - ai * temp_i;
            y_ptr[1] += ar * temp_i + ai * temp_r;
            y_ptr += inc_y;
            y_ptr[0] += ar * temp_r1 - ai * temp_i1;
            y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
            y_ptr[0] += ar * temp_r + ai * temp_i;
            y_ptr[1] -= ar * temp_i - ai * temp_r;
            y_ptr += inc_y;
            y_ptr[0] += ar * temp_r1 + ai * temp_i1;
            y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j+=2;
        }

        /* Odd trailing column, if any. */
        while ( j < n)
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
            y_ptr[0] += ar * temp_r - ai * temp_i;
            y_ptr[1] += ar * temp_i + ai * temp_r;
#else
            y_ptr[0] += ar * temp_r + ai * temp_i;
            y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }
        return(0);
    }

    if ( m3 == 1 )
    {
        FLOAT temp_r ;
        FLOAT temp_i ;
        FLOAT temp_r1 ;
        FLOAT temp_i1 ;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        FLOAT ar = alpha[0];
        FLOAT ai = alpha[1];

        /* Two columns per iteration. */
        while ( j < ( n & -2 ))
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
            a_ptr += lda;
            temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
            a_ptr += lda;
            temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
            y_ptr[0] += ar * temp_r - ai * temp_i;
            y_ptr[1] += ar * temp_i + ai * temp_r;
            y_ptr += inc_y;
            y_ptr[0] += ar * temp_r1 - ai * temp_i1;
            y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
            y_ptr[0] += ar * temp_r + ai * temp_i;
            y_ptr[1] -= ar * temp_i - ai * temp_r;
            y_ptr += inc_y;
            y_ptr[0] += ar * temp_r1 + ai * temp_i1;
            y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j+=2;
        }

        /* Odd trailing column, if any. */
        while ( j < n)
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
            y_ptr[0] += ar * temp_r - ai * temp_i;
            y_ptr[1] += ar * temp_i + ai * temp_r;
#else
            y_ptr[0] += ar * temp_r + ai * temp_i;
            y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }
        return(0);
    }

    return(0);
}

276
kernel/zarch/zrot.c Normal file
View File

@ -0,0 +1,276 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Inline-assembly kernel applying a plane rotation to two contiguous
 * vectors of doubles.  For every double of x and y it computes
 *
 *     x_new = x*c + y*s
 *     y_new = y*c - x*s
 *
 * and stores the results back in place.
 *
 *   n     element count; the loop runs n >> 4 times and each iteration
 *         covers 256 bytes of x and 256 bytes of y.  The loop decrements
 *         the counter before testing it, so n must be a non-zero multiple
 *         of 16 (the caller in this file passes n & -16 and only when that
 *         is > 0).
 *   x, y  vectors updated in place
 *   c, s  pointers to the two rotation coefficients (read once, before
 *         the loop)
 *
 * Register use: r0 = loop counter, r1 = running byte offset,
 * v0 = c, v1 = s, v24-v27 = x inputs, v16-v19 = y inputs,
 * v28-v31 = new x, v20-v23 = new y.
 */
static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
/* prefetch the start of both vectors */
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
/* v0 <- c, v1 <- s (vector load-and-replicate) */
"vlrepg %%v0,0(%3) \n\t"
"vlrepg %%v1,0(%4) \n\t"
/* r0 = n >> 4 iterations, r1 = 0 byte offset */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch 256 bytes ahead of the current offset */
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
/* ---- bytes 0..63 of this iteration ---- */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
/* first parts: xn = x*c (v28-v31), yn = x*s (v20-v23) */
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
/* xn = y*s + xn ; yn = y*c - yn */
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
/* write back new x and new y */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* ---- bytes 64..127: same sequence ---- */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19,112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* ---- bytes 128..191: same sequence ---- */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* ---- bytes 192..255: same sequence ---- */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
/* advance the byte offset by 256 and loop while --r0 != 0 */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y),"a"(c),"a"(s)
: "cc", "memory","r0","r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
/*
 * Plane rotation of two complex vectors with real coefficients:
 *
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]
 *
 * applied separately to the real and imaginary part of each of the n
 * complex elements.  inc_x / inc_y are strides in complex elements.
 * Returns 0 in every case; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG done = 0;     /* complex elements already rotated */
    BLASLONG px = 0;       /* FLOAT index into x               */
    BLASLONG py = 0;       /* FLOAT index into y               */
    FLOAT new_re;
    FLOAT new_im;

    if ( n <= 0 ) return(0);

    if ( (inc_x != 1) || (inc_y != 1) )
    {
        /* Generic strided path. */
        const BLASLONG step_x = 2 * inc_x;
        const BLASLONG step_y = 2 * inc_y;

        for ( ; done < n; done++ )
        {
            new_re    = c*x[px]   + s*y[py] ;
            new_im    = c*x[px+1] + s*y[py+1] ;
            y[py]     = c*y[py]   - s*x[px] ;
            y[py+1]   = c*y[py+1] - s*x[px+1] ;
            x[px]     = new_re ;
            x[px+1]   = new_im ;
            px += step_x ;
            py += step_y ;
        }
        return(0);
    }

    /* Contiguous path: hand the largest multiple of 16 to the assembly
       kernel, then finish the remainder below. */
    {
        const BLASLONG bulk = n & -16;

        if ( bulk > 0 )
        {
            FLOAT cosa = c;
            FLOAT sina = s;

            zrot_kernel_16(bulk, x, y, &cosa, &sina);
            done = bulk;
            px   = 2*bulk;
        }
    }

    for ( ; done < n; done++ )
    {
        new_re    = c*x[px]   + s*y[px] ;
        new_im    = c*x[px+1] + s*y[px+1] ;
        y[px]     = c*y[px]   - s*x[px] ;
        y[px+1]   = c*y[px+1] - s*x[px+1] ;
        x[px]     = new_re ;
        x[px+1]   = new_im ;
        px += 2 ;
    }

    return(0);
}

483
kernel/zarch/zscal.c Normal file
View File

@ -0,0 +1,483 @@
/***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * General complex scaling of a contiguous vector, in place:
 *     re' = alpha[0]*re - alpha[1]*im
 *     im' = alpha[0]*im + alpha[1]*re
 *
 * n     number of 16-byte complex elements to process.  One loop iteration
 *       covers 128 bytes (8 elements) and the loop is bottom-tested, so the
 *       body always runs at least once; the caller in this file passes
 *       n = (n & -8) and only when that is > 0.
 * alpha alpha[0] = real part, alpha[1] = imaginary part of the scalar.
 * x     interleaved (re, im) data, updated in place.
 *
 * NOTE(review): the asm advances %2 (x) although it is declared as an
 * input-only operand; this relies on the function being noinline and on x
 * not being used after the asm statement.
 */
static void __attribute__ ((noinline)) zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__(
"pfd 1, 0(%1) \n\t"
/* r0 = x + n*16 : address one past the last element to process */
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%2 \n\t"
/* v24 = {alpha[0], alpha[0]}, v25 = {alpha[1], alpha[1]} */
"vlrepg %%v24,0(%1) \n\t"
"vlrepg %%v25,8(%1) \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%2 ) \n\t"
/* De-interleave 4 elements: v20/v22 collect real parts, v21/v23 imaginary parts */
"vleg %%v20 , 0(%2),0 \n\t"
"vleg %%v21 , 8(%2),0 \n\t"
"vleg %%v20 , 16(%2),1 \n\t"
"vleg %%v21 , 24(%2),1 \n\t"
"vleg %%v22 , 32(%2),0 \n\t"
"vleg %%v23 , 40(%2),0 \n\t"
"vleg %%v22 , 48(%2),1 \n\t"
"vleg %%v23 , 56(%2),1 \n\t"
/* v16/v18 = im*alpha[1], v17/v19 = re*alpha[1] */
"vfmdb %%v16, %%v21, %%v25 \n\t"
"vfmdb %%v17, %%v20, %%v25 \n\t"
"vfmdb %%v18, %%v23, %%v25 \n\t"
"vfmdb %%v19, %%v22, %%v25 \n\t"
/* v16/v18 = re*alpha[0] - im*alpha[1] (new real), v17/v19 = im*alpha[0] + re*alpha[1] (new imag) */
"vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t"
"vfmadb %%v17, %%v21, %%v24, %%v17 \n\t"
"vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v24, %%v19 \n\t"
/* Re-interleave the results back into x */
"vsteg %%v16 , 0(%2),0 \n\t"
"vsteg %%v17 , 8(%2),0 \n\t"
"vsteg %%v16 , 16(%2),1 \n\t"
"vsteg %%v17 , 24(%2),1 \n\t"
"vsteg %%v18 , 32(%2),0 \n\t"
"vsteg %%v19 , 40(%2),0 \n\t"
"vsteg %%v18 , 48(%2),1 \n\t"
"vsteg %%v19 , 56(%2),1 \n\t"
/* Same sequence for the next 4 elements (byte offsets 64..127) */
"vleg %%v20 , 64(%2),0 \n\t"
"vleg %%v21 , 72(%2),0 \n\t"
"vleg %%v20 , 80(%2),1 \n\t"
"vleg %%v21 , 88(%2),1 \n\t"
"vleg %%v22 , 96(%2),0 \n\t"
"vleg %%v23 , 104(%2),0 \n\t"
"vleg %%v22 , 112(%2),1 \n\t"
"vleg %%v23 , 120(%2),1 \n\t"
"vfmdb %%v16, %%v21, %%v25 \n\t"
"vfmdb %%v17, %%v20, %%v25 \n\t"
"vfmdb %%v18, %%v23, %%v25 \n\t"
"vfmdb %%v19, %%v22, %%v25 \n\t"
"vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t"
"vfmadb %%v17, %%v21, %%v24, %%v17 \n\t"
"vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v24, %%v19 \n\t"
"vsteg %%v16 , 64(%2),0 \n\t"
"vsteg %%v17 , 72(%2),0 \n\t"
"vsteg %%v16 , 80(%2),1 \n\t"
"vsteg %%v17 , 88(%2),1 \n\t"
"vsteg %%v18 , 96(%2),0 \n\t"
"vsteg %%v19 , 104(%2),0 \n\t"
"vsteg %%v18 , 112(%2),1 \n\t"
"vsteg %%v19 , 120(%2),1 \n\t"
/* Advance x by 128 bytes and loop while x < end pointer in r0 */
"la %2,128(%2) \n\t"
"clgrjl %2,%%r0,1b \n\t"
:
: "r"(n), "a"(alpha), "a"(x)
: "cc", "memory","r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25"
);
}
/*
 * Scale a contiguous complex vector by a purely imaginary scalar, in place
 * (used when the real part of alpha is zero; alpha[0] is never read):
 *     re' = -alpha[1] * im
 *     im' =  alpha[1] * re
 *
 * n     number of 16-byte complex elements to process.  One loop iteration
 *       covers 128 bytes (8 elements) and the loop is bottom-tested, so the
 *       body always runs at least once; the caller in this file passes
 *       n = (n & -8) and only when that is > 0.
 * alpha alpha[1] = imaginary part of the scalar.
 * x     interleaved (re, im) data, updated in place.
 *
 * Fixes relative to the previous version:
 *   - the 4th element (loaded from offset 48) is stored back to offsets
 *     48/56 instead of 40/48, which corrupted element 2 and left the
 *     imaginary part of element 3 unscaled;
 *   - the 7th element is stored from v30 (its own product) instead of v27;
 *   - the clobber list names v16/v17, which the asm writes, instead of the
 *     unused v18/v19.
 *
 * NOTE(review): the asm advances %1 (x) although it is declared as an
 * input-only operand; this relies on the function being noinline and on x
 * not being used after the asm statement.
 */
static void __attribute__ ((noinline)) zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__ ( "pfd 2, 0(%1) \n\t"
/*
 * Build v0 = {alpha[1], -alpha[1]}: f0 is the first doubleword of v0, so
 * the ld fills lane 0; the negated copy goes through r0 into lane 1.
 */
"ld %%f0,8(%2) \n\t"
"lcdbr %%f1,%%f0 \n\t"
"lgdr %%r0,%%f1 \n\t"
"vlvgg %%v0,%%r0,1 \n\t"
/* Replicate the multiplier into v1, v16 and v17 */
"vlr %%v16,%%v0 \n\t"
"vlr %%v17 ,%%v0 \n\t"
"vlr %%v1,%%v0 \n\t"
/* r0 = x + n*16 : address one past the last element to process */
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%1 \n\t"
".align 16 \n\t"
"1: \n\t"
/*
 * Per element: load {re, im}, multiply by {alpha[1], -alpha[1]} to get
 * {alpha[1]*re, -alpha[1]*im}, then store the lanes swapped so lane 1
 * becomes the new real part and lane 0 the new imaginary part.
 */
"vl %%v24, 0(%1) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vsteg %%v24, 0(%1),1 \n\t"
"vsteg %%v24, 8(%1),0 \n\t"
"vl %%v25, 16(%1) \n\t"
"vfmdb %%v25,%%v25,%%v1 \n\t"
"vsteg %%v25, 16(%1),1 \n\t"
"vsteg %%v25, 24(%1),0 \n\t"
"vl %%v26, 32(%1) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vsteg %%v26, 32(%1),1 \n\t"
"vsteg %%v26, 40(%1),0 \n\t"
"vl %%v27, 48(%1) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vsteg %%v27, 48(%1),1 \n\t"
"vsteg %%v27, 56(%1),0 \n\t"
"vl %%v28, 64(%1) \n\t"
"vfmdb %%v28,%%v28,%%v0 \n\t"
"vsteg %%v28, 64(%1),1 \n\t"
"vsteg %%v28, 72(%1),0 \n\t"
"vl %%v29, 80(%1) \n\t"
"vfmdb %%v29,%%v29,%%v1 \n\t"
"vsteg %%v29, 80(%1),1 \n\t"
"vsteg %%v29, 88(%1),0 \n\t"
"vl %%v30, 96(%1) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vsteg %%v30, 96(%1),1 \n\t"
"vsteg %%v30, 104(%1),0 \n\t"
"vl %%v31, 112(%1) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vsteg %%v31, 112(%1),1 \n\t"
"vsteg %%v31, 120(%1),0 \n\t"
/* Advance x by 128 bytes and loop while x < end pointer in r0 */
"la %1,128(%1) \n\t"
"clgrjl %1,%%r0,1b \n\t"
:
:"r"(n),"a"(x) ,"a"(alpha)
:"cc", "memory","r0","f0", "f1","v0","v1","v16","v17","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Scale a contiguous complex vector by a purely real scalar, in place
 * (used when the imaginary part of alpha is zero; alpha[1] is never read):
 * every FLOAT in the range, real or imaginary, is multiplied by alpha[0].
 *
 * n     number of 16-byte complex elements to process.  One loop iteration
 *       covers 128 bytes (8 elements) and the loop is bottom-tested, so the
 *       body always runs at least once; the caller in this file passes
 *       n = (n & -8) and only when that is > 0.
 * alpha alpha[0] = real part of the scalar.
 * x     interleaved (re, im) data, updated in place.
 *
 * NOTE(review): the asm advances %1 (x) although it is declared as an
 * input-only operand; this relies on the function being noinline and on x
 * not being used after the asm statement.
 */
static void __attribute__ ((noinline)) zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__ ("pfd 2, 0(%1) \n\t"
/* v18 = {alpha[0], alpha[0]}, copied into v19, v16 and v17 */
"vlrepg %%v18,0(%2) \n\t"
"vlr %%v19,%%v18 \n\t"
"vlr %%v16 ,%%v18 \n\t"
"vlr %%v17,%%v18 \n\t"
/* r0 = x + n*16 : address one past the last element to process */
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* Eight load / multiply / store triples, one 16-byte element each */
"vl %%v24, 0(%1) \n\t"
"vfmdb %%v24,%%v24,%%v18 \n\t"
"vst %%v24, 0(%1) \n\t"
"vl %%v25, 16(%1) \n\t"
"vfmdb %%v25,%%v25,%%v19 \n\t"
"vst %%v25, 16(%1) \n\t"
"vl %%v26, 32(%1) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vst %%v26, 32(%1) \n\t"
"vl %%v27, 48(%1) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vst %%v27, 48(%1) \n\t"
"vl %%v28, 64(%1) \n\t"
"vfmdb %%v28,%%v28,%%v18 \n\t"
"vst %%v28, 64(%1) \n\t"
"vl %%v29, 80(%1) \n\t"
"vfmdb %%v29,%%v29,%%v19 \n\t"
"vst %%v29, 80(%1) \n\t"
"vl %%v30, 96(%1) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vst %%v30, 96(%1) \n\t"
"vl %%v31, 112(%1) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vst %%v31, 112(%1) \n\t"
/* Advance x by 128 bytes and loop while x < end pointer in r0 */
"la %1,128(%1) \n\t"
"clgrjl %1,%%r0,1b \n\t"
:
:"r"(n),"a"(x) ,"a"(alpha)
:"cc", "memory","r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Overwrite a contiguous complex vector with zeros (used when both parts of
 * the scalar are zero).  The existing contents of x are never read.
 *
 * n  number of 16-byte complex elements to clear.  One loop iteration
 *    covers 128 bytes (8 elements) and the loop is bottom-tested, so the
 *    body always runs at least once; the caller in this file passes
 *    n = (n & -8) and only when that is > 0.
 * x  interleaved (re, im) data, overwritten in place.
 *
 * NOTE(review): the asm advances %1 (x) although it is declared as an
 * input-only operand; this relies on the function being noinline and on x
 * not being used after the asm statement.
 */
static void __attribute__ ((noinline)) zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
__asm__ ( "pfd 2, 0(%1) \n\t"
/* Four all-zero vectors used as the store source */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = x + n*16 : address one past the last element to clear */
"sllg %%r0,%0,4 \n\t"
"agr %%r0,%1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256( %1) \n\t"
/* Eight 16-byte stores = 8 complex elements */
"vst %%v24, 0( %1) \n\t"
"vst %%v25, 16( %1) \n\t"
"vst %%v26, 32( %1) \n\t"
"vst %%v27, 48( %1) \n\t"
"vst %%v24, 64( %1) \n\t"
"vst %%v25, 80( %1) \n\t"
"vst %%v26, 96( %1) \n\t"
"vst %%v27,112( %1) \n\t"
/* Advance x by 128 bytes and loop while x < end pointer in r0 */
"la %1,128(%1) \n\t"
"clgrjl %1,%%r0,1b \n\t"
:
:"r"(n),"a"(x)
:"cc" , "memory" ,"r0","v24","v25","v26","v27"
);
}
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline));
/*
 * Strided general complex scaling, four elements per loop pass:
 *     re' = alpha[0]*re - alpha[1]*im
 *     im' = alpha[1]*re + alpha[0]*im
 *
 * n      number of complex elements; the loop advances in steps of 4, so a
 *        count that is not a multiple of 4 is rounded up.
 * alpha  alpha[0] = real part, alpha[1] = imaginary part of the scalar.
 * x      interleaved (re, im) data, updated in place.
 * inc_x  distance between consecutive elements measured in FLOATs (the
 *        caller in this file passes the element stride already doubled).
 */
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) {
    const FLOAT ar = alpha[0];
    const FLOAT ai = alpha[1];
    const BLASLONG off2 = 2 * inc_x;
    const BLASLONG off3 = off2 + inc_x;
    BLASLONG k;

    for (k = 0; k < n; k += 4) {
        FLOAT *e0 = x;
        FLOAT *e1 = x + inc_x;
        FLOAT *e2 = x + off2;
        FLOAT *e3 = x + off3;

        /* New real parts first, while the old values are still intact. */
        const FLOAT re0 = ar * e0[0] - ai * e0[1];
        const FLOAT re1 = ar * e1[0] - ai * e1[1];
        const FLOAT re2 = ar * e2[0] - ai * e2[1];
        const FLOAT re3 = ar * e3[0] - ai * e3[1];

        /* Imaginary parts still read the old real parts ... */
        e0[1] = ai * e0[0] + ar * e0[1];
        e1[1] = ai * e1[0] + ar * e1[1];
        e2[1] = ai * e2[0] + ar * e2[1];
        e3[1] = ai * e3[0] + ar * e3[1];

        /* ... which are only replaced now. */
        e0[0] = re0;
        e1[0] = re1;
        e2[0] = re2;
        e3[0] = re3;

        x += 4 * inc_x;
    }
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    /*
     * Scale the complex vector x in place by (da_r + i*da_i).
     *
     * n      number of complex elements.
     * da_r   real part of the scalar.
     * da_i   imaginary part of the scalar.
     * x      interleaved (re, im) data.
     * inc_x  stride between elements, in complex elements.
     * dummy0, dummy1, y, inc_y, dummy, dummy2 are not used by this routine.
     *
     * Four cases are told apart by which parts of the scalar are zero:
     * both zero (store zeros), real part zero, imaginary part zero, and the
     * general case.  With inc_x == 1 the largest multiple of 8 elements is
     * handed to the matching zscal_kernel_8* routine and the rest is done in
     * scalar code; otherwise everything is done in scalar code, with
     * zscal_kernel_inc_8 taking the multiple-of-8 part of the general case.
     * Always returns 0.
     */
    const int re_is_zero = (da_r == 0.0);
    const int im_is_zero = (da_i == 0.0);
    BLASLONG pos = 0;   /* FLOAT offset of the current element */
    BLASLONG done = 0;  /* complex elements already processed  */
    FLOAT alpha[2];
    FLOAT keep0;
    FLOAT keep1;

    if (inc_x != 1) {
        const BLASLONG step = inc_x << 1;   /* element stride in FLOATs */
        const BLASLONG pairs = n & -2;      /* part handled two at a time */

        if (re_is_zero && im_is_zero) {
            for (; done < pairs; done += 2, pos += 2 * step) {
                x[pos] = 0.0;
                x[pos + 1] = 0.0;
                x[pos + step] = 0.0;
                x[pos + 1 + step] = 0.0;
            }
            for (; done < n; done++, pos += step) {
                x[pos] = 0.0;
                x[pos + 1] = 0.0;
            }
            return (0);
        }

        if (re_is_zero) {
            /* Purely imaginary scalar: re' = -da_i*im, im' = da_i*re. */
            for (; done < pairs; done += 2, pos += 2 * step) {
                keep0 = -da_i * x[pos + 1];
                x[pos + 1] = da_i * x[pos];
                x[pos] = keep0;
                keep1 = -da_i * x[pos + 1 + step];
                x[pos + 1 + step] = da_i * x[pos + step];
                x[pos + step] = keep1;
            }
            for (; done < n; done++, pos += step) {
                keep0 = -da_i * x[pos + 1];
                x[pos + 1] = da_i * x[pos];
                x[pos] = keep0;
            }
            return (0);
        }

        if (im_is_zero) {
            /* Purely real scalar: both parts are multiplied by da_r. */
            for (; done < pairs; done += 2, pos += 2 * step) {
                keep0 = da_r * x[pos];
                x[pos + 1] = da_r * x[pos + 1];
                x[pos] = keep0;
                keep1 = da_r * x[pos + step];
                x[pos + 1 + step] = da_r * x[pos + 1 + step];
                x[pos + step] = keep1;
            }
            for (; done < n; done++, pos += step) {
                keep0 = da_r * x[pos];
                x[pos + 1] = da_r * x[pos + 1];
                x[pos] = keep0;
            }
            return (0);
        }

        /* General scalar, strided. */
        {
            const BLASLONG blocked = n & -8;

            if (blocked > 0) {
                alpha[0] = da_r;
                alpha[1] = da_i;
                zscal_kernel_inc_8(blocked, alpha, x, step);
                done = blocked;
                pos = blocked * step;
            }
        }
        for (; done < n; done++, pos += step) {
            keep0 = da_r * x[pos] - da_i * x[pos + 1];
            x[pos + 1] = da_r * x[pos + 1] + da_i * x[pos];
            x[pos] = keep0;
        }
        return (0);
    }

    /* Unit stride: vector kernel for the multiple-of-8 part. */
    {
        const BLASLONG blocked = n & -8;

        if (blocked > 0) {
            alpha[0] = da_r;
            alpha[1] = da_i;
            if (re_is_zero && im_is_zero) {
                zscal_kernel_8_zero(blocked, x);
            } else if (re_is_zero) {
                zscal_kernel_8_zero_r(blocked, alpha, x);
            } else if (im_is_zero) {
                zscal_kernel_8_zero_i(blocked, alpha, x);
            } else {
                zscal_kernel_8(blocked, alpha, x);
            }
            pos = blocked << 1;
            done = blocked;
        }
    }

    /* Unit stride: scalar code for the remaining elements. */
    if (re_is_zero && im_is_zero) {
        for (; done < n; done++, pos += 2) {
            x[pos] = 0.0;
            x[pos + 1] = 0.0;
        }
    } else if (re_is_zero) {
        for (; done < n; done++, pos += 2) {
            keep0 = -da_i * x[pos + 1];
            x[pos + 1] = da_i * x[pos];
            x[pos] = keep0;
        }
    } else if (im_is_zero) {
        for (; done < n; done++, pos += 2) {
            keep0 = da_r * x[pos];
            x[pos + 1] = da_r * x[pos + 1];
            x[pos] = keep0;
        }
    } else {
        for (; done < n; done++, pos += 2) {
            keep0 = da_r * x[pos] - da_i * x[pos + 1];
            x[pos + 1] = da_r * x[pos + 1] + da_i * x[pos];
            x[pos] = keep0;
        }
    }
    return (0);
}

198
kernel/zarch/zswap.c Normal file
View File

@ -0,0 +1,198 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Exchange the contents of two contiguous complex vectors.
 *
 * n  number of 16-byte complex elements to swap.  The loop count is n >> 4
 *    and each iteration moves 256 bytes (16 elements) in each direction.
 *    The loop is count-driven and bottom-tested, so n must be at least 16;
 *    the caller in this file passes n = (n & -16) and only when that is > 0.
 * x  first vector, receives the old contents of y.
 * y  second vector, receives the old contents of x.
 *
 * r1 is the running byte offset applied to both x and y; the pointer
 * operands themselves are not modified.
 */
static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 2, 0(%1) \n\t"
"pfd 2, 0(%2) \n\t"
/* r0 = n / 16 (iteration count), r1 = 0 (byte offset) */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%2) \n\t"
/* Hold the whole 256-byte x block in v16..v31 */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
/* Copy y -> x in two 128-byte halves through v0..v7 */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
/* Write the saved x block (v16..v31) into y */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
/* Next 256-byte block; decrement r0 and loop until it reaches zero */
"la %%r1,256(%%r1) \n\t"
"brctg %%r0,1b"
:
: "r"(n), "a"(x), "a"(y)
:"cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    /*
     * Exchange the complex vectors x and y element by element.
     *
     * n             number of complex elements; n <= 0 is a no-op.
     * x, y          interleaved (re, im) data, both updated in place.
     * inc_x, inc_y  strides between elements, in complex elements.
     * dummy0, dummy1, dummy3, dummy4, dummy and dummy2 are not used.
     *
     * With both strides equal to 1 the largest multiple of 16 elements is
     * handed to zswap_kernel_16 and only the remainder is swapped here.
     * Always returns 0.
     */
    BLASLONG done = 0;              /* complex elements already swapped     */
    BLASLONG xpos = 0, ypos = 0;    /* FLOAT offsets of the current element */
    BLASLONG xstep, ystep;          /* FLOAT offsets between elements       */

    if (n <= 0)
        return (0);

    if (inc_x == 1 && inc_y == 1) {
        const BLASLONG blocked = n & -16;

        xstep = 2;
        ystep = 2;
        if (blocked > 0) {
            zswap_kernel_16(blocked, x, y);
            done = blocked;
            xpos = 2 * blocked;
            ypos = 2 * blocked;
        }
    } else {
        xstep = 2 * inc_x;
        ystep = 2 * inc_y;
    }

    /* Scalar loop: the whole vector when strided, the remainder otherwise. */
    for (; done < n; done++) {
        const FLOAT saved_re = x[xpos];
        const FLOAT saved_im = x[xpos + 1];

        x[xpos] = y[ypos];
        x[xpos + 1] = y[ypos + 1];
        y[ypos] = saved_re;
        y[ypos + 1] = saved_im;

        xpos += xstep;
        ypos += ystep;
    }
    return (0);
}