Merge pull request #1991 from maamountki/z14

[ZARCH] Z14 Support, BLAS 1/2 single precision implementations
This commit is contained in:
Martin Kroeker 2019-01-31 19:10:03 +01:00 committed by GitHub
commit 42df9efa0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
71 changed files with 15497 additions and 4409 deletions

View File

@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector
endif endif
# Options appended to CCOMMON_OPT / FCOMMON_OPT when CORE is Z14:
# target the z14 instruction set and enable the vector language extension.
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

View File

@ -27,9 +27,9 @@
#include <string.h> #include <string.h>
#define CPU_GENERIC 0 #define CPU_GENERIC 0
#define CPU_Z13 1 #define CPU_Z13 1
#define CPU_Z14 2 #define CPU_Z14 2
static char *cpuname[] = { static char *cpuname[] = {
"ZARCH_GENERIC", "ZARCH_GENERIC",
@ -64,10 +64,8 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
/* detect z14, but fall back to z13 */ if (strstr(p, "3907")) return CPU_Z14;
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
return CPU_GENERIC; return CPU_GENERIC;
} }

View File

@ -1,18 +1,18 @@
SAMAXKERNEL = ../arm/amax.c SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c DAMAXKERNEL = damax_z13.c
CAMAXKERNEL = ../arm/zamax.c CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c ZAMAXKERNEL = zamax_z13.c
SAMINKERNEL = ../arm/amin.c SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c DAMINKERNEL = damin_z13.c
CAMINKERNEL = ../arm/zamin.c CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c ZAMINKERNEL = zamin_z13.c
SMAXKERNEL = ../arm/max.c SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c DMAXKERNEL = dmax_z13.c
SMINKERNEL = ../arm/min.c SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c DMINKERNEL = dmin_z13.c
ISAMAXKERNEL = ../arm/iamax.c ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = idamax.c IDAMAXKERNEL = idamax.c
@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = izamin.c IZAMINKERNEL = izamin.c
ISMAXKERNEL = ../arm/imax.c ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c IDMAXKERNEL = idmax.c
ISMINKERNEL = ../arm/imin.c ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c IDMINKERNEL = idmin.c
SASUMKERNEL = ../arm/asum.c SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = dasum.c DASUMKERNEL = dasum.c

146
kernel/zarch/KERNEL.Z14 Normal file
View File

@ -0,0 +1,146 @@
# Kernel source selection for the Z14 target. Each <ROUTINE>KERNEL variable
# names the source file used for that routine; paths without a directory are
# relative to kernel/zarch, "../arm" and "../generic" point at shared C code.
# (presumably consumed by the kernel Makefiles -- verify against the build.)

# --- max / min of absolute values and of signed values ---
SAMAXKERNEL = samax.c
DAMAXKERNEL = damax.c
CAMAXKERNEL = camax.c
ZAMAXKERNEL = zamax.c
SAMINKERNEL = samin.c
DAMINKERNEL = damin.c
CAMINKERNEL = camin.c
ZAMINKERNEL = zamin.c
SMAXKERNEL = smax.c
DMAXKERNEL = dmax.c
SMINKERNEL = smin.c
DMINKERNEL = dmin.c
# --- index-returning variants of the routines above ---
ISAMAXKERNEL = isamax.c
IDAMAXKERNEL = idamax.c
ICAMAXKERNEL = icamax.c
IZAMAXKERNEL = izamax.c
ISAMINKERNEL = isamin.c
IDAMINKERNEL = idamin.c
ICAMINKERNEL = icamin.c
IZAMINKERNEL = izamin.c
ISMAXKERNEL = ismax.c
IDMAXKERNEL = idmax.c
ISMINKERNEL = ismin.c
IDMINKERNEL = idmin.c
# --- asum, axpy, copy, dot ---
SASUMKERNEL = sasum.c
DASUMKERNEL = dasum.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SCOPYKERNEL = scopy.c
DCOPYKERNEL = dcopy.c
CCOPYKERNEL = ccopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
DSDOTKERNEL = dsdot.c
# nrm2 has no zarch-specific source here; it uses the C files under ../arm.
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
# --- rot, scal, swap ---
SROTKERNEL = srot.c
DROTKERNEL = drot.c
CROTKERNEL = crot.c
ZROTKERNEL = zrot.c
SSCALKERNEL = sscal.c
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c
SSWAPKERNEL = sswap.c
DSWAPKERNEL = dswap.c
CSWAPKERNEL = cswap.c
ZSWAPKERNEL = zswap.c
# --- gemv (non-transposed / transposed) ---
SGEMVNKERNEL = sgemv_n_4.c
DGEMVNKERNEL = dgemv_n_4.c
CGEMVNKERNEL = cgemv_n_4.c
ZGEMVNKERNEL = zgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
DGEMVTKERNEL = dgemv_t_4.c
CGEMVTKERNEL = cgemv_t_4.c
ZGEMVTKERNEL = zgemv_t_4.c
# --- trmm: assembly sources; the same files are reused for gemm below ---
STRMMKERNEL = strmm8x4V.S
DTRMMKERNEL = trmm8x4V.S
CTRMMKERNEL = ctrmm4x4V.S
ZTRMMKERNEL = ztrmm4x4V.S
# --- gemm kernels plus the generic packing (copy) routines they use ---
# NOTE(review): the S/D/C *COPYOBJ names below hard-code ".o" while the Z
# entries use $(TSUFFIX).$(SUFFIX); confirm whether that difference is intended.
SGEMMKERNEL = strmm8x4V.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = gemm8x4V.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ctrmm4x4V.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ztrmm4x4V.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- trsm: all four precisions use the generic C kernels ---
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

241
kernel/zarch/camax.c Normal file
View File

@ -0,0 +1,241 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS: absolute-value function matching the precision selected by DOUBLE. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x, i): |x[i]| + |x[i + 1]|, i.e. |re| + |im| of the complex value
 * whose real part is stored at index i of the interleaved array x.
 * Both arguments are parenthesized so that expressions such as
 * CABS1(p + off, k) expand correctly.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * Vector kernel for the contiguous case: returns the largest |re| + |im|
 * found among the interleaved (re, im) pairs of x. The instructions used
 * operate on 32-bit floating-point elements.
 *
 * n: element count. n >> 5 is the loop trip count and every trip consumes
 *    256 bytes (32 complex elements), so n is expected to be a positive
 *    multiple of 32.
 * x: start of the interleaved data.
 *
 * NOTE(review): the memory input operand is typed FLOAT[n] although the loop
 * reads 2 * n floats; the "memory" clobber presumably covers the rest --
 * confirm.
 */
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amax;
__asm__ volatile (
/* Seed the running maximum v0 with |re| + |im| of the first four complex
   elements: v0 gathers the words at offsets 0/8/16/24 (real parts), v16 the
   words at offsets 4/12/20/28 (imaginary parts). */
"vlef %%v0,0(%2),0 \n\t"
"vlef %%v16,4(%2),0 \n\t"
"vlef %%v0,8(%2),1 \n\t"
"vlef %%v16,12(%2),1 \n\t"
"vlef %%v0,16(%2),2 \n\t"
"vlef %%v16,20(%2),2 \n\t"
"vlef %%v0,24(%2),3 \n\t"
"vlef %%v16,28(%2),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v16,%%v16 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
/* Build the vperm selector in v1: bytes 0-3, 8-11, 16-19, 24-27 of a
   concatenated register pair, i.e. the first word of every doubleword
   (the real parts). */
"vleib %%v1,0,0 \n\t"
"vleib %%v1,1,1 \n\t"
"vleib %%v1,2,2 \n\t"
"vleib %%v1,3,3 \n\t"
"vleib %%v1,8,4 \n\t"
"vleib %%v1,9,5 \n\t"
"vleib %%v1,10,6 \n\t"
"vleib %%v1,11,7 \n\t"
"vleib %%v1,16,8 \n\t"
"vleib %%v1,17,9 \n\t"
"vleib %%v1,18,10 \n\t"
"vleib %%v1,19,11 \n\t"
"vleib %%v1,24,12 \n\t"
"vleib %%v1,25,13 \n\t"
"vleib %%v1,26,14 \n\t"
"vleib %%v1,27,15 \n\t"
/* r0 = n / 32 (trip count), r1 = 0 (byte offset into x). */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* For each 32-byte pair of loads: vpkg keeps the second word of every
   doubleword (imaginary parts, odd-numbered result register) and vperm with
   v1 keeps the first word (real parts, even-numbered result register). */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"vpkg %%v17,%%v16,%%v2 \n\t"
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vpkg %%v19,%%v18,%%v2 \n\t"
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vpkg %%v21,%%v20,%%v2 \n\t"
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vpkg %%v23,%%v22,%%v2 \n\t"
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vpkg %%v25,%%v24,%%v2 \n\t"
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vpkg %%v27,%%v26,%%v2 \n\t"
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vpkg %%v29,%%v28,%%v2 \n\t"
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vpkg %%v31,%%v30,%%v2 \n\t"
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"
/* Absolute value of every real and imaginary part. */
"vflpsb %%v16,%%v16 \n\t"
"vflpsb %%v17,%%v17 \n\t"
"vflpsb %%v18,%%v18 \n\t"
"vflpsb %%v19,%%v19 \n\t"
"vflpsb %%v20,%%v20 \n\t"
"vflpsb %%v21,%%v21 \n\t"
"vflpsb %%v22,%%v22 \n\t"
"vflpsb %%v23,%%v23 \n\t"
"vflpsb %%v24,%%v24 \n\t"
"vflpsb %%v25,%%v25 \n\t"
"vflpsb %%v26,%%v26 \n\t"
"vflpsb %%v27,%%v27 \n\t"
"vflpsb %%v28,%%v28 \n\t"
"vflpsb %%v29,%%v29 \n\t"
"vflpsb %%v30,%%v30 \n\t"
"vflpsb %%v31,%%v31 \n\t"
/* |re| + |im| for each of the 32 elements (eight vectors of four). */
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v18,%%v18,%%v19 \n\t"
"vfasb %%v20,%%v20,%%v21 \n\t"
"vfasb %%v22,%%v22,%%v23 \n\t"
"vfasb %%v24,%%v24,%%v25 \n\t"
"vfasb %%v26,%%v26,%%v27 \n\t"
"vfasb %%v28,%%v28,%%v29 \n\t"
"vfasb %%v30,%%v30,%%v31 \n\t"
/* Pairwise lane-wise maximum of the eight vectors, folded into v0. */
"vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxsb %%v22,%%v22,%%v30,0 \n\t"
"vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v22,0 \n\t"
"vfmaxsb %%v16,%%v16,%%v18,0 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
/* Advance by 256 bytes and repeat until the trip count in r0 runs out. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Horizontal reduction of the four lanes of v0 into lane 0: shifting each
   doubleword left by 32 bits lines lane 1 up with lane 0 (and 3 with 2),
   then lane 2 is replicated and compared against lane 0. */
"veslg %%v16,%%v0,32 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"vrepf %%v16,%%v0,2 \n\t"
"wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
/* Copy the scalar result out of f0 into the output operand. */
"ler %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amax;
}
/*
 * Largest |re| + |im| over n complex elements of x.
 *
 * n     - number of complex elements.
 * x     - interleaved (re, im) storage.
 * inc_x - stride between consecutive elements, counted in complex elements.
 *
 * Returns 0 when n <= 0 or inc_x <= 0. For inc_x == 1 the largest multiple
 * of 32 elements is handed to camax_kernel_32 and the remainder is scanned
 * in scalar code; any other stride is scanned entirely in scalar code.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT cand;
    BLASLONG k;
    BLASLONG pos;
    BLASLONG step;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x != 1) {
        /* Strided access: start from element 0 and visit every element in order. */
        step = 2 * inc_x;
        best = CABS1(x, 0);
        pos = 0;
        for (k = 0; k < n; k++) {
            cand = CABS1(x, pos);
            if (cand > best)
                best = cand;
            pos += step;
        }
        return best;
    }

    /* Contiguous access: vector kernel on the 32-element-aligned prefix. */
    k = n & -32;
    if (k > 0) {
        best = camax_kernel_32(k, x);
    } else {
        /* Fewer than 32 elements: seed with the first one. */
        best = CABS1(x, 0);
        k = 1;
    }

    /* Scalar scan of whatever the kernel did not cover. */
    for (pos = 2 * k; k < n; k++, pos += 2) {
        cand = CABS1(x, pos);
        if (cand > best)
            best = cand;
    }
    return best;
}

241
kernel/zarch/camin.c Normal file
View File

@ -0,0 +1,241 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS: absolute-value function matching the precision selected by DOUBLE. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x, i): |x[i]| + |x[i + 1]|, i.e. |re| + |im| of the complex value
 * whose real part is stored at index i of the interleaved array x.
 * Both arguments are parenthesized so that expressions such as
 * CABS1(p + off, k) expand correctly.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * Vector kernel for the contiguous case: returns the smallest |re| + |im|
 * found among the interleaved (re, im) pairs of x. The instructions used
 * operate on 32-bit floating-point elements.
 *
 * n: element count. n >> 5 is the loop trip count and every trip consumes
 *    256 bytes (32 complex elements), so n is expected to be a positive
 *    multiple of 32.
 * x: start of the interleaved data.
 *
 * NOTE(review): the memory input operand is typed FLOAT[n] although the loop
 * reads 2 * n floats; the "memory" clobber presumably covers the rest --
 * confirm.
 */
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amin;
__asm__ volatile (
/* Seed the running minimum v0 with |re| + |im| of the first four complex
   elements: v0 gathers the words at offsets 0/8/16/24 (real parts), v16 the
   words at offsets 4/12/20/28 (imaginary parts). */
"vlef %%v0,0(%2),0 \n\t"
"vlef %%v16,4(%2),0 \n\t"
"vlef %%v0,8(%2),1 \n\t"
"vlef %%v16,12(%2),1 \n\t"
"vlef %%v0,16(%2),2 \n\t"
"vlef %%v16,20(%2),2 \n\t"
"vlef %%v0,24(%2),3 \n\t"
"vlef %%v16,28(%2),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v16,%%v16 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
/* Build the vperm selector in v1: bytes 0-3, 8-11, 16-19, 24-27 of a
   concatenated register pair, i.e. the first word of every doubleword
   (the real parts). */
"vleib %%v1,0,0 \n\t"
"vleib %%v1,1,1 \n\t"
"vleib %%v1,2,2 \n\t"
"vleib %%v1,3,3 \n\t"
"vleib %%v1,8,4 \n\t"
"vleib %%v1,9,5 \n\t"
"vleib %%v1,10,6 \n\t"
"vleib %%v1,11,7 \n\t"
"vleib %%v1,16,8 \n\t"
"vleib %%v1,17,9 \n\t"
"vleib %%v1,18,10 \n\t"
"vleib %%v1,19,11 \n\t"
"vleib %%v1,24,12 \n\t"
"vleib %%v1,25,13 \n\t"
"vleib %%v1,26,14 \n\t"
"vleib %%v1,27,15 \n\t"
/* r0 = n / 32 (trip count), r1 = 0 (byte offset into x). */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* For each 32-byte pair of loads: vpkg keeps the second word of every
   doubleword (imaginary parts, odd-numbered result register) and vperm with
   v1 keeps the first word (real parts, even-numbered result register). */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"vpkg %%v17,%%v16,%%v2 \n\t"
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vpkg %%v19,%%v18,%%v2 \n\t"
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vpkg %%v21,%%v20,%%v2 \n\t"
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vpkg %%v23,%%v22,%%v2 \n\t"
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vpkg %%v25,%%v24,%%v2 \n\t"
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vpkg %%v27,%%v26,%%v2 \n\t"
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vpkg %%v29,%%v28,%%v2 \n\t"
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vpkg %%v31,%%v30,%%v2 \n\t"
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"
/* Absolute value of every real and imaginary part. */
"vflpsb %%v16,%%v16 \n\t"
"vflpsb %%v17,%%v17 \n\t"
"vflpsb %%v18,%%v18 \n\t"
"vflpsb %%v19,%%v19 \n\t"
"vflpsb %%v20,%%v20 \n\t"
"vflpsb %%v21,%%v21 \n\t"
"vflpsb %%v22,%%v22 \n\t"
"vflpsb %%v23,%%v23 \n\t"
"vflpsb %%v24,%%v24 \n\t"
"vflpsb %%v25,%%v25 \n\t"
"vflpsb %%v26,%%v26 \n\t"
"vflpsb %%v27,%%v27 \n\t"
"vflpsb %%v28,%%v28 \n\t"
"vflpsb %%v29,%%v29 \n\t"
"vflpsb %%v30,%%v30 \n\t"
"vflpsb %%v31,%%v31 \n\t"
/* |re| + |im| for each of the 32 elements (eight vectors of four). */
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v18,%%v18,%%v19 \n\t"
"vfasb %%v20,%%v20,%%v21 \n\t"
"vfasb %%v22,%%v22,%%v23 \n\t"
"vfasb %%v24,%%v24,%%v25 \n\t"
"vfasb %%v26,%%v26,%%v27 \n\t"
"vfasb %%v28,%%v28,%%v29 \n\t"
"vfasb %%v30,%%v30,%%v31 \n\t"
/* Pairwise lane-wise minimum of the eight vectors, folded into v0. */
"vfminsb %%v16,%%v16,%%v24,0 \n\t"
"vfminsb %%v18,%%v18,%%v26,0 \n\t"
"vfminsb %%v20,%%v20,%%v28,0 \n\t"
"vfminsb %%v22,%%v22,%%v30,0 \n\t"
"vfminsb %%v16,%%v16,%%v20,0 \n\t"
"vfminsb %%v18,%%v18,%%v22,0 \n\t"
"vfminsb %%v16,%%v16,%%v18,0 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"
/* Advance by 256 bytes and repeat until the trip count in r0 runs out. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Horizontal reduction of the four lanes of v0 into lane 0: shifting each
   doubleword left by 32 bits lines lane 1 up with lane 0 (and 3 with 2),
   then lane 2 is replicated and compared against lane 0. Only lanes 0 and 2
   hold meaningful values after the first step. */
"veslg %%v16,%%v0,32 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"
"vrepf %%v16,%%v0,2 \n\t"
"wfminsb %%v0,%%v0,%%v16,0 \n\t"
/* Copy the scalar result out of f0 into the output operand. */
"ler %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amin;
}
/*
 * Smallest |re| + |im| over n complex elements of x.
 *
 * n     - number of complex elements.
 * x     - interleaved (re, im) storage.
 * inc_x - stride between consecutive elements, counted in complex elements.
 *
 * Returns 0 when n <= 0 or inc_x <= 0. For inc_x == 1 the largest multiple
 * of 32 elements is handed to camin_kernel_32 and the remainder is scanned
 * in scalar code; any other stride is scanned entirely in scalar code.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT lowest = 0.0;
    FLOAT cand;
    BLASLONG k;
    BLASLONG pos;
    BLASLONG step;

    if (n <= 0 || inc_x <= 0)
        return lowest;

    if (inc_x != 1) {
        /* Strided access: start from element 0 and visit every element in order. */
        step = 2 * inc_x;
        lowest = CABS1(x, 0);
        pos = 0;
        for (k = 0; k < n; k++) {
            cand = CABS1(x, pos);
            if (cand < lowest)
                lowest = cand;
            pos += step;
        }
        return lowest;
    }

    /* Contiguous access: vector kernel on the 32-element-aligned prefix. */
    k = n & -32;
    if (k > 0) {
        lowest = camin_kernel_32(k, x);
    } else {
        /* Fewer than 32 elements: seed with the first one. */
        lowest = CABS1(x, 0);
        k = 1;
    }

    /* Scalar scan of whatever the kernel did not cover. */
    for (pos = 2 * k; k < n; k++, pos += 2) {
        cand = CABS1(x, pos);
        if (cand < lowest)
            lowest = cand;
    }
    return lowest;
}

167
kernel/zarch/casum.c Normal file
View File

@ -0,0 +1,167 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS: absolute-value function matching the precision selected by DOUBLE
   (fabs when DOUBLE is defined, fabsf otherwise). */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel for the contiguous case: returns the sum of the absolute
 * values of all real and imaginary parts in x. The instructions used operate
 * on 32-bit floating-point elements.
 *
 * n: element count. n >> 5 is the loop trip count and every trip consumes
 *    256 bytes (32 complex elements), so n is expected to be a positive
 *    multiple of 32.
 * x: start of the interleaved (re, im) data; declared to the compiler as an
 *    input array of n * 2 FLOATs.
 */
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
/* Four independent vector accumulators, cleared to zero. */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
/* r0 = n / 32 (trip count), r1 = 0 (byte offset into x). */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First 128 bytes of the trip: load, take absolute values, accumulate. */
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* Second 128 bytes of the trip: same sequence. */
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* Advance by 256 bytes and repeat until the trip count in r0 runs out. */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
/* Combine the four accumulators into v0. */
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
/* Horizontal sum of v0: add lane 1 onto lane 0 and lane 3 onto lane 2 via
   a 32-bit doubleword shift, then add lane 2 onto lane 0 as scalars. */
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
/* Copy the scalar result out of f0 into the output operand. */
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
}
/*
 * Sum of |re| + |im| over n complex elements of x.
 *
 * n     - number of complex elements.
 * x     - interleaved (re, im) storage.
 * inc_x - stride between consecutive elements, counted in complex elements.
 *
 * Returns 0 when n <= 0 or inc_x <= 0. For inc_x == 1 the largest multiple
 * of 32 elements is summed by casum_kernel_32; everything else is summed in
 * scalar code, one element at a time in index order.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG done = 0;   /* elements already accounted for */
    BLASLONG pos = 0;    /* index of the next real part in x */
    BLASLONG step;       /* distance between real parts, in FLOATs */

    if (n <= 0 || inc_x <= 0)
        return total;

    if (inc_x == 1) {
        BLASLONG bulk = n & -32;

        step = 2;
        if (bulk > 0) {
            total = casum_kernel_32(bulk, x);
            done = bulk;
            pos = 2 * bulk;
        }
    } else {
        step = 2 * inc_x;
    }

    /* Scalar pass over the elements the kernel did not handle. */
    for (; done < n; done++, pos += step)
        total += ABS(x[pos]) + ABS(x[pos + 1]);

    return total;
}

174
kernel/zarch/caxpy.c Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector kernel for the contiguous case: y += alpha * x for interleaved
 * complex data (or the CONJ variant when CONJ is defined). The instructions
 * used operate on 32-bit floating-point elements.
 *
 * n:     element count. n >> 4 is the loop trip count and every trip handles
 *        128 bytes (16 complex elements) of x and of y, so n is expected to
 *        be a positive multiple of 16.
 * x:     input, read only.
 * y:     updated in place.
 * alpha: two FLOATs, alpha[0] = real part and alpha[1] = imaginary part.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
#if !defined(CONJ)
/* v0 = alpha[0] in every lane.
   v1 = (-alpha[1], alpha[1], -alpha[1], alpha[1]): lanes 0 and 2 are loaded
   and negated first, then lanes 1 and 3 are loaded un-negated. */
"vlrepf %%v0,0(%3) \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v1,4(%3),2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,4(%3),1 \n\t"
"vlef %%v1,4(%3),3 \n\t"
#else
/* v0 = (alpha[0], -alpha[0], alpha[0], -alpha[0]): lanes 1 and 3 are loaded
   and negated first, then lanes 0 and 2 are loaded un-negated.
   v1 = alpha[1] in every lane. */
"vlef %%v0,0(%3),1 \n\t"
"vlef %%v0,0(%3),3 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v0,0(%3),2 \n\t"
"vlrepf %%v1,4(%3) \n\t"
#endif
/* r0 = n / 16 (trip count), r1 = 0 (byte offset into x and y). */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* First 64 bytes: v16-v19 = x, v20-v23 = y. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
/* Rotating each doubleword by 32 bits swaps the real and imaginary word of
   every complex element of x. */
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
/* result = x * v0 + y, then result = swapped(x) * v1 + result. */
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"
/* Second 64 bytes: same sequence at offsets 64..127. */
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
/* Advance by 128 bytes and repeat until the trip count in r0 runs out. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * y += alpha * x on interleaved complex vectors, with alpha = da_r + i*da_i.
 * When CONJ is defined the alternative update formulas in the #else branch
 * are used instead.
 *
 * n            - number of complex elements; nothing is done when n <= 0.
 * da_r, da_i   - real and imaginary part of alpha.
 * x, inc_x     - input vector and its stride in complex elements.
 * y, inc_y     - vector updated in place and its stride in complex elements.
 * dummy0, dummy1, dummy, dummy2 - not used by this function.
 *
 * When both strides are 1, the largest multiple of 16 elements is handled by
 * caxpy_kernel_16 and the rest by the scalar loop. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    FLOAT da[2] __attribute__ ((aligned(16)));
    BLASLONG k = 0;               /* elements already processed */
    BLASLONG xpos = 0, ypos = 0;  /* index of the next real part in x / y */
    BLASLONG xstep, ystep;        /* distance between real parts, in FLOATs */

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        k = n & -16;
        if (k) {
            /* The kernel reads alpha from memory as {real, imag}. */
            da[0] = da_r;
            da[1] = da_i;
            caxpy_kernel_16(k, x, y, da);
            xpos = 2 * k;
            ypos = 2 * k;
        }
        xstep = 2;
        ystep = 2;
    } else {
        xstep = 2 * inc_x;
        ystep = 2 * inc_y;
    }

    /* Scalar pass over the elements the kernel did not handle. */
    for (; k < n; k++) {
#if !defined(CONJ)
        y[ypos] += (da_r * x[xpos] - da_i * x[xpos + 1]);
        y[ypos + 1] += (da_r * x[xpos + 1] + da_i * x[xpos]);
#else
        y[ypos] += (da_r * x[xpos] + da_i * x[xpos + 1]);
        y[ypos + 1] -= (da_r * x[xpos + 1] - da_i * x[xpos]);
#endif
        xpos += xstep;
        ypos += ystep;
    }
    return 0;
}

99
kernel/zarch/ccopy.c Normal file
View File

@ -0,0 +1,99 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Block-copy kernel for the contiguous case: copies x to y in 256-byte
 * chunks using the MVC instruction.
 *
 * n: element count. n >> 5 is the loop trip count and every trip copies
 *    256 bytes, so n is expected to be a positive multiple of 32 elements of
 *    8 bytes each (the operands are declared as n * 2 FLOATs).
 * x: source, read only.
 * y: destination.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* r1 = running source address, r2 = running destination address,
   r0 = n / 32 (trip count). */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,5 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* Copy 256 bytes from (r1) to (r2), then advance both pointers. */
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
}
/*
 * Copy n complex elements from x to y.
 *
 * n        - number of complex elements; nothing is done when n <= 0.
 * x, inc_x - source vector (interleaved re, im) and its stride in complex
 *            elements.
 * y, inc_y - destination vector and its stride in complex elements.
 *
 * When both strides are 1, the largest multiple of 32 elements is copied by
 * ccopy_kernel_32 and the remainder element by element. Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;

    if (n <= 0)
        return 0;

    if ((inc_x == 1) && (inc_y == 1)) {
        BLASLONG n1 = n & -32;

        if (n1 > 0) {
            ccopy_kernel_32(n1, x, y);
            i = n1;
            ix = n1 * 2;
            iy = n1 * 2;
        }

        /* Scalar tail. The source is indexed with ix (not iy) so that reads
           always follow the source index, as in the strided loop below. */
        while (i < n) {
            y[iy] = x[ix];
            y[iy + 1] = x[ix + 1];
            ix += 2;
            iy += 2;
            i++;
        }
    } else {
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;

        while (i < n) {
            y[iy] = x[ix];
            y[iy + 1] = x[ix + 1];
            ix += inc_x2;
            iy += inc_y2;
            i++;
        }
    }
    return 0;
}

182
kernel/zarch/cdot.c Normal file
View File

@ -0,0 +1,182 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector kernel for the unit-stride complex dot product.
 *
 * n    : number of complex elements to process. The trip count is n >> 4 and
 *        every trip consumes 128 bytes (16 complex values) from x and from y.
 *        The loop body runs before the count is tested, so n is expected to
 *        be a non-zero multiple of 16.
 * x, y : input vectors, stored as interleaved (real, imag) pairs.
 * d    : receives four partial sums (stored, not accumulated):
 *          d[0] = sum(x_re * y_re)    d[1] = sum(x_im * y_im)
 *          d[2] = sum(x_re * y_im)    d[3] = sum(x_im * y_re)
 *        The caller combines them into the final complex result.
 */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
/* v24..v31 are the eight accumulators; clear them. */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"vzero %%v28 \n\t"
"vzero %%v29 \n\t"
"vzero %%v30 \n\t"
"vzero %%v31 \n\t"
/* r0 = trip count (n / 16), r1 = running byte offset into x and y. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First half of the trip: 64 bytes of x into v16..v19, of y into v0..v3. */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
/* Rotating each doubleword by 32 bits swaps real and imag of every x element. */
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
"verllg %%v22,%%v18,32 \n\t"
"verllg %%v23,%%v19,32 \n\t"
/* Even accumulators take x*y lane by lane, odd ones take swapped(x)*y. */
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"
/* Second half of the trip: same sequence on the next 64 bytes. */
"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
"verllg %%v22,%%v18,32 \n\t"
"verllg %%v23,%%v19,32 \n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"
/* Advance by 128 bytes and loop while the decremented count is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* Reduce the even accumulators into v24, then fold its upper doubleword
   onto the lower one so lanes 0 and 1 hold the totals. */
"vfasb %%v24,%%v24,%%v26 \n\t"
"vfasb %%v24,%%v24,%%v28 \n\t"
"vfasb %%v24,%%v24,%%v30 \n\t"
"vrepg %%v26,%%v24,1 \n\t"
"vfasb %%v24,%%v24,%%v26 \n\t"
/* Same reduction for the odd (swapped) accumulators into v25. */
"vfasb %%v25,%%v25,%%v27 \n\t"
"vfasb %%v25,%%v25,%%v29 \n\t"
"vfasb %%v25,%%v25,%%v31 \n\t"
"vrepg %%v27,%%v25,1 \n\t"
"vfasb %%v25,%%v25,%%v27 \n\t"
/* Store d[0], d[1] from v24 lanes 0, 1 and d[2], d[3] from v25 lanes 1, 0. */
"vstef %%v24,0(%3),0 \n\t"
"vstef %%v24,4(%3),1 \n\t"
"vstef %%v25,8(%3),1 \n\t"
"vstef %%v25,12(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Complex dot product of x and y over n complex elements.
 *
 * n            : number of complex elements; n <= 0 yields (0, 0).
 * x, y         : vectors stored as interleaved (real, imag) pairs.
 * inc_x, inc_y : strides in complex elements.
 *
 * Four real partial sums are accumulated in dot[]:
 *   dot[0] = sum(x_re*y_re)  dot[1] = sum(x_im*y_im)
 *   dot[2] = sum(x_re*y_im)  dot[3] = sum(x_im*y_re)
 * Without CONJ the result is (dot[0] - dot[1], dot[2] + dot[3]);
 * with CONJ it is (dot[0] + dot[1], dot[2] - dot[3]).
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG i;
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);
    }

    if ((inc_x == 1) && (inc_y == 1)) {
        /* The vector kernel handles the largest multiple of 16 elements and
           overwrites dot[]; the scalar loop below adds the remainder. */
        BLASLONG n1 = n & -16;
        if (n1)
            cdot_kernel_16(n1, x, y, dot);
        i = n1;
        BLASLONG j = i * 2;
        while (i < n) {
            dot[0] += x[j] * y[j];
            dot[1] += x[j + 1] * y[j + 1];
            dot[2] += x[j] * y[j + 1];
            dot[3] += x[j + 1] * y[j];
            j += 2;
            i++;
        }
    } else {
        BLASLONG ix = 0;
        BLASLONG iy = 0;

        /* Convert strides from complex elements to FLOATs. Multiply instead
           of shifting: nothing here rules out a negative stride, and
           left-shifting a negative signed value is undefined in C. */
        inc_x *= 2;
        inc_y *= 2;

        for (i = 0; i < n; i++) {
            dot[0] += x[ix] * y[iy];
            dot[1] += x[ix + 1] * y[iy + 1];
            dot[2] += x[ix] * y[iy + 1];
            dot[3] += x[ix + 1] * y[iy];
            ix += inc_x;
            iy += inc_y;
        }
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];
#endif
    return (result);
}

743
kernel/zarch/cgemv_n_4.c Normal file
View File

@ -0,0 +1,743 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#define NBMAX 2048
/*
 * Adds four columns of A, each multiplied by one complex element of x, to y.
 *
 * n  : number of complex rows. The trip count is n >> 1 and every trip
 *      updates 16 bytes (2 complex values) of y; the loop body runs before
 *      the count is tested, so n is expected to be non-zero and even.
 * ap : ap[0..3] point to the four columns (interleaved real/imag).
 * x  : four complex multipliers (8 FLOATs).
 * y  : accumulator, updated in place.
 *
 * Per row and column, with a = (a_re, a_im) and x = (x_re, x_im):
 *   first #if branch : y_re += a_re*x_re - a_im*x_im, y_im += a_re*x_im + a_im*x_re
 *   #else branch     : y_re += a_re*x_re + a_im*x_im, y_im += a_re*x_im - a_im*x_re
 */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* v16..v19 = (x_re, x_im, x_re, x_im) for x[0]..x[3]. */
"vlrepg %%v16,0(%5) \n\t"
"vlrepg %%v17,8(%5) \n\t"
"vlrepg %%v18,16(%5) \n\t"
"vlrepg %%v19,24(%5) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
/* v20..v23 = (-x_im, x_re, -x_im, x_re): the imag part is loaded and
   negated first, then the real part is inserted into lanes 1 and 3. */
"vlef %%v20,4(%5),0 \n\t"
"vlef %%v20,4(%5),2 \n\t"
"vflcsb %%v20,%%v20 \n\t"
"vlef %%v20,0(%5),1 \n\t"
"vlef %%v20,0(%5),3 \n\t"
"vlef %%v21,12(%5),0 \n\t"
"vlef %%v21,12(%5),2 \n\t"
"vflcsb %%v21,%%v21 \n\t"
"vlef %%v21,8(%5),1 \n\t"
"vlef %%v21,8(%5),3 \n\t"
"vlef %%v22,20(%5),0 \n\t"
"vlef %%v22,20(%5),2 \n\t"
"vflcsb %%v22,%%v22 \n\t"
"vlef %%v22,16(%5),1 \n\t"
"vlef %%v22,16(%5),3 \n\t"
"vlef %%v23,28(%5),0 \n\t"
"vlef %%v23,28(%5),2 \n\t"
"vflcsb %%v23,%%v23 \n\t"
"vlef %%v23,24(%5),1 \n\t"
"vlef %%v23,24(%5),3 \n\t"
#else
/* v20..v23 = (x_im, -x_re, x_im, -x_re): the real part is loaded and
   negated first, then the imag part is inserted into lanes 0 and 2. */
"vlef %%v20,0(%5),1 \n\t"
"vlef %%v20,0(%5),3 \n\t"
"vflcsb %%v20,%%v20 \n\t"
"vlef %%v20,4(%5),0 \n\t"
"vlef %%v20,4(%5),2 \n\t"
"vlef %%v21,8(%5),1 \n\t"
"vlef %%v21,8(%5),3 \n\t"
"vflcsb %%v21,%%v21 \n\t"
"vlef %%v21,12(%5),0 \n\t"
"vlef %%v21,12(%5),2 \n\t"
"vlef %%v22,16(%5),1 \n\t"
"vlef %%v22,16(%5),3 \n\t"
"vflcsb %%v22,%%v22 \n\t"
"vlef %%v22,20(%5),0 \n\t"
"vlef %%v22,20(%5),2 \n\t"
"vlef %%v23,24(%5),1 \n\t"
"vlef %%v23,24(%5),3 \n\t"
"vflcsb %%v23,%%v23 \n\t"
"vlef %%v23,28(%5),0 \n\t"
"vlef %%v23,28(%5),2 \n\t"
#endif
/* r1 = running byte offset, r0 = trip count (n / 2). */
"xgr %%r1,%%r1 \n\t"
"srlg %%r0,%0,1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
/* Columns 0 and 1: even register = (a_re, a_re) for two rows,
   odd register = (a_im, a_im) for the same two rows. */
"vlef %%v24,0(%%r1,%1),0 \n\t"
"vlef %%v24,0(%%r1,%1),1 \n\t"
"vlef %%v24,8(%%r1,%1),2 \n\t"
"vlef %%v24,8(%%r1,%1),3 \n\t"
"vlef %%v25,4(%%r1,%1),0 \n\t"
"vlef %%v25,4(%%r1,%1),1 \n\t"
"vlef %%v25,12(%%r1,%1),2 \n\t"
"vlef %%v25,12(%%r1,%1),3 \n\t"
"vlef %%v26,0(%%r1,%2),0 \n\t"
"vlef %%v26,0(%%r1,%2),1 \n\t"
"vlef %%v26,8(%%r1,%2),2 \n\t"
"vlef %%v26,8(%%r1,%2),3 \n\t"
"vlef %%v27,4(%%r1,%2),0 \n\t"
"vlef %%v27,4(%%r1,%2),1 \n\t"
"vlef %%v27,12(%%r1,%2),2 \n\t"
"vlef %%v27,12(%%r1,%2),3 \n\t"
/* v0 = two complex values of y; accumulate columns 0 and 1 into it. */
"vl %%v0,0(%%r1,%6) \n\t"
"vfmasb %%v0,%%v24,%%v16,%%v0 \n\t"
"vfmasb %%v0,%%v25,%%v20,%%v0 \n\t"
"vfmasb %%v0,%%v26,%%v17,%%v0 \n\t"
"vfmasb %%v0,%%v27,%%v21,%%v0 \n\t"
/* Columns 2 and 3, same lane layout. */
"vlef %%v28,0(%%r1,%3),0 \n\t"
"vlef %%v28,0(%%r1,%3),1 \n\t"
"vlef %%v28,8(%%r1,%3),2 \n\t"
"vlef %%v28,8(%%r1,%3),3 \n\t"
"vlef %%v29,4(%%r1,%3),0 \n\t"
"vlef %%v29,4(%%r1,%3),1 \n\t"
"vlef %%v29,12(%%r1,%3),2 \n\t"
"vlef %%v29,12(%%r1,%3),3 \n\t"
"vlef %%v30,0(%%r1,%4),0 \n\t"
"vlef %%v30,0(%%r1,%4),1 \n\t"
"vlef %%v30,8(%%r1,%4),2 \n\t"
"vlef %%v30,8(%%r1,%4),3 \n\t"
"vlef %%v31,4(%%r1,%4),0 \n\t"
"vlef %%v31,4(%%r1,%4),1 \n\t"
"vlef %%v31,12(%%r1,%4),2 \n\t"
"vlef %%v31,12(%%r1,%4),3 \n\t"
"vfmasb %%v0,%%v28,%%v18,%%v0 \n\t"
"vfmasb %%v0,%%v29,%%v22,%%v0 \n\t"
"vfmasb %%v0,%%v30,%%v19,%%v0 \n\t"
"vfmasb %%v0,%%v31,%%v23,%%v0 \n\t"
/* Write the two updated y values back and advance by 16 bytes. */
"vst %%v0,0(%%r1,%6) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,0b \n\t"
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Two-column variant of the column-accumulation kernel: adds ap[0]*x[0] and
 * ap[1]*x[1] (complex products) to y.
 *
 * n  : number of complex rows. The trip count is n >> 1 and every trip
 *      updates 2 complex values of y; the loop body runs before the count
 *      is tested, so n is expected to be non-zero and even.
 * ap : ap[0..1] point to the two columns (interleaved real/imag).
 * x  : two complex multipliers (4 FLOATs).
 * y  : accumulator, updated in place.
 *
 * The #if / #else branches only differ in which half of the x-derived
 * multiplier is negated, i.e. in the sign of the a_im*x_im and a_im*x_re
 * terms.
 */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* v16, v17 = (x_re, x_im, x_re, x_im) for x[0], x[1]. */
"vlrepg %%v16,0(%3) \n\t"
"vlrepg %%v17,8(%3) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
/* v18, v19 = (-x_im, x_re, -x_im, x_re). */
"vlef %%v18,4(%3),0 \n\t"
"vlef %%v18,4(%3),2 \n\t"
"vflcsb %%v18,%%v18 \n\t"
"vlef %%v18,0(%3),1 \n\t"
"vlef %%v18,0(%3),3 \n\t"
"vlef %%v19,12(%3),0 \n\t"
"vlef %%v19,12(%3),2 \n\t"
"vflcsb %%v19,%%v19 \n\t"
"vlef %%v19,8(%3),1 \n\t"
"vlef %%v19,8(%3),3 \n\t"
#else
/* v18, v19 = (x_im, -x_re, x_im, -x_re). */
"vlef %%v18,0(%3),1 \n\t"
"vlef %%v18,0(%3),3 \n\t"
"vflcsb %%v18,%%v18 \n\t"
"vlef %%v18,4(%3),0 \n\t"
"vlef %%v18,4(%3),2 \n\t"
"vlef %%v19,8(%3),1 \n\t"
"vlef %%v19,8(%3),3 \n\t"
"vflcsb %%v19,%%v19 \n\t"
"vlef %%v19,12(%3),0 \n\t"
"vlef %%v19,12(%3),2 \n\t"
#endif
/* r1 = running byte offset, r0 = trip count (n / 2). */
"xgr %%r1,%%r1 \n\t"
"srlg %%r0,%0,1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"
/* Per column: even register = duplicated real parts of two rows,
   odd register = duplicated imag parts of the same rows. */
"vlef %%v20,0(%%r1,%1),0 \n\t"
"vlef %%v20,0(%%r1,%1),1 \n\t"
"vlef %%v20,8(%%r1,%1),2 \n\t"
"vlef %%v20,8(%%r1,%1),3 \n\t"
"vlef %%v21,4(%%r1,%1),0 \n\t"
"vlef %%v21,4(%%r1,%1),1 \n\t"
"vlef %%v21,12(%%r1,%1),2 \n\t"
"vlef %%v21,12(%%r1,%1),3 \n\t"
"vlef %%v22,0(%%r1,%2),0 \n\t"
"vlef %%v22,0(%%r1,%2),1 \n\t"
"vlef %%v22,8(%%r1,%2),2 \n\t"
"vlef %%v22,8(%%r1,%2),3 \n\t"
"vlef %%v23,4(%%r1,%2),0 \n\t"
"vlef %%v23,4(%%r1,%2),1 \n\t"
"vlef %%v23,12(%%r1,%2),2 \n\t"
"vlef %%v23,12(%%r1,%2),3 \n\t"
/* Load two y values, accumulate both columns, store back. */
"vl %%v0,0(%%r1,%4) \n\t"
"vfmasb %%v0,%%v20,%%v16,%%v0 \n\t"
"vfmasb %%v0,%%v21,%%v18,%%v0 \n\t"
"vfmasb %%v0,%%v22,%%v17,%%v0 \n\t"
"vfmasb %%v0,%%v23,%%v19,%%v0 \n\t"
"vst %%v0,0(%%r1,%4) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,0b \n\t"
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * Single-column variant of the column-accumulation kernel: adds the complex
 * product ap[i] * x[0] to y[i] for every row i.
 *
 * n  : number of complex rows. The trip count is n >> 1 and every trip
 *      updates 2 complex values of y; the loop body runs before the count
 *      is tested, so n is expected to be non-zero and even.
 * ap : the column (interleaved real/imag). Note this is a plain pointer,
 *      not an array of column pointers as in the 4x4 / 4x2 kernels.
 * x  : one complex multiplier (2 FLOATs).
 * y  : accumulator, updated in place.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* v16 = (x_re, x_im, x_re, x_im). */
"vlrepg %%v16,0(%2) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
/* v17 = (-x_im, x_re, -x_im, x_re). */
"vlef %%v17,4(%2),0 \n\t"
"vlef %%v17,4(%2),2 \n\t"
"vflcsb %%v17,%%v17 \n\t"
"vlef %%v17,0(%2),1 \n\t"
"vlef %%v17,0(%2),3 \n\t"
#else
/* v17 = (x_im, -x_re, x_im, -x_re). */
"vlef %%v17,0(%2),1 \n\t"
"vlef %%v17,0(%2),3 \n\t"
"vflcsb %%v17,%%v17 \n\t"
"vlef %%v17,4(%2),0 \n\t"
"vlef %%v17,4(%2),2 \n\t"
#endif
/* r1 = running byte offset, r0 = trip count (n / 2). */
"xgr %%r1,%%r1 \n\t"
"srlg %%r0,%0,1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* v18 = duplicated real parts of two rows, v19 = duplicated imag parts. */
"vlef %%v18,0(%%r1,%1),0 \n\t"
"vlef %%v18,0(%%r1,%1),1 \n\t"
"vlef %%v18,8(%%r1,%1),2 \n\t"
"vlef %%v18,8(%%r1,%1),3 \n\t"
"vlef %%v19,4(%%r1,%1),0 \n\t"
"vlef %%v19,4(%%r1,%1),1 \n\t"
"vlef %%v19,12(%%r1,%1),2 \n\t"
"vlef %%v19,12(%%r1,%1),3 \n\t"
/* Load two y values, accumulate, store back. */
"vl %%v0,0(%%r1,%3) \n\t"
"vfmasb %%v0,%%v18,%%v16,%%v0 \n\t"
"vfmasb %%v0,%%v19,%%v17,%%v0 \n\t"
"vst %%v0,0(%%r1,%3) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,0b \n\t"
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19"
);
}
/*
 * Vector form of dest += alpha * src for contiguous complex data.
 *
 * n       : number of complex elements. The trip count is n >> 2 and every
 *           trip handles 32 bytes (4 complex values); the loop body runs
 *           before the count is tested, so n is expected to be a non-zero
 *           multiple of 4.
 * src     : input values (interleaved real/imag).
 * dest    : accumulator, updated in place.
 * alpha_r, alpha_i : real and imaginary part of the scale factor.
 *
 * Per element, with s = (s_re, s_im):
 *   without XCONJ : d_re += alpha_r*s_re - alpha_i*s_im
 *                   d_im += alpha_r*s_im + alpha_i*s_re
 *   with XCONJ    : d_re += alpha_r*s_re + alpha_i*s_im
 *                   d_im += -alpha_r*s_im + alpha_i*s_re
 */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i)
{
__asm__ volatile (
#if !defined(XCONJ)
/* v0 = alpha_r in all lanes, v1 = (-alpha_i, alpha_i, -alpha_i, alpha_i). */
"vlrepf %%v0,%3 \n\t"
"vlef %%v1,%4,0 \n\t"
"vlef %%v1,%4,2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,%4,1 \n\t"
"vlef %%v1,%4,3 \n\t"
#else
/* v0 = (alpha_r, -alpha_r, alpha_r, -alpha_r), v1 = alpha_i in all lanes. */
"vlef %%v0,%3,1 \n\t"
"vlef %%v0,%3,3 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,%3,0 \n\t"
"vlef %%v0,%3,2 \n\t"
"vlrepf %%v1,%4 \n\t"
#endif
/* r1 = running byte offset, r0 = trip count (n / 4). */
"xgr %%r1,%%r1 \n\t"
"srlg %%r0,%0,2 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%2) \n\t"
/* v16, v17 = four src values; v18, v19 = the matching dest values. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,0(%%r1,%2) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
/* v20, v21 = src with real and imag swapped inside every element. */
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
/* result = dest + src*v0 + swapped(src)*v1 */
"vfmasb %%v22,%%v16,%%v0,%%v18 \n\t"
"vfmasb %%v23,%%v17,%%v0,%%v19 \n\t"
"vfmasb %%v22,%%v20,%%v1,%%v22 \n\t"
"vfmasb %%v23,%%v21,%%v1,%%v23 \n\t"
"vst %%v22,0(%%r1,%2) \n\t"
"vst %%v23,16(%%r1,%2) \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * dest += alpha * src for n complex elements.
 *
 * src is contiguous (2 FLOATs per element); dest advances by inc_dest FLOATs
 * per element. When inc_dest is 2 the work is handed to add_y_4, otherwise
 * the elements are processed one by one. XCONJ selects the sign pattern of
 * the product.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i)
{
    BLASLONG k;
    FLOAT re;
    FLOAT im;

    /* Contiguous destination: use the vector routine. */
    if ( inc_dest == 2 )
    {
        add_y_4(n, src, dest, alpha_r, alpha_i);
        return;
    }

    /* Strided destination: scalar loop. */
    for ( k = 0; k < n; k++, src += 2, dest += inc_dest )
    {
#if !defined(XCONJ)
        re = alpha_r * src[0] - alpha_i * src[1];
        im = alpha_r * src[1] + alpha_i * src[0];
#else
        re = alpha_r * src[0] + alpha_i * src[1];
        im = -alpha_r * src[1] + alpha_i * src[0];
#endif
        dest[0] += re;
        dest[1] += im;
    }
}
/*
 * Driver for the column-wise complex matrix-vector update: for every row i,
 * y[i] += alpha * sum_j a[i, j] * x[j], with the sign pattern of the complex
 * products selected by CONJ / XCONJ.
 *
 * m, n    : number of rows / columns processed; returns immediately if
 *           either is < 1.
 * dummy1  : unused.
 * alpha_r, alpha_i : real / imaginary part of the scale factor.
 * a       : matrix; rows of one column are contiguous, consecutive columns
 *           are lda complex elements apart.
 * x, inc_x: input vector and its stride in complex elements.
 * y, inc_y: output vector and its stride in complex elements.
 * buffer  : scratch space; up to NBMAX complex values are cleared and used
 *           as a per-block accumulator.
 *
 * Rows are handled in blocks of at most NBMAX (block sizes are multiples
 * of 4) through the vector kernels; the last m % 4 rows are handled by the
 * scalar code at the end. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
/* From here on strides and lda are counted in FLOATs (2 per complex value). */
inc_x *= 2;
inc_y *= 2;
lda *= 2;
lda4 = 4 * lda;
/* n1 groups of four columns, n2 leftover columns. */
n1 = n / 4 ;
n2 = n % 4 ;
/* m3 = rows left for the scalar tail, m1 = rows covered by the blocked loop,
   m2 = size of the final partial block (0 if there is none). */
m3 = m % 4;
m1 = m - ( m % 4 );
m2 = (m % NBMAX) - (m % 4) ;
y_ptr = y;
BLASLONG NB = NBMAX;
/* Full blocks keep NB == NBMAX; a final partial block sets NB = m2, which
   also terminates the loop after that pass. */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
x_ptr = x;
/* Clear the accumulator: NB entries of 8 bytes each
   (presumably 2 * sizeof(FLOAT) with a 4-byte FLOAT -- TODO confirm). */
memset(ybuffer,0,NB*8);
if ( inc_x == 2 )
{
/* x is contiguous: the kernels read it in place. */
for( i = 0; i < n1 ; i++)
{
cgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 8;
}
if ( n2 & 2 )
{
cgemv_kernel_4x2(NB,ap,x_ptr,ybuffer);
x_ptr += 4;
a_ptr += 2 * lda;
}
if ( n2 & 1 )
{
cgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer);
/* No pointer update needed: this is the last column. */
}
}
else
{
/* x is strided: gather the needed elements into xbuffer first. */
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
xbuffer[3] = x_ptr[1];
x_ptr += inc_x;
xbuffer[4] = x_ptr[0];
xbuffer[5] = x_ptr[1];
x_ptr += inc_x;
xbuffer[6] = x_ptr[0];
xbuffer[7] = x_ptr[1];
x_ptr += inc_x;
cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
/* Leftover columns are handled one at a time in this branch. */
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += 1 * lda;
}
}
/* y += alpha * ybuffer for this block, then move on to the next NB rows. */
add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
a += 2 * NB;
y_ptr += NB * inc_y;
}
/* Scalar tail for the remaining m3 (1, 2 or 3) rows. a and y_ptr now point
   at the first remaining row. */
if ( m3 == 0 ) return(0);
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
/* lda == 2 (in FLOATs) with contiguous x: fixed strides, two columns per pass. */
if ( lda == 2 && inc_x == 2 )
{
for( i=0 ; i < (n & -2); i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif
a_ptr += 4;
x_ptr += 4;
}
/* Odd trailing column, if any. */
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += 2;
x_ptr += 2;
}
}
else
{
/* General strides. */
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
/* y += alpha * temp, with the XCONJ sign pattern when requested. */
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
/* One (real, imag) accumulator pair per remaining row. */
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
/* lda == 4 (in FLOATs) with contiguous x: fixed strides, two columns per pass. */
if ( lda == 4 && inc_x == 2 )
{
for( i = 0; i < (n & -2); i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif
a_ptr += 8;
x_ptr += 4;
}
/* Odd trailing column, if any. */
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
a_ptr += 4;
x_ptr += 2;
}
}
else
{
/* General strides. */
for( i=0 ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
/* y += alpha * temp for both rows. */
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
return(0);
}
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
/* One (real, imag) accumulator pair per remaining row. */
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
FLOAT temp_r2 = 0.0;
FLOAT temp_i2 = 0.0;
/* lda == 6 (in FLOATs) with contiguous x: same arithmetic as the general
   loop below, but with constant strides. */
if ( lda == 6 && inc_x == 2 )
{
for( i=0 ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
a_ptr += 6;
x_ptr += 2;
}
}
else
{
/* General strides. */
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
/* y += alpha * temp for all three rows. */
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
return(0);
}
return(0);
}

671
kernel/zarch/cgemv_t_4.c Normal file
View File

@ -0,0 +1,671 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
/*
 * Computes four complex dot products t[k] = sum_i ap[k][i] * x[i]
 * (k = 0..3) and then updates y[k] += alpha * t[k].
 *
 * n     : number of complex elements per column. The trip count is n >> 1
 *         and every trip consumes 16 bytes (2 complex values) of x and of
 *         each column; the loop body runs before the count is tested, so n
 *         is expected to be non-zero and even.
 * ap    : ap[0..3] point to the four columns (interleaved real/imag).
 * x     : contiguous input vector.
 * y     : four contiguous complex outputs (8 FLOATs), updated in place.
 * alpha : alpha[0] = real part, alpha[1] = imaginary part.
 *
 * CONJ / XCONJ choose the sign pattern of the a*x products; XCONJ alone
 * chooses the sign pattern of the final multiplication by alpha.
 */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* v16..v19: one accumulator per column. */
"vzero %%v16 \n\t"
"vzero %%v17 \n\t"
"vzero %%v18 \n\t"
"vzero %%v19 \n\t"
/* r1 = running byte offset, r0 = trip count (n / 2). */
"xgr %%r1,%%r1 \n\t"
"srlg %%r0,%0,1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%5) \n\t"
/* v20 = two x values as stored: (x0_re, x0_im, x1_re, x1_im). */
"vl %%v20,0(%%r1,%5) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
/* v21 = (-x0_im, x0_re, -x1_im, x1_re). */
"vlef %%v21,4(%%r1,%5),0 \n\t"
"vlef %%v21,12(%%r1,%5),2 \n\t"
"vflcsb %%v21,%%v21 \n\t"
"vlef %%v21,0(%%r1,%5),1 \n\t"
"vlef %%v21,8(%%r1,%5),3 \n\t"
#else
/* v21 = (x0_im, -x0_re, x1_im, -x1_re). */
"vlef %%v21,0(%%r1,%5),1 \n\t"
"vlef %%v21,8(%%r1,%5),3 \n\t"
"vflcsb %%v21,%%v21 \n\t"
"vlef %%v21,4(%%r1,%5),0 \n\t"
"vlef %%v21,12(%%r1,%5),2 \n\t"
#endif
/* Columns 0 and 1: even register = duplicated real parts of two elements,
   odd register = duplicated imag parts of the same elements. */
"vlef %%v22,0(%%r1,%1),0 \n\t"
"vlef %%v22,0(%%r1,%1),1 \n\t"
"vlef %%v22,8(%%r1,%1),2 \n\t"
"vlef %%v22,8(%%r1,%1),3 \n\t"
"vlef %%v23,4(%%r1,%1),0 \n\t"
"vlef %%v23,4(%%r1,%1),1 \n\t"
"vlef %%v23,12(%%r1,%1),2 \n\t"
"vlef %%v23,12(%%r1,%1),3 \n\t"
"vlef %%v24,0(%%r1,%2),0 \n\t"
"vlef %%v24,0(%%r1,%2),1 \n\t"
"vlef %%v24,8(%%r1,%2),2 \n\t"
"vlef %%v24,8(%%r1,%2),3 \n\t"
"vlef %%v25,4(%%r1,%2),0 \n\t"
"vlef %%v25,4(%%r1,%2),1 \n\t"
"vlef %%v25,12(%%r1,%2),2 \n\t"
"vlef %%v25,12(%%r1,%2),3 \n\t"
/* acc += a_re * v20 + a_im * v21 */
"vfmasb %%v16,%%v22,%%v20,%%v16 \n\t"
"vfmasb %%v16,%%v23,%%v21,%%v16 \n\t"
"vfmasb %%v17,%%v24,%%v20,%%v17 \n\t"
"vfmasb %%v17,%%v25,%%v21,%%v17 \n\t"
/* Columns 2 and 3, same lane layout. */
"vlef %%v26,0(%%r1,%3),0 \n\t"
"vlef %%v26,0(%%r1,%3),1 \n\t"
"vlef %%v26,8(%%r1,%3),2 \n\t"
"vlef %%v26,8(%%r1,%3),3 \n\t"
"vlef %%v27,4(%%r1,%3),0 \n\t"
"vlef %%v27,4(%%r1,%3),1 \n\t"
"vlef %%v27,12(%%r1,%3),2 \n\t"
"vlef %%v27,12(%%r1,%3),3 \n\t"
"vlef %%v28,0(%%r1,%4),0 \n\t"
"vlef %%v28,0(%%r1,%4),1 \n\t"
"vlef %%v28,8(%%r1,%4),2 \n\t"
"vlef %%v28,8(%%r1,%4),3 \n\t"
"vlef %%v29,4(%%r1,%4),0 \n\t"
"vlef %%v29,4(%%r1,%4),1 \n\t"
"vlef %%v29,12(%%r1,%4),2 \n\t"
"vlef %%v29,12(%%r1,%4),3 \n\t"
"vfmasb %%v18,%%v26,%%v20,%%v18 \n\t"
"vfmasb %%v18,%%v27,%%v21,%%v18 \n\t"
"vfmasb %%v19,%%v28,%%v20,%%v19 \n\t"
"vfmasb %%v19,%%v29,%%v21,%%v19 \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,0b \n\t"
/* Fold the upper doubleword of every accumulator onto the lower one, so
   lanes 0 and 1 hold the complete (real, imag) sum. */
"vrepg %%v20,%%v16,1 \n\t"
"vrepg %%v21,%%v17,1 \n\t"
"vrepg %%v22,%%v18,1 \n\t"
"vrepg %%v23,%%v19,1 \n\t"
"vfasb %%v16,%%v16,%%v20 \n\t"
"vfasb %%v17,%%v17,%%v21 \n\t"
"vfasb %%v18,%%v18,%%v22 \n\t"
"vfasb %%v19,%%v19,%%v23 \n\t"
/* Pack: v16 = (t0, t1), v17 = (t2, t3); v18, v19 = the same with real and
   imag swapped inside every element. */
"vmrhg %%v16,%%v16,%%v17 \n\t"
"vmrhg %%v17,%%v18,%%v19 \n\t"
"verllg %%v18,%%v16,32 \n\t"
"verllg %%v19,%%v17,32 \n\t"
#if !defined(XCONJ)
/* v20 = alpha_r in all lanes, v21 = (-alpha_i, alpha_i, -alpha_i, alpha_i). */
"vlrepf %%v20,0(%7) \n\t"
"vlef %%v21,4(%7),0 \n\t"
"vlef %%v21,4(%7),2 \n\t"
"vflcsb %%v21,%%v21 \n\t"
"vlef %%v21,4(%7),1 \n\t"
"vlef %%v21,4(%7),3 \n\t"
#else
/* v20 = (alpha_r, -alpha_r, alpha_r, -alpha_r), v21 = alpha_i in all lanes. */
"vlef %%v20,0(%7),1 \n\t"
"vlef %%v20,0(%7),3 \n\t"
"vflcsb %%v20,%%v20 \n\t"
"vlef %%v20,0(%7),0 \n\t"
"vlef %%v20,0(%7),2 \n\t"
"vlrepf %%v21,4(%7) \n\t"
#endif
/* y += t * v20 + swapped(t) * v21, for all four outputs. */
"vl %%v22,0(%6) \n\t"
"vl %%v23,16(%6) \n\t"
"vfmasb %%v22,%%v16,%%v20,%%v22 \n\t"
"vfmasb %%v22,%%v18,%%v21,%%v22 \n\t"
"vfmasb %%v23,%%v17,%%v20,%%v23 \n\t"
"vfmasb %%v23,%%v19,%%v21,%%v23 \n\t"
"vst %%v22,0(%6) \n\t"
"vst %%v23,16(%6) "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29"
);
}
/*
 * Two-column variant: computes t[k] = sum_i ap[k][i] * x[i] for k = 0, 1
 * and then updates y[k] += alpha * t[k].
 *
 * n     : number of complex elements per column. The trip count is n >> 1
 *         and every trip consumes 2 complex values; the loop body runs
 *         before the count is tested, so n is expected to be non-zero and
 *         even.
 * ap    : ap[0..1] point to the two columns (interleaved real/imag).
 * x     : contiguous input vector.
 * y     : two contiguous complex outputs (4 FLOATs), updated in place.
 * alpha : alpha[0] = real part, alpha[1] = imaginary part.
 */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* v16, v17: one accumulator per column. */
"vzero %%v16 \n\t"
"vzero %%v17 \n\t"
/* r1 = running byte offset, r0 = trip count (n / 2). */
"xgr %%r1,%%r1 \n\t"
"srlg %%r0,%0,1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
/* v18 = two x values as stored: (x0_re, x0_im, x1_re, x1_im). */
"vl %%v18,0(%%r1,%3) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
/* v19 = (-x0_im, x0_re, -x1_im, x1_re). */
"vlef %%v19,4(%%r1,%3),0 \n\t"
"vlef %%v19,12(%%r1,%3),2 \n\t"
"vflcsb %%v19,%%v19 \n\t"
"vlef %%v19,0(%%r1,%3),1 \n\t"
"vlef %%v19,8(%%r1,%3),3 \n\t"
#else
/* v19 = (x0_im, -x0_re, x1_im, -x1_re). */
"vlef %%v19,0(%%r1,%3),1 \n\t"
"vlef %%v19,8(%%r1,%3),3 \n\t"
"vflcsb %%v19,%%v19 \n\t"
"vlef %%v19,4(%%r1,%3),0 \n\t"
"vlef %%v19,12(%%r1,%3),2 \n\t"
#endif
/* Per column: even register = duplicated real parts of two elements,
   odd register = duplicated imag parts. */
"vlef %%v20,0(%%r1,%1),0 \n\t"
"vlef %%v20,0(%%r1,%1),1 \n\t"
"vlef %%v20,8(%%r1,%1),2 \n\t"
"vlef %%v20,8(%%r1,%1),3 \n\t"
"vlef %%v21,4(%%r1,%1),0 \n\t"
"vlef %%v21,4(%%r1,%1),1 \n\t"
"vlef %%v21,12(%%r1,%1),2 \n\t"
"vlef %%v21,12(%%r1,%1),3 \n\t"
"vlef %%v22,0(%%r1,%2),0 \n\t"
"vlef %%v22,0(%%r1,%2),1 \n\t"
"vlef %%v22,8(%%r1,%2),2 \n\t"
"vlef %%v22,8(%%r1,%2),3 \n\t"
"vlef %%v23,4(%%r1,%2),0 \n\t"
"vlef %%v23,4(%%r1,%2),1 \n\t"
"vlef %%v23,12(%%r1,%2),2 \n\t"
"vlef %%v23,12(%%r1,%2),3 \n\t"
/* acc += a_re * v18 + a_im * v19 */
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
"vfmasb %%v16,%%v21,%%v19,%%v16 \n\t"
"vfmasb %%v17,%%v22,%%v18,%%v17 \n\t"
"vfmasb %%v17,%%v23,%%v19,%%v17 \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,0b \n\t"
/* Fold the upper doubleword of each accumulator onto the lower one. */
"vrepg %%v18,%%v16,1 \n\t"
"vrepg %%v19,%%v17,1 \n\t"
"vfasb %%v16,%%v16,%%v18 \n\t"
"vfasb %%v17,%%v17,%%v19 \n\t"
/* v16 = (t0, t1); v17 = the same with real and imag swapped. */
"vmrhg %%v16,%%v16,%%v17 \n\t"
"verllg %%v17,%%v16,32 \n\t"
#if !defined(XCONJ)
/* v18 = alpha_r in all lanes, v19 = (-alpha_i, alpha_i, -alpha_i, alpha_i). */
"vlrepf %%v18,0(%5) \n\t"
"vlef %%v19,4(%5),0 \n\t"
"vlef %%v19,4(%5),2 \n\t"
"vflcsb %%v19,%%v19 \n\t"
"vlef %%v19,4(%5),1 \n\t"
"vlef %%v19,4(%5),3 \n\t"
#else
/* v18 = (alpha_r, -alpha_r, alpha_r, -alpha_r), v19 = alpha_i in all lanes. */
"vlef %%v18,0(%5),1 \n\t"
"vlef %%v18,0(%5),3 \n\t"
"vflcsb %%v18,%%v18 \n\t"
"vlef %%v18,0(%5),0 \n\t"
"vlef %%v18,0(%5),2 \n\t"
"vlrepf %%v19,4(%5) \n\t"
#endif
/* y += t * v18 + swapped(t) * v19 */
"vl %%v20,0(%4) \n\t"
"vfmasb %%v20,%%v16,%%v18,%%v20 \n\t"
"vfmasb %%v20,%%v17,%%v19,%%v20 \n\t"
"vst %%v20,0(%4) "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * Single-column variant: computes t = sum_i ap[i] * x[i] and then updates
 * y[0] += alpha * t (one complex value).
 *
 * n     : number of complex elements. The trip count is n >> 1 and every
 *         trip consumes 2 complex values; the loop body runs before the
 *         count is tested, so n is expected to be non-zero and even.
 * ap    : the column (interleaved real/imag); a plain pointer, not an array
 *         of column pointers.
 * x     : contiguous input vector.
 * y     : one complex output (2 FLOATs), updated in place.
 * alpha : alpha[0] = real part, alpha[1] = imaginary part.
 *
 * Only the first doubleword of the result register is loaded from and
 * stored to y; the alpha registers are likewise only set up in lanes 0/1
 * where an element-wise load is used.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* v16: accumulator. r1 = running byte offset, r0 = trip count (n / 2). */
"vzero %%v16 \n\t"
"xgr %%r1,%%r1 \n\t"
"srlg %%r0,%0,1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
/* v17 = two x values as stored: (x0_re, x0_im, x1_re, x1_im). */
"vl %%v17,0(%%r1,%2) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
/* v18 = (-x0_im, x0_re, -x1_im, x1_re). */
"vlef %%v18,4(%%r1,%2),0 \n\t"
"vlef %%v18,12(%%r1,%2),2 \n\t"
"vflcsb %%v18,%%v18 \n\t"
"vlef %%v18,0(%%r1,%2),1 \n\t"
"vlef %%v18,8(%%r1,%2),3 \n\t"
#else
/* v18 = (x0_im, -x0_re, x1_im, -x1_re). */
"vlef %%v18,0(%%r1,%2),1 \n\t"
"vlef %%v18,8(%%r1,%2),3 \n\t"
"vflcsb %%v18,%%v18 \n\t"
"vlef %%v18,4(%%r1,%2),0 \n\t"
"vlef %%v18,12(%%r1,%2),2 \n\t"
#endif
/* v19 = duplicated real parts of two elements, v20 = duplicated imag parts. */
"vlef %%v19,0(%%r1,%1),0 \n\t"
"vlef %%v19,0(%%r1,%1),1 \n\t"
"vlef %%v19,8(%%r1,%1),2 \n\t"
"vlef %%v19,8(%%r1,%1),3 \n\t"
"vlef %%v20,4(%%r1,%1),0 \n\t"
"vlef %%v20,4(%%r1,%1),1 \n\t"
"vlef %%v20,12(%%r1,%1),2 \n\t"
"vlef %%v20,12(%%r1,%1),3 \n\t"
/* acc += a_re * v17 + a_im * v18 */
"vfmasb %%v16,%%v19,%%v17,%%v16 \n\t"
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,0b \n\t"
/* Fold the upper doubleword onto the lower one; v17 = result with real and
   imag swapped. */
"vrepg %%v17,%%v16,1 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"verllg %%v17,%%v16,32 \n\t"
#if !defined(XCONJ)
/* v18 = alpha_r in all lanes; v19 lanes 0, 1 = (-alpha_i, alpha_i). */
"vlrepf %%v18,0(%4) \n\t"
"vlef %%v19,4(%4),0 \n\t"
"vflcsb %%v19,%%v19 \n\t"
"vlef %%v19,4(%4),1 \n\t"
#else
/* v18 lanes 0, 1 = (alpha_r, -alpha_r); v19 = alpha_i in all lanes. */
"vlef %%v18,0(%4),1 \n\t"
"vflcsb %%v18,%%v18 \n\t"
"vlef %%v18,0(%4),0 \n\t"
"vlrepf %%v19,4(%4) \n\t"
#endif
/* Load one complex y, add t * v18 + swapped(t) * v19, store it back. */
"vleg %%v20,0(%3),0 \n\t"
"vfmasb %%v20,%%v16,%%v18,%%v20 \n\t"
"vfmasb %%v20,%%v17,%%v19,%%v20 \n\t"
"vsteg %%v20,0(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * Gathers n complex values from a strided source into a contiguous buffer.
 *
 * src advances by inc_src FLOATs per element; dest receives the (real, imag)
 * pairs back to back, two FLOATs per element.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG k;

    for ( k = 0; k < n; k++, src += inc_src )
    {
        dest[2 * k]     = src[0];   /* real part */
        dest[2 * k + 1] = src[1];   /* imaginary part */
    }
}
/*
 * Transposed complex GEMV driver: for every column j of the column-major
 * matrix a (n columns, m rows, leading dimension lda) it adds
 * alpha * sum_i op(a[i,j], x[i]) to y[j], where the sign pattern of the
 * complex products is selected at compile time through CONJ / XCONJ.
 *
 *   m, n        - rows and columns of a; nothing is done if either is < 1
 *   dummy1      - unused
 *   alpha_r/_i  - real and imaginary part of the scalar
 *   a, lda      - matrix and its leading dimension in complex elements
 *   x, inc_x    - input vector and its stride in complex elements
 *   y, inc_y    - output vector (updated in place) and its stride
 *   buffer      - scratch area that receives a packed copy of a block of x
 *                 whenever x is not contiguous
 *
 * Rows are handled in blocks of NB (NBMAX comes from outside this block),
 * each block going through the assembly kernels; the last m & 3 rows are
 * handled by the scalar code at the end. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[8];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4;
FLOAT ybuffer[8],*xbuffer;
FLOAT alpha[2];
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
/* From here on strides and lda count FLOATs (2 per complex element). */
inc_x <<= 1;
inc_y <<= 1;
lda <<= 1;
lda4 = lda << 2;
xbuffer = buffer;
/* n1 groups of four columns, n2 leftover columns */
n1 = n >> 2 ;
n2 = n & 3 ;
/* m3 leftover rows for the scalar tail, m1 rows for the kernels,
   m2 rows in the final partial block (after removing m3) */
m3 = m & 3 ;
m1 = m - m3;
m2 = (m & (NBMAX-1)) - m3 ;
alpha[0] = alpha_r;
alpha[1] = alpha_i;
BLASLONG NB = NBMAX;
/* Row-block loop: full NBMAX blocks first, then at most one block of m2
   rows; setting NB to m2 also terminates the loop after that pass. */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* ap[0..3] point at four consecutive columns of the current row block */
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
/* the kernels read x contiguously: pack it unless it already is */
if ( inc_x != 2 )
copy_x(NB,x_ptr,xbuffer,inc_x);
else
xbuffer = x_ptr;
if ( inc_y == 2 )
{
/* contiguous y: kernels update y in place */
for( i = 0; i < n1 ; i++)
{
cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
y_ptr += 8;
}
if ( n2 & 2 )
{
cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha);
a_ptr += lda * 2;
y_ptr += 4;
}
if ( n2 & 1 )
{
cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha);
/* a_ptr += lda;
y_ptr += 2; */
}
}
else
{
/* strided y: kernels write into the zeroed ybuffer, whose contents are
   then added to y element by element */
for( i = 0; i < n1 ; i++)
{
memset(ybuffer,0,sizeof(ybuffer));
cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
y_ptr[0] += ybuffer[0];
y_ptr[1] += ybuffer[1];
y_ptr += inc_y;
y_ptr[0] += ybuffer[2];
y_ptr[1] += ybuffer[3];
y_ptr += inc_y;
y_ptr[0] += ybuffer[4];
y_ptr[1] += ybuffer[5];
y_ptr += inc_y;
y_ptr[0] += ybuffer[6];
y_ptr[1] += ybuffer[7];
y_ptr += inc_y;
}
/* leftover columns one at a time */
for( i = 0; i < n2 ; i++)
{
memset(ybuffer,0,sizeof(ybuffer));
cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
a_ptr += lda;
y_ptr[0] += ybuffer[0];
y_ptr[1] += ybuffer[1];
y_ptr += inc_y;
}
}
/* advance to the next row block */
a += 2 * NB;
x += NB * inc_x;
}
/* Scalar tail: the last m3 (1..3) rows, applied to every column. */
if ( m3 == 0 ) return(0);
x_ptr = x;
j=0;
a_ptr = a;
y_ptr = y;
if ( m3 == 3 )
{
/* three leftover rows: x0..x5 hold their (re, im) values */
FLOAT temp_r ;
FLOAT temp_i ;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
x_ptr += inc_x;
FLOAT x2 = x_ptr[0];
FLOAT x3 = x_ptr[1];
x_ptr += inc_x;
FLOAT x4 = x_ptr[0];
FLOAT x5 = x_ptr[1];
while ( j < n)
{
/* temp = sum over the three rows of a * x (first form) or of the
   alternate-sign product (second form) */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif
/* y[j] += alpha * temp, with the XCONJ sign pattern when requested */
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return(0);
}
if ( m3 == 2 )
{
/* two leftover rows; columns are processed two at a time, then singly */
FLOAT temp_r ;
FLOAT temp_i ;
FLOAT temp_r1 ;
FLOAT temp_i1 ;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
x_ptr += inc_x;
FLOAT x2 = x_ptr[0];
FLOAT x3 = x_ptr[1];
FLOAT ar = alpha[0];
FLOAT ai = alpha[1];
while ( j < ( n & -2 ))
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
y_ptr[0] += ar * temp_r - ai * temp_i;
y_ptr[1] += ar * temp_i + ai * temp_r;
y_ptr += inc_y;
y_ptr[0] += ar * temp_r1 - ai * temp_i1;
y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
y_ptr[0] += ar * temp_r + ai * temp_i;
y_ptr[1] -= ar * temp_i - ai * temp_r;
y_ptr += inc_y;
y_ptr[0] += ar * temp_r1 + ai * temp_i1;
y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif
a_ptr += lda;
y_ptr += inc_y;
j+=2;
}
/* odd last column, if any */
while ( j < n)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
y_ptr[0] += ar * temp_r - ai * temp_i;
y_ptr[1] += ar * temp_i + ai * temp_r;
#else
y_ptr[0] += ar * temp_r + ai * temp_i;
y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return(0);
}
if ( m3 == 1 )
{
/* one leftover row; same two-columns-then-one structure as above */
FLOAT temp_r ;
FLOAT temp_i ;
FLOAT temp_r1 ;
FLOAT temp_i1 ;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
FLOAT ar = alpha[0];
FLOAT ai = alpha[1];
while ( j < ( n & -2 ))
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
y_ptr[0] += ar * temp_r - ai * temp_i;
y_ptr[1] += ar * temp_i + ai * temp_r;
y_ptr += inc_y;
y_ptr[0] += ar * temp_r1 - ai * temp_i1;
y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
y_ptr[0] += ar * temp_r + ai * temp_i;
y_ptr[1] -= ar * temp_i - ai * temp_r;
y_ptr += inc_y;
y_ptr[0] += ar * temp_r1 + ai * temp_i1;
y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif
a_ptr += lda;
y_ptr += inc_y;
j+=2;
}
while ( j < n)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
y_ptr[0] += ar * temp_r - ai * temp_i;
y_ptr[1] += ar * temp_i + ai * temp_r;
#else
y_ptr[0] += ar * temp_r + ai * temp_i;
y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return(0);
}
return(0);
}

256
kernel/zarch/crot.c Normal file
View File

@ -0,0 +1,256 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * crot_kernel_32: apply a plane rotation with real coefficients c and s to
 * two contiguous complex single-precision vectors, in place:
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 * The rotation is applied lane-wise, so real and imaginary parts are treated
 * alike. Each loop trip handles 256 bytes of x and of y (32 complex
 * elements) in four identical 64-byte groups; the trip count is n >> 5.
 *
 *   n    - number of complex elements
 *   x, y - vectors, updated in place
 *   c, s - pointers to the rotation coefficients
 *
 * NOTE(review): the loop is entered unconditionally, so n >> 5 is assumed to
 * be at least 1 - confirm at the call site.
 */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
/* v0 = c in every lane, v1 = s in every lane */
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
/* r0 = trip count, r1 = byte offset */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* group 1: bytes 0..63; v24-v27 = x, v16-v19 = y */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
/* v28-v31 = x*c, v20-v23 = x*s */
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
/* v28-v31 = y*s + x*c (new x), v20-v23 = y*c - x*s (new y) */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* group 2: bytes 64..127, same sequence */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* group 3: bytes 128..191, same sequence */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* group 4: bytes 192..255, same sequence */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
/* next 256-byte chunk */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Complex plane rotation with real coefficients, applied in place:
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]
 * for k = 0..n-1, real and imaginary parts treated alike.
 *
 * inc_x / inc_y are strides in complex elements. When both are 1 the
 * largest multiple of 32 elements is handed to crot_kernel_32 and only the
 * remainder is processed by the scalar loop. Returns 0 in every case.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG done = 0;            /* complex elements already rotated */
    BLASLONG xpos = 0, ypos = 0;  /* FLOAT index of the current element */
    BLASLONG xstep, ystep;        /* FLOAT distance between elements */

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        BLASLONG vec_n = n & -32;

        if (vec_n > 0) {
            /* the kernel takes the coefficients by address */
            FLOAT cosa = c;
            FLOAT sina = s;

            crot_kernel_32(vec_n, x, y, &cosa, &sina);
            done = vec_n;
            xpos = 2 * vec_n;
            ypos = 2 * vec_n;
        }
        xstep = 2;
        ystep = 2;
    } else {
        xstep = 2 * inc_x;
        ystep = 2 * inc_y;
    }

    for (; done < n; done++) {
        /* new x is kept in temporaries until y has consumed the old x */
        FLOAT new_xr = c * x[xpos] + s * y[ypos];
        FLOAT new_xi = c * x[xpos + 1] + s * y[ypos + 1];

        y[ypos] = c * y[ypos] - s * x[xpos];
        y[ypos + 1] = c * y[ypos + 1] - s * x[xpos + 1];
        x[xpos] = new_xr;
        x[xpos + 1] = new_xi;

        xpos += xstep;
        ypos += ystep;
    }
    return 0;
}

456
kernel/zarch/cscal.c Normal file
View File

@ -0,0 +1,456 @@
/***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * cscal_kernel_16: scale a contiguous complex single-precision vector in
 * place by alpha = alpha[0] + i*alpha[1] (general case, both parts used):
 *     re' = re*alpha_r - im*alpha_i
 *     im' = im*alpha_r + re*alpha_i
 * Each loop trip handles 128 bytes (16 complex elements); the trip count is
 * n >> 4.
 * NOTE(review): the loop is entered unconditionally, so n >> 4 is assumed to
 * be at least 1 - confirm at the call site.
 */
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = alpha_r in every lane */
"vlrepf %%v0,0(%1) \n\t"
/* v1 = {-alpha_i, alpha_i, -alpha_i, alpha_i}: even lanes loaded and
   negated, then odd lanes loaded */
"vlef %%v1,4(%1),0 \n\t"
"vlef %%v1,4(%1),2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,4(%1),1 \n\t"
"vlef %%v1,4(%1),3 \n\t"
/* r0 = trip count, r1 = byte offset */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* v24-v31 = input with re/im exchanged (rotate each doubleword by 32) */
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
"verllg %%v28,%%v20,32 \n\t"
"verllg %%v29,%%v21,32 \n\t"
"verllg %%v30,%%v22,32 \n\t"
"verllg %%v31,%%v23,32 \n\t"
/* v16-v23 = x * alpha_r */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
/* v16-v23 += swapped(x) * {-alpha_i, alpha_i, ...} */
"vfmasb %%v16,%%v24,%%v1,%%v16 \n\t"
"vfmasb %%v17,%%v25,%%v1,%%v17 \n\t"
"vfmasb %%v18,%%v26,%%v1,%%v18 \n\t"
"vfmasb %%v19,%%v27,%%v1,%%v19 \n\t"
"vfmasb %%v20,%%v28,%%v1,%%v20 \n\t"
"vfmasb %%v21,%%v29,%%v1,%%v21 \n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22 \n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * cscal_kernel_16_zero_r: in-place scaling of a contiguous complex vector
 * using only the imaginary part of alpha (alpha[1]); alpha[0] is not read:
 *     re' = -alpha_i * im
 *     im' =  alpha_i * re
 * Each loop trip handles 128 bytes (16 complex elements); the trip count is
 * n >> 4.
 * NOTE(review): the loop is entered unconditionally, so n >> 4 is assumed to
 * be at least 1 - confirm at the call site.
 */
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = {-alpha_i, alpha_i, -alpha_i, alpha_i} */
"vlef %%v0,4(%1),0 \n\t"
"vlef %%v0,4(%1),2 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,4(%1),1 \n\t"
"vlef %%v0,4(%1),3 \n\t"
/* r0 = trip count, r1 = byte offset */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* exchange re and im inside every complex element */
"verllg %%v16,%%v16,32 \n\t"
"verllg %%v17,%%v17,32 \n\t"
"verllg %%v18,%%v18,32 \n\t"
"verllg %%v19,%%v19,32 \n\t"
"verllg %%v20,%%v20,32 \n\t"
"verllg %%v21,%%v21,32 \n\t"
"verllg %%v22,%%v22,32 \n\t"
"verllg %%v23,%%v23,32 \n\t"
/* multiply the swapped values by the signed alpha_i pattern */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * cscal_kernel_16_zero_i: in-place scaling of a contiguous complex vector
 * using only the real part of alpha (alpha[0]); alpha[1] is not read.
 * Every FLOAT of x, real or imaginary, is multiplied by alpha[0].
 * Each loop trip handles 128 bytes (16 complex elements); the trip count is
 * n >> 4.
 * NOTE(review): the loop is entered unconditionally, so n >> 4 is assumed to
 * be at least 1 - confirm at the call site.
 */
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = alpha_r in every lane */
"vlrepf %%v0,0(%1) \n\t"
/* r0 = trip count, r1 = byte offset */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* lane-wise multiply by alpha_r */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * cscal_kernel_16_zero: overwrite a contiguous complex vector with zeros.
 * Each loop trip stores 128 zero bytes (16 complex elements); the trip count
 * is n >> 4. The previous contents of x are never loaded.
 * NOTE(review): the loop is entered unconditionally, so n >> 4 is assumed to
 * be at least 1 - confirm at the call site.
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
/* four zero vectors used as store sources */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = trip count, r1 = byte offset */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}
/*
 * Scale a strided complex vector in place by alpha = alpha[0] + i*alpha[1],
 * four elements per loop trip.
 *
 *   n     - number of complex elements; the loop advances in steps of 4
 *   alpha - {real, imaginary} scale factor
 *   x     - first element; each element is two consecutive FLOATs
 *   inc_x - distance between elements, measured in FLOATs
 *
 * Within a trip all four new real parts are computed first, then the four
 * imaginary parts are updated from the still-unmodified real parts, and
 * only then are the real parts written back.
 */
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
    const FLOAT scale_r = alpha[0];
    const FLOAT scale_i = alpha[1];
    const BLASLONG off2 = 2 * inc_x;
    const BLASLONG off3 = off2 + inc_x;
    BLASLONG left;

    for (left = n; left > 0; left -= 4) {
        FLOAT *elem[4];
        FLOAT new_re[4];
        int k;

        elem[0] = x;
        elem[1] = x + inc_x;
        elem[2] = x + off2;
        elem[3] = x + off3;

        /* phase 1: new real parts, kept aside */
        for (k = 0; k < 4; k++)
            new_re[k] = scale_r * elem[k][0] - scale_i * elem[k][1];

        /* phase 2: imaginary parts, using the old real parts */
        for (k = 0; k < 4; k++)
            elem[k][1] = scale_i * elem[k][0] + scale_r * elem[k][1];

        /* phase 3: commit the real parts */
        for (k = 0; k < 4; k++)
            elem[k][0] = new_re[k];

        x += 4 * inc_x;
    }
}
/*
 * Scalar helper for CNAME: scale complex elements first..n-1 of x, one at a
 * time. pos is the FLOAT index of element `first` and stride the FLOAT
 * distance between consecutive elements. The arithmetic is specialised on
 * which parts of the factor compare equal to zero:
 *   both zero     -> store 0.0 into both parts
 *   da_r == 0 only -> (re, im) <- (-da_i*im, da_i*re)
 *   da_i == 0 only -> (re, im) <- (da_r*re, da_r*im)
 *   otherwise      -> full complex multiply
 */
static void cscal_scalar_range(BLASLONG first, BLASLONG n, FLOAT da_r, FLOAT da_i,
                               FLOAT *x, BLASLONG pos, BLASLONG stride)
{
    BLASLONG k;
    FLOAT new_re;

    if (da_r == 0.0) {
        if (da_i == 0.0) {
            for (k = first; k < n; k++) {
                x[pos] = 0.0;
                x[pos + 1] = 0.0;
                pos += stride;
            }
        } else {
            for (k = first; k < n; k++) {
                new_re = -da_i * x[pos + 1];
                x[pos + 1] = da_i * x[pos];
                x[pos] = new_re;
                pos += stride;
            }
        }
    } else if (da_i == 0.0) {
        for (k = first; k < n; k++) {
            new_re = da_r * x[pos];
            x[pos + 1] = da_r * x[pos + 1];
            x[pos] = new_re;
            pos += stride;
        }
    } else {
        for (k = first; k < n; k++) {
            new_re = da_r * x[pos] - da_i * x[pos + 1];
            x[pos + 1] = da_r * x[pos + 1] + da_i * x[pos];
            x[pos] = new_re;
            pos += stride;
        }
    }
}

/*
 * Complex scale: x[k] <- (da_r + i*da_i) * x[k] for k = 0..n-1, in place.
 *
 *   n          - number of complex elements
 *   da_r, da_i - real and imaginary part of the factor
 *   x, inc_x   - vector and its stride in complex elements
 *   dummy0, dummy1, y, inc_y, dummy, dummy2 - not used here
 *
 * With inc_x == 1 the largest multiple of 16 elements goes through one of
 * the cscal_kernel_16* routines, chosen by which parts of the factor are
 * zero. With any other stride only the general (both parts non-zero) case
 * uses a block routine, cscal_kernel_inc_8, on the largest multiple of 8
 * elements. Whatever is left is scaled element by element. Returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    FLOAT alpha[2] __attribute__ ((aligned(16)));
    BLASLONG done = 0;   /* complex elements already scaled */
    BLASLONG pos = 0;    /* FLOAT index of the next element */
    BLASLONG blk;

    if (inc_x != 1) {
        /* stride expressed in FLOATs from here on */
        BLASLONG stride = inc_x << 1;

        if (da_r != 0.0 && da_i != 0.0) {
            blk = n & -8;
            if (blk > 0) {
                alpha[0] = da_r;
                alpha[1] = da_i;
                cscal_kernel_inc_8(blk, alpha, x, stride);
                done = blk;
                pos = blk * stride;
            }
        }
        cscal_scalar_range(done, n, da_r, da_i, x, pos, stride);
        return (0);
    }

    blk = n & -16;
    if (blk > 0) {
        alpha[0] = da_r;
        alpha[1] = da_i;
        if (da_r == 0.0) {
            if (da_i == 0) {
                cscal_kernel_16_zero(blk, x);
            } else {
                cscal_kernel_16_zero_r(blk, alpha, x);
            }
        } else if (da_i == 0) {
            cscal_kernel_16_zero_i(blk, alpha, x);
        } else {
            cscal_kernel_16(blk, alpha, x);
        }
        pos = blk << 1;
        done = blk;
    }

    /* remainder of the contiguous vector: two FLOATs per element */
    cscal_scalar_range(done, n, da_r, da_i, x, pos, 2);
    return (0);
}

183
kernel/zarch/cswap.c Normal file
View File

@ -0,0 +1,183 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * cswap_kernel_32: exchange the contents of two contiguous complex
 * single-precision vectors. Each loop trip swaps 256 bytes (32 complex
 * elements); the trip count is n >> 5.
 *
 * Per trip: the whole 256-byte chunk of x is held in v16-v31, the matching
 * chunk of y is copied into x in two 128-byte halves through v0-v7, and
 * finally the saved x data is stored to y.
 * NOTE(review): the loop is entered unconditionally, so n >> 5 is assumed to
 * be at least 1 - confirm at the call site.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
/* r0 = trip count, r1 = byte offset */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* save 256 bytes of x in v16-v31 */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
/* first half of y -> x */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
/* second half of y -> x */
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
/* saved x -> y */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Exchange two complex vectors element by element: x[k] <-> y[k] for
 * k = 0..n-1.
 *
 *   n            - number of complex elements; nothing is done if n <= 0
 *   x, inc_x     - first vector and its stride in complex elements
 *   y, inc_y     - second vector and its stride in complex elements
 *   dummy*       - not used here
 *
 * When both strides are 1 the largest multiple of 32 elements is swapped by
 * cswap_kernel_32; the remaining elements, and every element in the strided
 * case, go through the scalar loop. Returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG done = 0;            /* complex elements already swapped */
    BLASLONG xpos = 0, ypos = 0;  /* FLOAT index of the current element */
    BLASLONG xstep = 2, ystep = 2;

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        BLASLONG vec_n = n & -32;

        if (vec_n > 0) {
            cswap_kernel_32(vec_n, x, y);
            done = vec_n;
            xpos = 2 * vec_n;
            ypos = 2 * vec_n;
        }
    } else {
        xstep = 2 * inc_x;
        ystep = 2 * inc_y;
    }

    for (; done < n; done++) {
        FLOAT saved_re = x[xpos];
        FLOAT saved_im = x[xpos + 1];

        x[xpos] = y[ypos];
        x[xpos + 1] = y[ypos + 1];
        y[ypos] = saved_re;
        y[ypos + 1] = saved_im;

        xpos += xstep;
        ypos += ystep;
    }
    return 0;
}

166
kernel/zarch/damax.c Normal file
View File

@ -0,0 +1,166 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS is the absolute-value routine used by the scalar code in this file:
   fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * damax_kernel_32 - largest absolute value among the first n elements of x,
 * computed with vector max instructions (the z14 variant of this kernel).
 *
 * n : element count. The loop runs n >> 5 times and every pass consumes
 *     256 bytes (sixteen 16-byte vector loads), so n must be a positive
 *     multiple of 32. CNAME in this file only calls the kernel with
 *     n & -32 when that value is non-zero.
 * x : start of the contiguous input.
 *
 * Returns the value left in %f0 after LPDR (load positive), i.e. a
 * non-negative result.
 *
 * All VFMAXDB/WFMAXDB instructions are issued with function code 8. The
 * kernel never takes an absolute value inside the loop and only strips the
 * sign once at the very end, so it relies on code 8 comparing magnitudes.
 * NOTE(review): confirm the function-code-8 semantics against the
 * z/Architecture Principles of Operation.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* v0 = running result, seeded with the first two elements */
        "vl %%v0,0(%2) \n\t"
        /* r0 = n / 32 loop iterations, r1 = byte offset into x (starts at 0) */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"
        /* load 16 vectors = 32 elements = 256 bytes */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"
        /* pairwise reduction tree: 16 -> 8 vectors */
        "vfmaxdb %%v16,%%v16,%%v24,8 \n\t"
        "vfmaxdb %%v17,%%v17,%%v25,8 \n\t"
        "vfmaxdb %%v18,%%v18,%%v26,8 \n\t"
        "vfmaxdb %%v19,%%v19,%%v27,8 \n\t"
        "vfmaxdb %%v20,%%v20,%%v28,8 \n\t"
        "vfmaxdb %%v21,%%v21,%%v29,8 \n\t"
        "vfmaxdb %%v22,%%v22,%%v30,8 \n\t"
        "vfmaxdb %%v23,%%v23,%%v31,8 \n\t"
        /* 8 -> 4 */
        "vfmaxdb %%v16,%%v16,%%v20,8 \n\t"
        "vfmaxdb %%v17,%%v17,%%v21,8 \n\t"
        "vfmaxdb %%v18,%%v18,%%v22,8 \n\t"
        "vfmaxdb %%v19,%%v19,%%v23,8 \n\t"
        /* 4 -> 2 -> 1 */
        "vfmaxdb %%v16,%%v16,%%v18,8 \n\t"
        "vfmaxdb %%v17,%%v17,%%v19,8 \n\t"
        "vfmaxdb %%v16,%%v16,%%v17,8 \n\t"
        /* fold this pass (held in v16) into the running result v0 */
        "vfmaxdb %%v0,%%v0,%%v16,8 \n\t"
        /* advance 256 bytes; loop while the count in r0 is non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* combine the two lanes of v0, then make the scalar non-negative */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfmaxdb %%v0,%%v0,%%v16,8 \n\t"
        "lpdr %0,%%f0 "
        :"=f"(amax)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amax;
}
/*
 * Maximum absolute value of the n elements x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT mag;
    BLASLONG pos;
    BLASLONG seen;
    BLASLONG k;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x != 1) {
        /* Strided input: groups of four elements first, then the leftovers. */
        const BLASLONG quads = n & -4;

        best = ABS(x[0]);
        pos = 0;
        for (seen = 0; seen < quads; seen += 4) {
            for (k = 0; k < 4; k++) {
                mag = ABS(x[pos + k * inc_x]);
                if (mag > best) {
                    best = mag;
                }
            }
            pos += inc_x * 4;
        }
        for (; seen < n; seen++) {
            mag = ABS(x[pos]);
            if (mag > best) {
                best = mag;
            }
            pos += inc_x;
        }
        return best;
    }

    /* Contiguous input: the asm kernel handles the leading multiple of 32. */
    {
        const BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = damax_kernel_32(bulk, x);
            pos = bulk;
        } else {
            /* fewer than 32 elements: seed from x[0] and scan from x[1] */
            best = ABS(x[0]);
            pos = 1;
        }
    }

    for (; pos < n; pos++) {
        mag = ABS(x[pos]);
        if (mag > best) {
            best = mag;
        }
    }
    return best;
}

204
kernel/zarch/damax_z13.c Normal file
View File

@ -0,0 +1,204 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Returns the largest absolute value among the first n elements of x,
 * using z13 vector instructions (abs, compare-high, select).
 *
 * The loop count is n >> 5 and each pass reads 256 bytes in two halves of
 * eight 16-byte vectors, so n is expected to be a positive multiple of 32;
 * CNAME below only passes n & -32 when that value is non-zero.
 *
 * Asm operands: %0 = result (FP register), %1 = n, %2 = x.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amax;
__asm__ volatile (
/* v0 = |x[0..1]|, the running per-lane maximum */
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0 \n\t"
/* r0 = n / 32 iterations; r1 = byte offset into x, starting at 0 */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half: bytes 0..127 of this 256-byte chunk */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* take absolute values in place */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* mask = (a > b); vsel then keeps a where the mask is set, else b -> max(a,b) */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
/* continue the reduction tree: 4 -> 2 -> 1 vectors */
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* fold the half-chunk maximum (v30) into the running maximum v0 */
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* second half: bytes 128..255, same sequence as above */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* advance by one 256-byte chunk; loop while the count in r0 is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: compare lane 0 of v0 with a copy of lane 1, keep the larger */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
/* hand the scalar result to the output operand */
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amax;
}
/*
 * Maximum absolute value of the n elements x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT result = 0.0;
    FLOAT cur;
    BLASLONG idx;
    BLASLONG count;
    BLASLONG lane;

    if (n <= 0 || inc_x <= 0) {
        return result;
    }

    if (inc_x != 1) {
        /* Strided input: four elements per outer step, then the remainder. */
        const BLASLONG unrolled = n & -4;

        result = ABS(x[0]);
        idx = 0;
        count = 0;
        while (count < unrolled) {
            for (lane = 0; lane < 4; lane++) {
                cur = ABS(x[idx + lane * inc_x]);
                if (cur > result) {
                    result = cur;
                }
            }
            idx += inc_x * 4;
            count += 4;
        }
        while (count < n) {
            cur = ABS(x[idx]);
            if (cur > result) {
                result = cur;
            }
            idx += inc_x;
            count++;
        }
        return result;
    }

    /* Contiguous input: vector kernel on the leading multiple of 32. */
    {
        const BLASLONG head = n & -32;

        if (head > 0) {
            result = damax_kernel_32(head, x);
            idx = head;
        } else {
            /* short vector: seed from x[0], scalar scan starts at x[1] */
            result = ABS(x[0]);
            idx = 1;
        }
    }

    while (idx < n) {
        cur = ABS(x[idx]);
        if (cur > result) {
            result = cur;
        }
        idx++;
    }
    return result;
}

166
kernel/zarch/damin.c Normal file
View File

@ -0,0 +1,166 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * damin_kernel_32 - smallest absolute value among the first n elements of x,
 * computed with vector min instructions (the z14 variant of this kernel).
 *
 * n : element count. The loop runs n >> 5 times and every pass consumes
 *     256 bytes (sixteen 16-byte vector loads), so n must be a positive
 *     multiple of 32. CNAME in this file only calls the kernel with
 *     n & -32 when that value is non-zero.
 * x : start of the contiguous input.
 *
 * Returns the value left in %f0 after LPDR (load positive), i.e. a
 * non-negative result.
 *
 * All VFMINDB/WFMINDB instructions are issued with function code 8. The
 * kernel never takes an absolute value inside the loop and only strips the
 * sign once at the very end, so it relies on code 8 comparing magnitudes.
 * NOTE(review): confirm the function-code-8 semantics against the
 * z/Architecture Principles of Operation.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* v0 = running result, seeded with the first two elements */
        "vl %%v0,0(%2) \n\t"
        /* r0 = n / 32 loop iterations, r1 = byte offset into x (starts at 0) */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"
        /* load 16 vectors = 32 elements = 256 bytes */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"
        /* pairwise reduction tree: 16 -> 8 vectors */
        "vfmindb %%v16,%%v16,%%v24,8 \n\t"
        "vfmindb %%v17,%%v17,%%v25,8 \n\t"
        "vfmindb %%v18,%%v18,%%v26,8 \n\t"
        "vfmindb %%v19,%%v19,%%v27,8 \n\t"
        "vfmindb %%v20,%%v20,%%v28,8 \n\t"
        "vfmindb %%v21,%%v21,%%v29,8 \n\t"
        "vfmindb %%v22,%%v22,%%v30,8 \n\t"
        "vfmindb %%v23,%%v23,%%v31,8 \n\t"
        /* 8 -> 4 */
        "vfmindb %%v16,%%v16,%%v20,8 \n\t"
        "vfmindb %%v17,%%v17,%%v21,8 \n\t"
        "vfmindb %%v18,%%v18,%%v22,8 \n\t"
        "vfmindb %%v19,%%v19,%%v23,8 \n\t"
        /* 4 -> 2 -> 1 */
        "vfmindb %%v16,%%v16,%%v18,8 \n\t"
        "vfmindb %%v17,%%v17,%%v19,8 \n\t"
        "vfmindb %%v16,%%v16,%%v17,8 \n\t"
        /* fold this pass (held in v16) into the running result v0 */
        "vfmindb %%v0,%%v0,%%v16,8 \n\t"
        /* advance 256 bytes; loop while the count in r0 is non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* combine the two lanes of v0, then make the scalar non-negative */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfmindb %%v0,%%v0,%%v16,8 \n\t"
        "lpdr %0,%%f0 "
        :"=f"(amin)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amin;
}
/*
 * Minimum absolute value of the n elements x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT mag;
    BLASLONG pos;
    BLASLONG seen;
    BLASLONG k;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x != 1) {
        /* Strided input: groups of four elements first, then the leftovers. */
        const BLASLONG quads = n & -4;

        best = ABS(x[0]);
        pos = 0;
        for (seen = 0; seen < quads; seen += 4) {
            for (k = 0; k < 4; k++) {
                mag = ABS(x[pos + k * inc_x]);
                if (mag < best) {
                    best = mag;
                }
            }
            pos += inc_x * 4;
        }
        for (; seen < n; seen++) {
            mag = ABS(x[pos]);
            if (mag < best) {
                best = mag;
            }
            pos += inc_x;
        }
        return best;
    }

    /* Contiguous input: the asm kernel handles the leading multiple of 32. */
    {
        const BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = damin_kernel_32(bulk, x);
            pos = bulk;
        } else {
            /* fewer than 32 elements: seed from x[0] and scan from x[1] */
            best = ABS(x[0]);
            pos = 1;
        }
    }

    for (; pos < n; pos++) {
        mag = ABS(x[pos]);
        if (mag < best) {
            best = mag;
        }
    }
    return best;
}

204
kernel/zarch/damin_z13.c Normal file
View File

@ -0,0 +1,204 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Returns the smallest absolute value among the first n elements of x,
 * using z13 vector instructions (abs, compare-high, select).
 *
 * The loop count is n >> 5 and each pass reads 256 bytes in two halves of
 * eight 16-byte vectors, so n is expected to be a positive multiple of 32;
 * CNAME below only passes n & -32 when that value is non-zero.
 *
 * Asm operands: %0 = result (FP register), %1 = n, %2 = x.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amin;
__asm__ volatile (
/* v0 = |x[0..1]|, the running per-lane minimum */
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0 \n\t"
/* r0 = n / 32 iterations; r1 = byte offset into x, starting at 0 */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half: bytes 0..127 of this 256-byte chunk */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* take absolute values in place */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* operands swapped vs. a max: mask = (b > a); vsel keeps a where set, else b -> min(a,b) */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
/* continue the reduction tree: 4 -> 2 -> 1 vectors */
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* fold the half-chunk minimum (v30) into the running minimum v0 */
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* second half: bytes 128..255, same sequence as above */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* advance by one 256-byte chunk; loop while the count in r0 is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: compare lane 0 of v0 with a copy of lane 1, keep the smaller */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
/* hand the scalar result to the output operand */
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amin;
}
/*
 * Minimum absolute value of the n elements x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT result = 0.0;
    FLOAT cur;
    BLASLONG idx;
    BLASLONG count;
    BLASLONG lane;

    if (n <= 0 || inc_x <= 0) {
        return result;
    }

    if (inc_x != 1) {
        /* Strided input: four elements per outer step, then the remainder. */
        const BLASLONG unrolled = n & -4;

        result = ABS(x[0]);
        idx = 0;
        count = 0;
        while (count < unrolled) {
            for (lane = 0; lane < 4; lane++) {
                cur = ABS(x[idx + lane * inc_x]);
                if (cur < result) {
                    result = cur;
                }
            }
            idx += inc_x * 4;
            count += 4;
        }
        while (count < n) {
            cur = ABS(x[idx]);
            if (cur < result) {
                result = cur;
            }
            idx += inc_x;
            count++;
        }
        return result;
    }

    /* Contiguous input: vector kernel on the leading multiple of 32. */
    {
        const BLASLONG head = n & -32;

        if (head > 0) {
            result = damin_kernel_32(head, x);
            idx = head;
        } else {
            /* short vector: seed from x[0], scalar scan starts at x[1] */
            result = ABS(x[0]);
            idx = 1;
        }
    }

    while (idx < n) {
        cur = ABS(x[idx]);
        if (cur < result) {
            result = cur;
        }
        idx++;
    }
    return result;
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ABS fabsf #define ABS fabsf
#endif #endif
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vflpdb %%v16, %%v16 \n\t"
FLOAT asum ; "vflpdb %%v17, %%v17 \n\t"
__asm__ ( "vflpdb %%v18, %%v18 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t" "vflpdb %%v19, %%v19 \n\t"
"sllg %%r0,%[n],3 \n\t" "vflpdb %%v20, %%v20 \n\t"
"agr %%r0,%[ptr_x] \n\t" "vflpdb %%v21, %%v21 \n\t"
"vzero %%v0 \n\t" "vflpdb %%v22, %%v22 \n\t"
"vzero %%v1 \n\t" "vflpdb %%v23, %%v23 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_temp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v2,%%v2,%%v26 \n\t"
"vfadb %%v3,%%v3,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v2,%%v2,%%v30 \n\t"
"vfadb %%v3,%%v3,%%v31 \n\t"
"vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"la %[ptr_temp],256(%[ptr_temp]) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v2,%%v2,%%v26 \n\t"
"vfadb %%v3,%%v3,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v2,%%v2,%%v30 \n\t"
"vfadb %%v3,%%v3,%%v31 \n\t"
"clgrjl %[ptr_temp],%%r0,1b \n\t"
"vfadb %%v24,%%v0,%%v1 \n\t"
"vfadb %%v25,%%v2,%%v3 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %[asum],%%f0 \n\t"
: [asum] "=f"(asum),[ptr_temp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x)
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
);
return asum;
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;

View File

@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#define PREFETCH_INS 1 static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
#if defined(Z13_A)
#include <vecintrin.h>
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
BLASLONG i = 0;
__vector double v_a = {alpha,alpha};
__vector double * v_y=(__vector double *)y;
__vector double * v_x=(__vector double *)x;
for(; i<n/2; i+=16){
v_y[i] += v_a * v_x[i];
v_y[i+1] += v_a * v_x[i+1];
v_y[i+2] += v_a * v_x[i+2];
v_y[i+3] += v_a * v_x[i+3];
v_y[i+4] += v_a * v_x[i+4];
v_y[i+5] += v_a * v_x[i+5];
v_y[i+6] += v_a * v_x[i+6];
v_y[i+7] += v_a * v_x[i+7];
v_y[i+8] += v_a * v_x[i+8];
v_y[i+9] += v_a * v_x[i+9];
v_y[i+10] += v_a * v_x[i+10];
v_y[i+11] += v_a * v_x[i+11];
v_y[i+12] += v_a * v_x[i+12];
v_y[i+13] += v_a * v_x[i+13];
v_y[i+14] += v_a * v_x[i+14];
v_y[i+15] += v_a * v_x[i+15];
}
}
#else
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{ {
__asm__ volatile(
"vlrepg %%v0,%3 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
__asm__ volatile( "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
#if defined(PREFETCH_INS) "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
"pfd 1, 0(%[x_tmp]) \n\t" "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
"pfd 2, 0(%[y_tmp]) \n\t" "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"
#endif
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"srlg %%r0,%[n],5 \n\t"
"vlr %%v1,%%v0 \n\t"
".align 16 \n\t"
"1: \n\t"
#if defined(PREFETCH_INS)
"pfd 1, 256(%[x_tmp]) \n\t"
"pfd 2, 256(%[y_tmp]) \n\t"
#endif
"vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
"vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
"vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
"vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
"vlm %%v16,%%v23, 128(%[y_tmp]) \n\t"
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
"la %[x_tmp],256(%[x_tmp]) \n\t"
"vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
"la %[y_tmp],256(%[y_tmp]) \n\t"
"brctg %%r0,1b"
: [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y)
: [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha)
:"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"
"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{ {
@ -131,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 ) if ( n1 )
daxpy_kernel_32(n1, x, y , da ); daxpy_kernel_32(n1, x, y , &da);
i = n1; i = n1;
while(i < n) while(i < n)

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -23,95 +23,28 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#if defined(Z13mvc) static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__ volatile (
"lgr %%r1,%1 \n\t"
__asm__ volatile( "lgr %%r2,%2 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t" "srlg %%r0,%0,5 \n\t"
"pfd 2, 0(%[ptr_y]) \n\t" "0: \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t" "pfd 1, 1024(%%r1) \n\t"
".align 16 \n\t" "pfd 2, 1024(%%r2) \n\t"
"1: \n\t" "mvc 0(256,%%r2),0(%%r1) \n\t"
"mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t" "agfi %%r1,256 \n\t"
"la %[ptr_x],256(%[ptr_x]) \n\t" "agfi %%r2,256 \n\t"
"la %[ptr_y],256(%[ptr_y]) \n\t" "brctg %%r0,0b "
"brctg %[n_tmp],1b" :
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n), :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
[ptr_x] "+&a"(x), [ptr_y] "+&a"(y) :"memory","cc","r0","r1","r2"
: [mem_x] "m" (*(const double (*)[n])x) );
: "cc"
);
return;
} }
#else
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
: [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v24","v25","v26","v27"
);
return;
}
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0; BLASLONG i = 0;
@ -136,21 +69,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
} else { } else {
BLASLONG n1 = n & -4;
while (i < n1) {
y[iy] = x[ix];
y[iy + inc_y] = x[ix + inc_x];
y[iy + 2 * inc_y] = x[ix + 2 * inc_x];
y[iy + 3 * inc_y] = x[ix + 3 * inc_x];
ix += inc_x * 4;
iy += inc_y * 4;
i += 4;
}
while (i < n) { while (i < n) {
y[iy] = x[ix]; y[iy] = x[ix];
@ -165,5 +83,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
} }

View File

@ -25,116 +25,59 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
#if defined(Z13)
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{ {
FLOAT dot; FLOAT dot;
__asm__ volatile(
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" __asm__ volatile (
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" "vzero %%v0 \n\t"
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" "srlg %%r0,%1,4 \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" "xgr %%r1,%%r1 \n\t"
"0: \n\t"
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" "pfd 1,1024(%%r1,%2) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" "pfd 1,1024(%%r1,%3) \n\t"
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" "vl %%v16,0(%%r1,%2) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" "vl %%v17,16(%%r1,%2) \n\t"
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" "vl %%v18,32(%%r1,%2) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" "vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" "vl %%v22,96(%%r1,%2) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" "vl %%v23,112(%%r1,%2) \n\t"
"vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"la %%r1,128(%%r1) \n\t"
"brctg %[n_tmp],1b \n\t"
"vfadb %%v24,%%v25,%%v24 \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v24,%%v27,%%v24 \n\t"
"vrepg %%v1,%%v24,1 \n\t"
"vfadb %%v1,%%v24,%%v1 \n\t"
"ldr %[dot], %%f1 \n\t"
: [dot] "=f"(dot) ,[n_tmp] "+&r"(n)
: [mem_x] "m"( *(const double (*)[n])x),
[mem_y] "m"( *(const double (*)[n])y),
[ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y)
:"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23",
"v24","v25","v26","v27","v28","v29","v30","v31"
); "vl %%v24,0(%%r1,%3) \n\t"
return dot; "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
/*
 * Scalar fallback: returns the sum of x[i] * y[i] for i in [0, n).
 *
 * Each loop iteration consumes 16 consecutive elements (indices i .. i+15),
 * so n is expected to be a multiple of 16; otherwise the last iteration
 * reads past index n - 1.
 */
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y )
{
BLASLONG register i = 0;
FLOAT dot = 0.0;
while(i < n)
{
/* First half of the 16-element chunk: products for i .. i+7. */
dot += y[i] * x[i]
+ y[i+1] * x[i+1]
+ y[i+2] * x[i+2]
+ y[i+3] * x[i+3]
+ y[i+4] * x[i+4]
+ y[i+5] * x[i+5]
+ y[i+6] * x[i+6]
+ y[i+7] * x[i+7] ;
/* Second half of the chunk: products for i+8 .. i+15. */
dot += y[i+8] * x[i+8]
+ y[i+9] * x[i+9]
+ y[i+10] * x[i+10]
+ y[i+11] * x[i+11]
+ y[i+12] * x[i+12]
+ y[i+13] * x[i+13]
+ y[i+14] * x[i+14]
+ y[i+15] * x[i+15] ;
i+=16 ;
}
return dot; return dot;
} }
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -148,13 +91,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{ {
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 ){
dot = ddot_kernel_16(n1, x, y );
i = n1;
}
if ( n1 )
dot = ddot_kernel_16(n1, x, y);
i = n1;
while(i < n) while(i < n)
{ {

View File

@ -25,186 +25,392 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#define NBMAX 2048 #define NBMAX 2048
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
#ifdef HAVE_KERNEL_4x4
#elif HAVE_KERNEL_4x4_VEC
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{ {
BLASLONG i; __asm__ volatile (
FLOAT x0,x1,x2,x3; "vlrepg %%v0,0(%5) \n\t"
x0 = xo[0] * *alpha; "vlrepg %%v1,8(%5) \n\t"
x1 = xo[1] * *alpha; "vlrepg %%v2,16(%5) \n\t"
x2 = xo[2] * *alpha; "vlrepg %%v3,24(%5) \n\t"
x3 = xo[3] * *alpha; "vlrepg %%v4,%7 \n\t"
__vector double v_x0 = {x0,x0}; "vfmdb %%v0,%%v0,%%v4 \n\t"
__vector double v_x1 = {x1,x1}; "vfmdb %%v1,%%v1,%%v4 \n\t"
__vector double v_x2 = {x2,x2}; "vfmdb %%v2,%%v2,%%v4 \n\t"
__vector double v_x3 = {x3,x3}; "vfmdb %%v3,%%v3,%%v4 \n\t"
__vector double* v_y =(__vector double*)y; "xgr %%r1,%%r1 \n\t"
__vector double* va0 = (__vector double*)ap[0];
__vector double* va1 = (__vector double*)ap[1];
__vector double* va2 = (__vector double*)ap[2];
__vector double* va3 = (__vector double*)ap[3];
for ( i=0; i< n/2; i+=2 ) "lghi %%r0,-16 \n\t"
{ "ngr %%r0,%0 \n\t"
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; "ltgr %%r0,%%r0 \n\t"
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; "jz 1f \n\t"
}
}
#else "srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) "vl %%v16,0(%%r1,%1) \n\t"
{ "vl %%v17,0(%%r1,%2) \n\t"
BLASLONG i; "vl %%v18,0(%%r1,%3) \n\t"
FLOAT *a0,*a1,*a2,*a3; "vl %%v19,0(%%r1,%4) \n\t"
FLOAT x[4] __attribute__ ((aligned (16))); "vl %%v20,16(%%r1,%1) \n\t"
a0 = ap[0]; "vl %%v21,16(%%r1,%2) \n\t"
a1 = ap[1]; "vl %%v22,16(%%r1,%3) \n\t"
a2 = ap[2]; "vl %%v23,16(%%r1,%4) \n\t"
a3 = ap[3]; "vl %%v24,32(%%r1,%1) \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
for ( i=0; i<4; i++) "vl %%v4,0(%%r1,%6) \n\t"
x[i] = xo[i] * *alpha; "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"
for ( i=0; i< n; i+=4 ) "vl %%v4,16(%%r1,%6) \n\t"
{ "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; "vst %%v4,16(%%r1,%6) \n\t"
}
}
"vl %%v4,32(%%r1,%6) \n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,32(%%r1,%6) \n\t"
#endif "vl %%v4,48(%%r1,%6) \n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,48(%%r1,%6) \n\t"
#ifdef HAVE_KERNEL_4x2 "vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,64(%%r1,%2) \n\t"
"vl %%v18,64(%%r1,%3) \n\t"
"vl %%v19,64(%%r1,%4) \n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
#elif HAVE_KERNEL_4x2_VEC "vl %%v4,64(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,64(%%r1,%6) \n\t"
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) "vl %%v4,80(%%r1,%6) \n\t"
{ "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
BLASLONG i; "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
FLOAT x0,x1; "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
x0 = xo[0] * *alpha; "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
x1 = xo[1] * *alpha; "vst %%v4,80(%%r1,%6) \n\t"
__vector double v_x0 = {x0,x0};
__vector double v_x1 = {x1,x1};
__vector double* v_y =(__vector double*)y;
__vector double* va0 = (__vector double*)ap[0];
__vector double* va1 = (__vector double*)ap[1];
for ( i=0; i< n/2; i+=2 ) "vl %%v4,96(%%r1,%6) \n\t"
{ "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
} "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
} "vst %%v4,96(%%r1,%6) \n\t"
#else
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) "vl %%v4,112(%%r1,%6) \n\t"
{ "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
BLASLONG i; "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
FLOAT *a0,*a1; "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
FLOAT x[4] __attribute__ ((aligned (16))); "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
a0 = ap[0]; "vst %%v4,112(%%r1,%6) \n\t"
a1 = ap[1];
for ( i=0; i<2; i++)
x[i] = xo[i] * *alpha;
for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0] + a1[i]*x[1];
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1];
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1];
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1];
}
}
#endif
#ifdef HAVE_KERNEL_4x1
#elif HAVE_KERNEL_4x1_VEC
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0;
x0 = xo[0] * *alpha;
__vector double v_x0 = {x0,x0};
__vector double* v_y =(__vector double*)y;
__vector double* va0 = (__vector double*)ap;
for ( i=0; i< n/2; i+=2 )
{
v_y[i] += v_x0 * va0[i] ;
v_y[i+1] += v_x0 * va0[i+1] ;
}
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"vl %%v4,16(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#else static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{ {
BLASLONG i; __asm__ volatile (
FLOAT *a0; "vlrepg %%v0,0(%3) \n\t"
FLOAT x[4] __attribute__ ((aligned (16))); "vlrepg %%v1,8(%3) \n\t"
a0 = ap; "vlrepg %%v2,%5 \n\t"
"vfmdb %%v0,%%v0,%%v2 \n\t"
"vfmdb %%v1,%%v1,%%v2 \n\t"
"xgr %%r1,%%r1 \n\t"
for ( i=0; i<1; i++) "lghi %%r0,-16 \n\t"
x[i] = xo[i] * *alpha; "ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
for ( i=0; i< n; i+=4 ) "srlg %%r0,%%r0,4 \n\t"
{ "0: \n\t"
y[i] += a0[i]*x[0]; "pfd 1,1024(%%r1,%1) \n\t"
y[i+1] += a0[i+1]*x[0]; "pfd 1,1024(%%r1,%2) \n\t"
y[i+2] += a0[i+2]*x[0]; "pfd 2,1024(%%r1,%4) \n\t"
y[i+3] += a0[i+3]*x[0];
} "vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v20,32(%%r1,%1) \n\t"
"vl %%v21,32(%%r1,%2) \n\t"
"vl %%v22,48(%%r1,%1) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"vl %%v2,32(%%r1,%4) \n\t"
"vfmadb %%v2,%%v20,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v21,%%v1,%%v2 \n\t"
"vst %%v2,32(%%r1,%4) \n\t"
"vl %%v2,48(%%r1,%4) \n\t"
"vfmadb %%v2,%%v22,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v23,%%v1,%%v2 \n\t"
"vst %%v2,48(%%r1,%4) \n\t"
"vl %%v2,64(%%r1,%4) \n\t"
"vfmadb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"
"vl %%v2,80(%%r1,%4) \n\t"
"vfmadb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"
"vl %%v2,96(%%r1,%4) \n\t"
"vfmadb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"
"vl %%v2,112(%%r1,%4) \n\t"
"vfmadb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
/*
 * Single-column update: y[i] += a0[i] * (xo[0] * *alpha).
 *
 * The element count is split in two phases inside the asm:
 *   - (n & -16) elements are processed 16 doubles (128 bytes) per iteration;
 *   - (n & 12) elements are then processed 4 doubles (32 bytes) per iteration.
 * The low two bits of n are not handled here, so n is expected to be a
 * multiple of 4.
 *
 * Operand numbering used in the asm text:
 *   %0 = n, %1 = a0, %2 = xo, %3 = y, %4 = *alpha.
 *
 * The clobber list also names v24..v31, which this body does not reference.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* v0 = xo[0] replicated; v1 = *alpha replicated; v0 = xo[0] * alpha. */
"vlrepg %%v0,0(%2) \n\t"
"vlrepg %%v1,%4 \n\t"
"vfmdb %%v0,%%v0,%%v1 \n\t"
/* r1 = 0: running byte offset into a0 and y. */
"xgr %%r1,%%r1 \n\t"
/* r0 = n & -16; skip the main loop when that is zero. */
#endif "lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
/* r0 = number of 16-element iterations. */
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
/* Prefetch 1024 bytes ahead: a0 for fetch access (1), y for store access (2). */
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* Load eight 16-byte vectors (16 doubles) of a0. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%1) \n\t"
"vl %%v22,96(%%r1,%1) \n\t"
"vl %%v23,112(%%r1,%1) \n\t"
/* For each vector of y: load, y = a0 * v0 + y, store back. v1 is reused as the y scratch register. */
"vl %%v1,0(%%r1,%3) \n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"vl %%v1,32(%%r1,%3) \n\t"
"vfmadb %%v1,%%v18,%%v0,%%v1 \n\t"
"vst %%v1,32(%%r1,%3) \n\t"
"vl %%v1,48(%%r1,%3) \n\t"
"vfmadb %%v1,%%v19,%%v0,%%v1 \n\t"
"vst %%v1,48(%%r1,%3) \n\t"
"vl %%v1,64(%%r1,%3) \n\t"
"vfmadb %%v1,%%v20,%%v0,%%v1 \n\t"
"vst %%v1,64(%%r1,%3) \n\t"
"vl %%v1,80(%%r1,%3) \n\t"
"vfmadb %%v1,%%v21,%%v0,%%v1 \n\t"
"vst %%v1,80(%%r1,%3) \n\t"
"vl %%v1,96(%%r1,%3) \n\t"
"vfmadb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"
"vl %%v1,112(%%r1,%3) \n\t"
"vfmadb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
/* Advance the offset by 128 bytes; decrement r0 and loop while non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* Tail: r0 = n & 12; skip when zero, otherwise r0 = number of 4-element iterations. */
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
/* Same update as the main loop, two vectors (4 doubles) per iteration. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{ {
BLASLONG i; BLASLONG i;
for (i = 0; i < n; i++)
for ( i=0; i<n; i++ ){ {
*dest += *src; *dest += src[i];
src++; dest += inc_dest;
dest += inc_dest;
} }
return;
} }
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{ {
BLASLONG i; BLASLONG i;
BLASLONG j;
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *x_ptr; FLOAT *x_ptr;
FLOAT *y_ptr; FLOAT *y_ptr;
@ -282,8 +488,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
if ( n2 & 1 ) if ( n2 & 1 )
{ {
dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda; /* a_ptr += lda;
x_ptr += 1; x_ptr += 1; */
} }

View File

@ -25,178 +25,460 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
#define NBMAX 2048 #define NBMAX 2048
#ifdef HAVE_KERNEL_4x4
#elif HAVE_KERNEL_4x4_VEC
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{ {
BLASLONG i; __asm__ volatile (
__vector double* va0 = (__vector double*)ap[0]; "vzero %%v0 \n\t"
__vector double* va1 = (__vector double*)ap[1]; "vzero %%v1 \n\t"
__vector double* va2 = (__vector double*)ap[2]; "vzero %%v2 \n\t"
__vector double* va3 = (__vector double*)ap[3]; "vzero %%v3 \n\t"
__vector double* v_x =(__vector double*)x; "xgr %%r1,%%r1 \n\t"
__vector double temp0 = {0,0};
__vector double temp1 = {0,0};
__vector double temp2 = {0,0};
__vector double temp3 = {0,0};
for ( i=0; i< n/2; i+=2 ) "lghi %%r0,-16 \n\t"
{ "ngr %%r0,%0 \n\t"
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; "ltgr %%r0,%%r0 \n\t"
temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; "jz 1f \n\t"
temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ;
temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; "srlg %%r0,%%r0,4 \n\t"
} "0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
y[0] = temp0[0] + temp0[1]; "pfd 1,1024(%%r1,%2) \n\t"
y[1] = temp1[0] + temp1[1]; "pfd 1,1024(%%r1,%3) \n\t"
y[2] = temp2[0] + temp2[1]; "pfd 1,1024(%%r1,%4) \n\t"
y[3] = temp3[0] + temp3[1];; "pfd 1,1024(%%r1,%5) \n\t"
"vl %%v16,0(%%r1,%5) \n\t"
"vl %%v17,16(%%r1,%5) \n\t"
"vl %%v18,32(%%r1,%5) \n\t"
"vl %%v19,48(%%r1,%5) \n\t"
"vl %%v20,64(%%r1,%5) \n\t"
"vl %%v21,80(%%r1,%5) \n\t"
"vl %%v22,96(%%r1,%5) \n\t"
"vl %%v23,112(%%r1,%5) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,0(%%r1,%3) \n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
"vl %%v27,0(%%r1,%4) \n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"
"vl %%v28,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
"vl %%v29,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
"vl %%v30,16(%%r1,%3) \n\t"
"vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
"vl %%v31,16(%%r1,%4) \n\t"
"vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v24,%%v0 \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vfmadb %%v1,%%v18,%%v25,%%v1 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2 \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vfmadb %%v3,%%v18,%%v27,%%v3 \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v28,%%v0 \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vfmadb %%v1,%%v19,%%v29,%%v1 \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vfmadb %%v2,%%v19,%%v30,%%v2 \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
"vfmadb %%v3,%%v19,%%v31,%%v3 \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"
"vl %%v26,64(%%r1,%3) \n\t"
"vfmadb %%v2,%%v20,%%v26,%%v2 \n\t"
"vl %%v27,64(%%r1,%4) \n\t"
"vfmadb %%v3,%%v20,%%v27,%%v3 \n\t"
"vl %%v28,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vfmadb %%v1,%%v21,%%v29,%%v1 \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vfmadb %%v2,%%v21,%%v30,%%v2 \n\t"
"vl %%v31,80(%%r1,%4) \n\t"
"vfmadb %%v3,%%v21,%%v31,%%v3 \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v24,%%v0 \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vfmadb %%v1,%%v22,%%v25,%%v1 \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vfmadb %%v2,%%v22,%%v26,%%v2 \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vfmadb %%v3,%%v22,%%v27,%%v3 \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v28,%%v0 \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vfmadb %%v1,%%v23,%%v29,%%v1 \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vfmadb %%v2,%%v23,%%v30,%%v2 \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vfmadb %%v3,%%v23,%%v31,%%v3 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%5) \n\t"
"vl %%v17,16(%%r1,%5) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,0(%%r1,%3) \n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
"vl %%v27,0(%%r1,%4) \n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"
"vl %%v28,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
"vl %%v29,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
"vl %%v30,16(%%r1,%3) \n\t"
"vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
"vl %%v31,16(%%r1,%4) \n\t"
"vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepg %%v4,%%v0,1 \n\t"
"adbr %%f0,%%f4 \n\t"
"std %%f0,0(%6) \n\t"
"vrepg %%v4,%%v1,1 \n\t"
"adbr %%f1,%%f4 \n\t"
"std %%f1,8(%6) \n\t"
"vrepg %%v4,%%v2,1 \n\t"
"adbr %%f2,%%f4 \n\t"
"std %%f2,16(%6) \n\t"
"vrepg %%v4,%%v3,1 \n\t"
"adbr %%f3,%%f4 \n\t"
"std %%f3,24(%6) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#else
/*
 * Scalar fallback: four simultaneous dot products against the same vector.
 *
 * For j = 0..3, y[j] is overwritten with the sum over i in [0, n) of
 * ap[j][i] * x[i].
 *
 * Rows are consumed four at a time (indices row .. row+3), so n is expected
 * to be a multiple of 4; otherwise the last pass reads past index n - 1.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    /* Base pointers of the four columns being reduced. */
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];
    FLOAT *col2 = ap[2];
    FLOAT *col3 = ap[3];

    /* One running sum per column. */
    FLOAT sum0 = 0.0;
    FLOAT sum1 = 0.0;
    FLOAT sum2 = 0.0;
    FLOAT sum3 = 0.0;

    BLASLONG row = 0;

    while (row < n)
    {
        /*
         * Each partial sum is formed left to right over the four products
         * and only then added to the accumulator, matching the original
         * floating-point evaluation order.
         */
        sum0 += col0[row] * x[row] + col0[row + 1] * x[row + 1] + col0[row + 2] * x[row + 2] + col0[row + 3] * x[row + 3];
        sum1 += col1[row] * x[row] + col1[row + 1] * x[row + 1] + col1[row + 2] * x[row + 2] + col1[row + 3] * x[row + 3];
        sum2 += col2[row] * x[row] + col2[row + 1] * x[row + 1] + col2[row + 2] * x[row + 2] + col2[row + 3] * x[row + 3];
        sum3 += col3[row] * x[row] + col3[row + 1] * x[row + 1] + col3[row + 2] * x[row + 2] + col3[row + 3] * x[row + 3];
        row += 4;
    }

    y[0] = sum0;
    y[1] = sum1;
    y[2] = sum2;
    y[3] = sum3;
}
#endif
#ifdef HAVE_KERNEL_4x2
#elif HAVE_KERNEL_4x2_VEC
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{ {
BLASLONG i; __asm__ volatile (
__vector double* va0 = (__vector double*)ap[0]; "vzero %%v0 \n\t"
__vector double* va1 = (__vector double*)ap[1]; "vzero %%v1 \n\t"
__vector double* v_x =(__vector double*)x; "xgr %%r1,%%r1 \n\t"
__vector double temp0 = {0,0};
__vector double temp1 = {0,0};
for ( i=0; i< n/2; i+=2 ) "lghi %%r0,-16 \n\t"
{ "ngr %%r0,%0 \n\t"
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; "ltgr %%r0,%%r0 \n\t"
temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; "jz 1f \n\t"
}
"srlg %%r0,%%r0,4 \n\t"
y[0] = temp0[0] + temp0[1]; "0: \n\t"
y[1] = temp1[0] + temp1[1]; "pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"
"vl %%v28,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v28,%%v0 \n\t"
"vl %%v29,32(%%r1,%2) \n\t"
"vfmadb %%v1,%%v18,%%v29,%%v1 \n\t"
"vl %%v30,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v30,%%v0 \n\t"
"vl %%v31,48(%%r1,%2) \n\t"
"vfmadb %%v1,%%v19,%%v31,%%v1 \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v26,%%v0 \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vfmadb %%v1,%%v21,%%v27,%%v1 \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v28,%%v0 \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vfmadb %%v1,%%v22,%%v29,%%v1 \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmadb %%v1,%%v23,%%v31,%%v1 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"adbr %%f0,%%f2 \n\t"
"std %%f0,0(%4) \n\t"
"vrepg %%v2,%%v1,1 \n\t"
"adbr %%f1,%%f2 \n\t"
"std %%f1,8(%4) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#else
/*
 * Scalar fallback: two simultaneous dot products against the same vector.
 *
 * y[0] is overwritten with the sum over i in [0, n) of ap[0][i] * x[i],
 * and y[1] with the same sum taken over ap[1].
 *
 * Rows are consumed four at a time (indices row .. row+3), so n is expected
 * to be a multiple of 4; otherwise the last pass reads past index n - 1.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    /* Base pointers of the two columns being reduced. */
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];

    /* One running sum per column. */
    FLOAT sum0 = 0.0;
    FLOAT sum1 = 0.0;

    BLASLONG row = 0;

    while (row < n)
    {
        /*
         * Partial sums are formed left to right before being added to the
         * accumulator, matching the original floating-point evaluation order.
         */
        sum0 += col0[row] * x[row] + col0[row + 1] * x[row + 1] + col0[row + 2] * x[row + 2] + col0[row + 3] * x[row + 3];
        sum1 += col1[row] * x[row] + col1[row + 1] * x[row + 1] + col1[row + 2] * x[row + 2] + col1[row + 3] * x[row + 3];
        row += 4;
    }

    y[0] = sum0;
    y[1] = sum1;
}
#endif
#ifdef HAVE_KERNEL_4x1
#elif HAVE_KERNEL_4x1_VEC
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{ {
BLASLONG i; __asm__ volatile (
__vector double* va0 = (__vector double*)a0; "vzero %%v0 \n\t"
__vector double* v_x =(__vector double*)x; "xgr %%r1,%%r1 \n\t"
__vector double temp0 = {0,0};
for ( i=0; i< n/2; i+=2 ) "lghi %%r0,-16 \n\t"
{ "ngr %%r0,%0 \n\t"
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; "ltgr %%r0,%%r0 \n\t"
} "jz 1f \n\t"
y[0] = temp0[0] + temp0[1];
}
#else
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT temp0 = 0.0;
for ( i=0; i< n; i+=4 ) "srlg %%r0,%%r0,4 \n\t"
{ "0: \n\t"
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; "pfd 1,1024(%%r1,%1) \n\t"
} "pfd 1,1024(%%r1,%2) \n\t"
y[0] = temp0;
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"std %%f0,0(%3) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#endif
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{ {
BLASLONG i; BLASLONG i;
for ( i=0; i<n; i++ ) for (i = 0; i < n; i++)
{ {
*dest = *src; dest[i] = *src;
dest++; src += inc_src;
src += inc_src; }
}
} }
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{ {
__asm__ volatile (
"vlrepg %%v0,%1 \n\t"
"xgr %%r1,%%r1 \n\t"
"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"
"vl %%v26, 32(%%r1,%3) \n\t"
"vfmadb %%v26,%%v18,%%v0,%%v26 \n\t"
"vst %%v26, 32(%%r1,%3) \n\t"
"vl %%v27, 48(%%r1,%3) \n\t"
"vfmadb %%v27,%%v19,%%v0,%%v27 \n\t"
"vst %%v27, 48(%%r1,%3) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vfmadb %%v28,%%v20,%%v0,%%v28 \n\t"
"vst %%v28, 64(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vfmadb %%v29,%%v21,%%v0,%%v29 \n\t"
"vst %%v29, 80(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vfmadb %%v30,%%v22,%%v0,%%v30 \n\t"
"vst %%v30, 96(%%r1,%3) \n\t"
"vl %%v31, 112(%%r1,%3) \n\t"
"vfmadb %%v31,%%v23,%%v0,%%v31 \n\t"
"vst %%v31, 112(%%r1,%3) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"
"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
if (inc_dest == 1)
add_y_kernel_4(n, da, src, dest);
else
{
BLASLONG i; BLASLONG i;
for (i = 0; i < n; i++)
for ( i=0; i<n; i++ )
{ {
*dest += src[i] * da; *dest += src[i] * da;
dest += inc_dest; dest += inc_dest;
} }
return; }
} }
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@ -212,7 +494,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG m2; BLASLONG m2;
BLASLONG m3; BLASLONG m3;
BLASLONG n2; BLASLONG n2;
FLOAT ybuffer[4],*xbuffer; FLOAT ybuffer[2] __attribute__ ((aligned(16)));
FLOAT *xbuffer;
FLOAT *ytemp; FLOAT *ytemp;
if ( m < 1 ) return(0); if ( m < 1 ) return(0);
@ -234,7 +517,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
while ( NB == NBMAX ) while ( NB == NBMAX )
{ {
m1 -= NB; m1 -= NB;
if ( m1 < 0) if ( m1 < 0)
{ {
@ -319,9 +601,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
{ {
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += lda; // a_ptr += lda;
*y_ptr += ybuffer[0] * alpha; *y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y; // y_ptr += inc_y;
} }
a += NB; a += NB;

159
kernel/zarch/dmax.c Normal file
View File

@ -0,0 +1,159 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Return the largest element of x[0..n-1] using the vector floating-point
 * maximum instructions (vfmaxdb / wfmaxdb).
 *
 * n  number of elements to scan.  The loop count is n >> 5 and the loop body
 *    is entered before the count is tested, so n must be a positive multiple
 *    of 32 (the caller in this file passes n & -32 only when that is > 0).
 * x  input vector; 256 bytes (16 vector registers of 16 bytes) are read per
 *    iteration.  Presumably FLOAT is double here, matching the "db"
 *    instruction forms.
 *
 * Register use: r0 = remaining iterations, r1 = byte offset into x,
 * v0 = running maximum (two lanes), v16-v31 = scratch.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT max;

    __asm__ volatile (
        /* seed the two-lane running maximum with the first 16 bytes of x */
        "vl %%v0,0(%2) \n\t"
        /* r0 = n / 32 iterations, r1 = 0 byte offset */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* prefetch (code 1) 1024 bytes ahead of the current offset */
        "pfd 1, 1024(%%r1,%2) \n\t"
        /* load 32 consecutive elements into v16..v31 */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"
        /* pairwise reduction tree: 16 -> 8 registers */
        "vfmaxdb %%v16,%%v16,%%v24,0 \n\t"
        "vfmaxdb %%v17,%%v17,%%v25,0 \n\t"
        "vfmaxdb %%v18,%%v18,%%v26,0 \n\t"
        "vfmaxdb %%v19,%%v19,%%v27,0 \n\t"
        "vfmaxdb %%v20,%%v20,%%v28,0 \n\t"
        "vfmaxdb %%v21,%%v21,%%v29,0 \n\t"
        "vfmaxdb %%v22,%%v22,%%v30,0 \n\t"
        "vfmaxdb %%v23,%%v23,%%v31,0 \n\t"
        /* 8 -> 4 */
        "vfmaxdb %%v16,%%v16,%%v20,0 \n\t"
        "vfmaxdb %%v17,%%v17,%%v21,0 \n\t"
        "vfmaxdb %%v18,%%v18,%%v22,0 \n\t"
        "vfmaxdb %%v19,%%v19,%%v23,0 \n\t"
        /* 4 -> 2 */
        "vfmaxdb %%v16,%%v16,%%v18,0 \n\t"
        "vfmaxdb %%v17,%%v17,%%v19,0 \n\t"
        /* 2 -> 1 */
        "vfmaxdb %%v16,%%v16,%%v17,0 \n\t"
        /* fold this block's result into the running maximum
           (operand was misspelled "%%16"; it must name vector register v16) */
        "vfmaxdb %%v0,%%v0,%%v16,0 \n\t"
        /* advance 256 bytes; brctg decrements r0 and loops while non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* horizontal step: copy lane 1 of v0 into v16, then scalar max */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfmaxdb %%v0,%%v0,%%v16,0 \n\t"
        /* move the scalar result (f0) into the output operand */
        "ldr %0,%%f0 "
        :"=f"(max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return max;
}
/*
 * Maximum element of a strided vector.
 *
 * n      number of elements to examine
 * x      first element of the vector
 * inc_x  distance, in elements, between consecutive entries
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  With unit stride the leading
 * multiple-of-32 part is delegated to dmax_kernel_32() and the tail is
 * scanned here; any other stride is scanned here, four entries per pass.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT best;
    BLASLONG pos;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return 0.0;

    if (inc_x != 1) {
        /* strided access: four entries per pass, then the leftovers */
        const BLASLONG quads = n & -4;

        best = x[0];
        pos = 0;
        for (seen = 0; seen < quads; seen += 4) {
            if (x[pos] > best)
                best = x[pos];
            if (x[pos + inc_x] > best)
                best = x[pos + inc_x];
            if (x[pos + 2 * inc_x] > best)
                best = x[pos + 2 * inc_x];
            if (x[pos + 3 * inc_x] > best)
                best = x[pos + 3 * inc_x];
            pos += inc_x * 4;
        }
        for (; seen < n; seen++) {
            if (x[pos] > best)
                best = x[pos];
            pos += inc_x;
        }
        return best;
    }

    /* contiguous access: vector kernel first, scalar loop for the tail */
    {
        const BLASLONG blocked = n & -32;

        if (blocked > 0) {
            best = dmax_kernel_32(blocked, x);
            pos = blocked;
        } else {
            best = x[0];
            pos = 1;
        }
    }
    for (; pos < n; pos++) {
        if (x[pos] > best)
            best = x[pos];
    }
    return best;
}

180
kernel/zarch/dmax_z13.c Normal file
View File

@ -0,0 +1,180 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Return the largest element of x[0..n-1] using vector compare-high
 * (vfchdb) followed by vector select (vsel) as the max operation.
 *
 * n  number of elements; the loop runs n >> 5 times and is entered before
 *    the count is tested, so n must be a positive multiple of 32 (the caller
 *    in this file passes n & -32 only when that is > 0).
 * x  input vector; 256 bytes are read per loop iteration, in two halves of
 *    128 bytes each.
 *
 * Register use: r0 = remaining iterations, r1 = byte offset into x,
 * v0 = running maximum (two lanes), v16-v31 = scratch.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT max;
__asm__ volatile (
/* seed the running maximum with the first 16 bytes of x */
"vl %%v0,0(%2) \n\t"
/* r0 = n / 32 iterations, r1 = 0 byte offset */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half: load bytes 0..127 of the current block */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* mask = (a > b) per lane; the vsel that follows keeps a where the mask
   is set and b elsewhere, i.e. the lane-wise maximum of each pair */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
/* reduce 4 -> 2 */
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
/* reduce 2 -> 1 */
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* fold into the running maximum in v0 */
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* second half: same reduction over bytes 128..255 of the block */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* advance 256 bytes; brctg decrements r0 and loops while it is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: copy lane 1 of v0 into v16, compare against lane 0
   and keep the larger value in lane 0 of v0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
/* move the scalar result (f0) into the output operand */
"ldr %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return max;
}
/*
 * Maximum element of a strided vector.
 *
 * n      number of elements to examine
 * x      first element of the vector
 * inc_x  distance, in elements, between consecutive entries
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  With unit stride the leading
 * multiple-of-32 part is delegated to dmax_kernel_32() and the tail is
 * scanned here; any other stride is scanned here, four entries per pass.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT best;
    BLASLONG pos;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return 0.0;

    if (inc_x != 1) {
        /* strided access: four entries per pass, then the leftovers */
        const BLASLONG quads = n & -4;

        best = x[0];
        pos = 0;
        for (seen = 0; seen < quads; seen += 4) {
            if (x[pos] > best)
                best = x[pos];
            if (x[pos + inc_x] > best)
                best = x[pos + inc_x];
            if (x[pos + 2 * inc_x] > best)
                best = x[pos + 2 * inc_x];
            if (x[pos + 3 * inc_x] > best)
                best = x[pos + 3 * inc_x];
            pos += inc_x * 4;
        }
        for (; seen < n; seen++) {
            if (x[pos] > best)
                best = x[pos];
            pos += inc_x;
        }
        return best;
    }

    /* contiguous access: vector kernel first, scalar loop for the tail */
    {
        const BLASLONG blocked = n & -32;

        if (blocked > 0) {
            best = dmax_kernel_32(blocked, x);
            pos = blocked;
        } else {
            best = x[0];
            pos = 1;
        }
    }
    for (; pos < n; pos++) {
        if (x[pos] > best)
            best = x[pos];
    }
    return best;
}

159
kernel/zarch/dmin.c Normal file
View File

@ -0,0 +1,159 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Return the smallest element of x[0..n-1] using the vector floating-point
 * minimum instructions (vfmindb / wfmindb).
 *
 * n  number of elements to scan.  The loop count is n >> 5 and the loop body
 *    is entered before the count is tested, so n must be a positive multiple
 *    of 32 (the caller in this file passes n & -32 only when that is > 0).
 * x  input vector; 256 bytes (16 vector registers of 16 bytes) are read per
 *    iteration.  Presumably FLOAT is double here, matching the "db"
 *    instruction forms.
 *
 * Register use: r0 = remaining iterations, r1 = byte offset into x,
 * v0 = running minimum (two lanes), v16-v31 = scratch.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT min;

    __asm__ volatile (
        /* seed the two-lane running minimum with the first 16 bytes of x */
        "vl %%v0,0(%2) \n\t"
        /* r0 = n / 32 iterations, r1 = 0 byte offset */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* prefetch (code 1) 1024 bytes ahead of the current offset */
        "pfd 1, 1024(%%r1,%2) \n\t"
        /* load 32 consecutive elements into v16..v31 */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"
        /* pairwise reduction tree: 16 -> 8 registers */
        "vfmindb %%v16,%%v16,%%v24,0 \n\t"
        "vfmindb %%v17,%%v17,%%v25,0 \n\t"
        "vfmindb %%v18,%%v18,%%v26,0 \n\t"
        "vfmindb %%v19,%%v19,%%v27,0 \n\t"
        "vfmindb %%v20,%%v20,%%v28,0 \n\t"
        "vfmindb %%v21,%%v21,%%v29,0 \n\t"
        "vfmindb %%v22,%%v22,%%v30,0 \n\t"
        "vfmindb %%v23,%%v23,%%v31,0 \n\t"
        /* 8 -> 4 */
        "vfmindb %%v16,%%v16,%%v20,0 \n\t"
        "vfmindb %%v17,%%v17,%%v21,0 \n\t"
        "vfmindb %%v18,%%v18,%%v22,0 \n\t"
        "vfmindb %%v19,%%v19,%%v23,0 \n\t"
        /* 4 -> 2 */
        "vfmindb %%v16,%%v16,%%v18,0 \n\t"
        "vfmindb %%v17,%%v17,%%v19,0 \n\t"
        /* 2 -> 1 */
        "vfmindb %%v16,%%v16,%%v17,0 \n\t"
        /* fold this block's result into the running minimum
           (operand was misspelled "%%16"; it must name vector register v16) */
        "vfmindb %%v0,%%v0,%%v16,0 \n\t"
        /* advance 256 bytes; brctg decrements r0 and loops while non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* horizontal step: copy lane 1 of v0 into v16, then scalar min */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfmindb %%v0,%%v0,%%v16,0 \n\t"
        /* move the scalar result (f0) into the output operand */
        "ldr %0,%%f0 "
        :"=f"(min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return min;
}
/*
 * Minimum element of a strided vector.
 *
 * n      number of elements to examine
 * x      first element of the vector
 * inc_x  distance, in elements, between consecutive entries
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  With unit stride the leading
 * multiple-of-32 part is delegated to dmin_kernel_32() and the tail is
 * scanned here; any other stride is scanned here, four entries per pass.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT best;
    BLASLONG pos;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return 0.0;

    if (inc_x != 1) {
        /* strided access: four entries per pass, then the leftovers */
        const BLASLONG quads = n & -4;

        best = x[0];
        pos = 0;
        for (seen = 0; seen < quads; seen += 4) {
            if (x[pos] < best)
                best = x[pos];
            if (x[pos + inc_x] < best)
                best = x[pos + inc_x];
            if (x[pos + 2 * inc_x] < best)
                best = x[pos + 2 * inc_x];
            if (x[pos + 3 * inc_x] < best)
                best = x[pos + 3 * inc_x];
            pos += inc_x * 4;
        }
        for (; seen < n; seen++) {
            if (x[pos] < best)
                best = x[pos];
            pos += inc_x;
        }
        return best;
    }

    /* contiguous access: vector kernel first, scalar loop for the tail */
    {
        const BLASLONG blocked = n & -32;

        if (blocked > 0) {
            best = dmin_kernel_32(blocked, x);
            pos = blocked;
        } else {
            best = x[0];
            pos = 1;
        }
    }
    for (; pos < n; pos++) {
        if (x[pos] < best)
            best = x[pos];
    }
    return best;
}

180
kernel/zarch/dmin_z13.c Normal file
View File

@ -0,0 +1,180 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Return the smallest element of x[0..n-1] using vector compare-high
 * (vfchdb) followed by vector select (vsel) as the min operation.
 *
 * n  number of elements; the loop runs n >> 5 times and is entered before
 *    the count is tested, so n must be a positive multiple of 32 (the caller
 *    in this file passes n & -32 only when that is > 0).
 * x  input vector; 256 bytes are read per loop iteration, in two halves of
 *    128 bytes each.
 *
 * Register use: r0 = remaining iterations, r1 = byte offset into x,
 * v0 = running minimum (two lanes), v16-v31 = scratch.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT min;
__asm__ volatile (
/* seed the running minimum with the first 16 bytes of x */
"vl %%v0,0(%2) \n\t"
/* r0 = n / 32 iterations, r1 = 0 byte offset */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half: load bytes 0..127 of the current block */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* compare operands are swapped relative to the select: mask = (b > a),
   and the vsel that follows keeps a where the mask is set and b elsewhere,
   i.e. the lane-wise minimum of each pair */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
/* reduce 4 -> 2 */
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
/* reduce 2 -> 1 */
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
/* fold into the running minimum in v0 */
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* second half: same reduction over bytes 128..255 of the block */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
/* advance 256 bytes; brctg decrements r0 and loops while it is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: copy lane 1 of v0 into v16, compare against lane 0
   and keep the smaller value in lane 0 of v0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
/* move the scalar result (f0) into the output operand */
"ldr %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return min;
}
/*
 * Minimum element of a strided vector.
 *
 * n      number of elements to examine
 * x      first element of the vector
 * inc_x  distance, in elements, between consecutive entries
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  With unit stride the leading
 * multiple-of-32 part is delegated to dmin_kernel_32() and the tail is
 * scanned here; any other stride is scanned here, four entries per pass.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT best;
    BLASLONG pos;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return 0.0;

    if (inc_x != 1) {
        /* strided access: four entries per pass, then the leftovers */
        const BLASLONG quads = n & -4;

        best = x[0];
        pos = 0;
        for (seen = 0; seen < quads; seen += 4) {
            if (x[pos] < best)
                best = x[pos];
            if (x[pos + inc_x] < best)
                best = x[pos + inc_x];
            if (x[pos + 2 * inc_x] < best)
                best = x[pos + 2 * inc_x];
            if (x[pos + 3 * inc_x] < best)
                best = x[pos + 3 * inc_x];
            pos += inc_x * 4;
        }
        for (; seen < n; seen++) {
            if (x[pos] < best)
                best = x[pos];
            pos += inc_x;
        }
        return best;
    }

    /* contiguous access: vector kernel first, scalar loop for the tail */
    {
        const BLASLONG blocked = n & -32;

        if (blocked > 0) {
            best = dmin_kernel_32(blocked, x);
            pos = blocked;
        } else {
            best = x[0];
            pos = 1;
        }
    }
    for (; pos < n; pos++) {
        if (x[pos] < best)
            best = x[pos];
    }
    return best;
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{ {
__asm__ ( __asm__ (
"pfd 2, 0(%[ptr_x]) \n\t" "vlrepg %%v0,%3 \n\t"
"pfd 2, 0(%[ptr_y]) \n\t" "vlrepg %%v1,%4 \n\t"
"lgdr %%r1,%[cos] \n\t" "srlg %%r0,%0,5 \n\t"
"vlvgp %%v0,%%r1,%%r1 \n\t" "xgr %%r1,%%r1 \n\t"
"lgdr %%r1,%[sin] \n\t" "0: \n\t"
"vlvgp %%v1,%%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%1) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t" "pfd 2, 1024(%%r1,%2) \n\t"
"xgr %%r1,%%r1 \n\t" "vl %%v24, 0(%%r1,%1) \n\t"
".align 16 \n\t" "vl %%v25, 16(%%r1,%1) \n\t"
"1: \n\t" "vl %%v26, 32(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t" "vl %%v27, 48(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t" "vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t" "vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t" "vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t" "vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t" "vfmdb %%v28,%%v24,%%v0 \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t" "vfmdb %%v29,%%v25,%%v0 \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t" "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t" "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v28, 0(%%r1,%1) \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v29, 16(%%r1,%1) \n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v30, 32(%%r1,%1) \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t" "vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t" "vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t" "vst %%v23, 48(%%r1,%2) \n\t"
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t" "vl %%v24, 64(%%r1,%1) \n\t"
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t" "vl %%v25, 80(%%r1,%1) \n\t"
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t" "vl %%v26, 96(%%r1,%1) \n\t"
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t" "vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t" "vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t" "vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t" "vl %%v19, 112(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t" "vfmdb %%v28,%%v24,%%v0 \n\t"
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t" "vfmdb %%v29,%%v25,%%v0 \n\t"
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t" "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vl %%v19, 112(%%r1,%[ptr_y]) \n\t" "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v28, 64(%%r1,%1) \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v29, 80(%%r1,%1) \n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v30, 96(%%r1,%1) \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t" "vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t" "vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t" "vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t" "vl %%v24, 128(%%r1,%1) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t" "vl %%v25, 144(%%r1,%1) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t" "vl %%v26, 160(%%r1,%1) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t" "vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t" "vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t" "vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t" "vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t" "vfmdb %%v28,%%v24,%%v0 \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t" "vfmdb %%v29,%%v25,%%v0 \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t" "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t" "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v28, 128(%%r1,%1) \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v29, 144(%%r1,%1) \n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v30, 160(%%r1,%1) \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t" "vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t" "vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t" "vst %%v23, 176(%%r1,%2) \n\t"
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t" "vl %%v24, 192(%%r1,%1) \n\t"
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t" "vl %%v25, 208(%%r1,%1) \n\t"
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t" "vl %%v26, 224(%%r1,%1) \n\t"
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t" "vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t" "vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t" "vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t" "vl %%v19, 240(%%r1,%2) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t" "vfmdb %%v28,%%v24,%%v0 \n\t"
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t" "vfmdb %%v29,%%v25,%%v0 \n\t"
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t" "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t" "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v28, 192(%%r1,%1) \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v29, 208(%%r1,%1) \n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v30, 224(%%r1,%1) \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t" "vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t" "vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t" "vst %%v23, 240(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t" "agfi %%r1,256 \n\t"
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t" "brctg %%r0,0b "
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t" :
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t" :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
"la %%r1,256(%%r1) \n\t" );
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
: "cc", "r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
FLOAT cosa,sina;
drot_kernel_32(n1, x, y, c, s); cosa=c;
sina=s;
drot_kernel_32(n1, x, y, &cosa, &sina);
i=n1; i=n1;
} }
@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }
} }
else else
{ {
@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }

View File

@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#ifdef Z13_A static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x)
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
{ {
__asm__ volatile (
"vlrepg %%v0,%1 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
}
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x)
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"srlg %[n],%[n],4 \n\t"
"vlr %%v1,%%v0 \n\t"
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"la %[x_ptr], 128(%[x_ptr]) \n\t"
"aghik %[n], %[n], -1 \n\t"
"jle 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vfmdb %%v24, %%v16, %%v0 \n\t"
"vfmdb %%v25, %%v17, %%v0 \n\t"
"vfmdb %%v26, %%v18, %%v0 \n\t"
"vfmdb %%v27, %%v19, %%v1 \n\t"
"vlm %%v16,%%v19, 0(%[x_ptr]) \n\t"
"vfmdb %%v28, %%v20, %%v0 \n\t"
"vfmdb %%v29, %%v21, %%v1 \n\t"
"vfmdb %%v30, %%v22, %%v0 \n\t"
"vfmdb %%v31, %%v23, %%v1 \n\t"
"vlm %%v20,%%v23, 64(%[x_ptr]) \n\t"
"lay %[x_ptr], -128(%[x_ptr]) \n\t"
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
"la %[x_ptr],256(%[x_ptr]) \n\t"
"brctg %[n],1b \n\t"
"2: \n\t"
"vfmdb %%v24, %%v16, %%v0 \n\t"
"vfmdb %%v25, %%v17, %%v1 \n\t"
"vfmdb %%v26, %%v18, %%v0 \n\t"
"vfmdb %%v27, %%v19, %%v1 \n\t"
"lay %[x_ptr] , -128(%[x_ptr]) \n\t"
"vfmdb %%v28, %%v20, %%v0 \n\t"
"vfmdb %%v29, %%v21, %%v1 \n\t"
"vfmdb %%v30, %%v22, %%v0 \n\t"
"vfmdb %%v31, %%v23, %%v1 \n\t"
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n)
: [alpha] "f"(da)
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
{ {
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
/* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ "vst %%v24,0(%%r1,%1) \n\t"
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t" "vst %%v25,16(%%r1,%1) \n\t"
"lgdr %%r0,%[alpha] \n\t" "vst %%v26,32(%%r1,%1) \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t" "vst %%v27,48(%%r1,%1) \n\t"
"vlr %%v1,%%v0 \n\t" "vst %%v24,64(%%r1,%1) \n\t"
"sllg %%r0,%[n],3 \n\t" "vst %%v25,80(%%r1,%1) \n\t"
"agr %%r0,%[x_ptr] \n\t" "vst %%v26,96(%%r1,%1) \n\t"
".align 16 \n\t" "vst %%v27,112(%%r1,%1) \n\t"
"1: \n\t"
"pfd 2, 256(%[x_ptr]) \n\t" "agfi %%r1,128 \n\t"
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" "brctg %%r0,0b "
"vfmdb %%v16,%%v16,%%v0 \n\t" :
"vfmdb %%v17,%%v17,%%v1 \n\t" :"r"(n),"ZR"((FLOAT (*)[n])x)
"vfmdb %%v18,%%v18,%%v0 \n\t" :"memory","cc","r0","r1","v24","v25","v26","v27"
"vfmdb %%v19,%%v19,%%v1 \n\t" );
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v1 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v1 \n\t"
"vstm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"vlm %%v24,%%v31,128(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vfmdb %%v25,%%v25,%%v1 \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vfmdb %%v27,%%v27,%%v1 \n\t"
"vfmdb %%v28,%%v28,%%v0 \n\t"
"vfmdb %%v29,%%v29,%%v1 \n\t"
"vfmdb %%v30,%%v30,%%v0 \n\t"
"vfmdb %%v31,%%v31,%%v1 \n\t"
"vstm %%v24,%%v31,128(%[x_ptr]) \n\t"
"la %[x_ptr], 256(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da)
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#endif
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
{
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"vzero %%v24 \n\t"
"sllg %%r0,%[n],3 \n\t"
"vzero %%v25 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%[x_ptr]) \n\t"
"vst %%v24, 0(%[x_ptr]) \n\t"
"vst %%v25, 16(%[x_ptr]) \n\t"
"vst %%v24, 32(%[x_ptr]) \n\t"
"vst %%v25, 48(%[x_ptr]) \n\t"
"vst %%v24, 64(%[x_ptr]) \n\t"
"vst %%v25, 80(%[x_ptr]) \n\t"
"vst %%v24, 96(%[x_ptr]) \n\t"
"vst %%v25, 112(%[x_ptr]) \n\t"
"vst %%v24, 128(%[x_ptr]) \n\t"
"vst %%v25, 144(%[x_ptr]) \n\t"
"vst %%v24, 160(%[x_ptr]) \n\t"
"vst %%v25, 176(%[x_ptr]) \n\t"
"vst %%v24, 192(%[x_ptr]) \n\t"
"vst %%v25, 208(%[x_ptr]) \n\t"
"vst %%v24, 224(%[x_ptr]) \n\t"
"vst %%v25, 240(%[x_ptr]) \n\t"
"la %[x_ptr],256(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n)
:"cc" , "r0", "v24" ,"v25"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{ {
BLASLONG i=0,j=0; BLASLONG i=0,j=0;
@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 ) if ( da == 0.0 )
{ {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
dscal_kernel_32_zero(n1 , x); dscal_kernel_16_zero(n1, x);
j=n1; j=n1;
} }
@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else else
{ {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
dscal_kernel_32(n1 , da , x); dscal_kernel_16(n1, da, x);
j=n1; j=n1;
} }
while(j < n) while(j < n)
@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
return 0; return 0;
} }

169
kernel/zarch/dsdot.c Normal file
View File

@ -0,0 +1,169 @@
/***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice,this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice,this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL
DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Inline-assembly dot-product kernel: multiplies x[i] by y[i] for the first
 * n elements and returns the sum as a double.
 *
 * Each element is read as a 4-byte value (the load offsets step by 4 and the
 * loop advances 64 bytes per 16 elements) and is lengthened before the
 * multiply-add, so products are accumulated in double precision.
 *
 * The loop runs n >> 4 times with a decrement-and-branch at the bottom, so
 * the caller is expected to pass a non-zero multiple of 16 (the driver below
 * passes n & -16 and skips the call when that is 0).
 *
 * Operands: %0 = dot (result), %1 = n, %2 = x, %3 = y.
 * NOTE(review): the instruction glosses below follow the z/Architecture
 * vector mnemonics; confirm against the Principles of Operation.
 */
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
double dot;
__asm__ volatile (
/* v0 = 0 (two-lane double accumulator); r0 = n / 16 (trip count); r1 = 0 (byte offset) */
"vzero %%v0 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch both inputs 1024 bytes ahead of the current offset */
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
/* load 16 consecutive 4-byte x values, two per register, into word slots 0 and 2 of v16..v23 */
"vlef %%v16,0(%%r1,%2),0 \n\t"
"vlef %%v16,4(%%r1,%2),2 \n\t"
"vlef %%v17,8(%%r1,%2),0 \n\t"
"vlef %%v17,12(%%r1,%2),2 \n\t"
"vlef %%v18,16(%%r1,%2),0 \n\t"
"vlef %%v18,20(%%r1,%2),2 \n\t"
"vlef %%v19,24(%%r1,%2),0 \n\t"
"vlef %%v19,28(%%r1,%2),2 \n\t"
"vlef %%v20,32(%%r1,%2),0 \n\t"
"vlef %%v20,36(%%r1,%2),2 \n\t"
"vlef %%v21,40(%%r1,%2),0 \n\t"
"vlef %%v21,44(%%r1,%2),2 \n\t"
"vlef %%v22,48(%%r1,%2),0 \n\t"
"vlef %%v22,52(%%r1,%2),2 \n\t"
"vlef %%v23,56(%%r1,%2),0 \n\t"
"vlef %%v23,60(%%r1,%2),2 \n\t"
/* lengthen the loaded x values from single to double (two doubles per register) */
"vflls %%v16,%%v16 \n\t"
"vflls %%v17,%%v17 \n\t"
"vflls %%v18,%%v18 \n\t"
"vflls %%v19,%%v19 \n\t"
"vflls %%v20,%%v20 \n\t"
"vflls %%v21,%%v21 \n\t"
"vflls %%v22,%%v22 \n\t"
"vflls %%v23,%%v23 \n\t"
/* for each pair of y values: load, lengthen, then v0 += x_pair * y_pair */
"vlef %%v24,0(%%r1,%3),0 \n\t"
"vlef %%v24,4(%%r1,%3),2 \n\t"
"vflls %%v24,%%v24 \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vlef %%v25,8(%%r1,%3),0 \n\t"
"vlef %%v25,12(%%r1,%3),2 \n\t"
"vflls %%v25,%%v25 \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vlef %%v26,16(%%r1,%3),0 \n\t"
"vlef %%v26,20(%%r1,%3),2 \n\t"
"vflls %%v26,%%v26 \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vlef %%v27,24(%%r1,%3),0 \n\t"
"vlef %%v27,28(%%r1,%3),2 \n\t"
"vflls %%v27,%%v27 \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vlef %%v28,32(%%r1,%3),0 \n\t"
"vlef %%v28,36(%%r1,%3),2 \n\t"
"vflls %%v28,%%v28 \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vlef %%v29,40(%%r1,%3),0 \n\t"
"vlef %%v29,44(%%r1,%3),2 \n\t"
"vflls %%v29,%%v29 \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vlef %%v30,48(%%r1,%3),0 \n\t"
"vlef %%v30,52(%%r1,%3),2 \n\t"
"vflls %%v30,%%v30 \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vlef %%v31,56(%%r1,%3),0 \n\t"
"vlef %%v31,60(%%r1,%3),2 \n\t"
"vflls %%v31,%%v31 \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
/* advance 64 bytes (16 elements) and loop while the decremented count is non-zero */
"agfi %%r1,64 \n\t"
"brctg %%r0,0b \n\t"
/* horizontal sum: copy lane 1 of v0 into v1, add it to lane 0, move the result out */
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
/* v2 and v3 are listed as clobbered although the template above never names them */
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return dot;
}
/*
 * DSDOT driver: returns the dot product of x and y, accumulated in double.
 *
 *   n      number of elements; n <= 0 yields 0.0
 *   x, y   input vectors
 *   inc_x  stride between consecutive x elements
 *   inc_y  stride between consecutive y elements
 *
 * For unit strides the leading n & -16 elements go through the vector kernel
 * dsdot_kernel_16 and the remainder is handled by a scalar loop. Any other
 * stride combination uses a two-way unrolled scalar loop plus a tail.
 *
 * Every scalar product is formed after widening both operands to double.
 * The vector kernel lengthens its inputs before multiplying, so multiplying
 * in FLOAT precision here would round each product to single precision and
 * make the result depend on which path handled an element.
 */
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    double dot = 0.0;

    if (n <= 0)
        return dot;

    if ((inc_x == 1) && (inc_y == 1)) {
        /* Largest multiple of 16 not exceeding n goes to the vector kernel. */
        BLASLONG n1 = n & -16;

        if (n1)
            dot = dsdot_kernel_16(n1, x, y);

        /* Scalar remainder (fewer than 16 elements). */
        for (i = n1; i < n; i++)
            dot += (double)y[i] * (double)x[i];

        return dot;
    }

    /* Strided case: two elements per iteration, then at most one left over. */
    BLASLONG n1 = n & -2;

    while (i < n1) {
        dot += (double)y[iy] * (double)x[ix]
             + (double)y[iy + inc_y] * (double)x[ix + inc_x];
        ix += inc_x * 2;
        iy += inc_y * 2;
        i += 2;
    }

    while (i < n) {
        dot += (double)y[iy] * (double)x[ix];
        ix += inc_x;
        iy += inc_y;
        i++;
    }

    return dot;
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
#if defined(Z13_SWAP_A)
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__ volatile( __asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t" "srlg %%r0,%0,5 \n\t"
"pfd 2, 0(%[ptr_y]) \n\t" "xgr %%r1,%%r1 \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t" "0: \n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%1) \n\t"
".align 16 \n\t" "pfd 2, 1024(%%r1,%2) \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t" "vl %%v16, 0(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t" "vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t" "vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t" "vl %%v20, 64(%%r1,%1) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t" "vl %%v21, 80(%%r1,%1) \n\t"
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t" "vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t" "vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t" "vl %%v1, 16(%%r1,%2) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t" "vl %%v2, 32(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t" "vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t" "vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t" "vl %%v1, 144(%%r1,%2) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t" "vl %%v2, 160(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t" "vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t" "vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t" "vl %%v6, 224(%%r1,%2) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t" "vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t" "vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t" "vst %%v2, 160(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t" "vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t" "vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t" "vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t" "vst %%v7, 240(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#else
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
} }

311
kernel/zarch/icamax.c Normal file
View File

@ -0,0 +1,311 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS: absolute-value function chosen by build precision (fabs when DOUBLE is defined, fabsf otherwise). */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x,i): |x[i]| + |x[i + 1]|, the sum of the absolute values of two
 * adjacent array entries. The callers in this file pass the offset of a
 * real part, so x[i + 1] is the matching imaginary part.
 * The arguments are not parenthesised and i is expanded twice, so pass only
 * simple, side-effect-free expressions.
 */
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
/*
 * Inline-assembly kernel that scans n interleaved (re, im) single-precision
 * pairs starting at x for the largest |re| + |im|.
 *
 * Returns the 0-based position of that element (the driver adds 1) and
 * stores the winning value through amax. The loop runs n >> 5 times with a
 * decrement-and-branch at the bottom and handles 32 pairs (256 bytes) per
 * trip, so the caller is expected to pass a non-zero multiple of 32 (the
 * driver passes n & -32 and only calls when that is > 0).
 *
 * Operands: %0 = iamax (result), %1 = *amax (memory), %2 = n, %3 = x.
 *
 * Register roles, as read from the template:
 *   v0        running maxima, four single-precision lanes
 *   v1 / v2   64-bit indices belonging to lanes 0,2 / lanes 1,3 of v0
 *   v3        constant 16 (index step per half-iteration)
 *   v4        base index of the 16-pair group being processed
 *   v9        byte-permute pattern that gathers the real parts
 *   v24..v27  per-lane offsets 0..15 inside a 16-pair group
 *
 * NOTE(review): the instruction glosses below follow the z/Architecture
 * vector mnemonics (the single-precision forms presumably require the z14
 * vector enhancements); confirm against the Principles of Operation.
 */
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;
__asm__ volatile (
/* seed v0 with |re| + |im| of the first four pairs: real parts to v0, imaginary parts to v1 */
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v0,8(%3),1 \n\t"
"vlef %%v1,12(%3),1 \n\t"
"vlef %%v0,16(%3),2 \n\t"
"vlef %%v1,20(%3),2 \n\t"
"vlef %%v0,24(%3),3 \n\t"
"vlef %%v1,28(%3),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v1,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
/* seed indices: v1 = {0, 2}, v2 = {1, 3}; v3 = 16 in every doubleword; v4 = 0 */
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,16 \n\t"
"vzero %%v4 \n\t"
/* v9 = byte selectors 0-3, 8-11, 16-19, 24-27: words 0 and 2 of each permute source */
"vleib %%v9,0,0 \n\t"
"vleib %%v9,1,1 \n\t"
"vleib %%v9,2,2 \n\t"
"vleib %%v9,3,3 \n\t"
"vleib %%v9,8,4 \n\t"
"vleib %%v9,9,5 \n\t"
"vleib %%v9,10,6 \n\t"
"vleib %%v9,11,7 \n\t"
"vleib %%v9,16,8 \n\t"
"vleib %%v9,17,9 \n\t"
"vleib %%v9,18,10 \n\t"
"vleib %%v9,19,11 \n\t"
"vleib %%v9,24,12 \n\t"
"vleib %%v9,25,13 \n\t"
"vleib %%v9,26,14 \n\t"
"vleib %%v9,27,15 \n\t"
/* v24..v27 = word constants {0..3}, {4..7}, {8..11}, {12..15} */
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
/* r0 = n / 32 (trip count); r1 = 0 (byte offset into x) */
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
/* first half: bytes 0..127. Each register pair holds four (re, im) pairs; */
/* vperm with v9 gathers the real parts, vpkg keeps the low word of each doubleword (imaginary parts) */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v28,16(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v29,48(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"
/* absolute values, then |re| + |im| into v16..v19 (four candidates each) */
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
/* tournament: reduce the four candidate vectors to v16, carrying the matching */
/* lane offsets from v24..v27 into v5; on ties (>=) the first operand is kept */
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
/* widen the word offsets to doublewords (v6: lanes 1,3; v5: lanes 0,2) and add the group base v4 */
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
/* merge into the running maxima: keep v0 where v0 >= candidate, and widen the */
/* compare mask to doublewords so the 64-bit indices in v1 / v2 follow the same choice */
"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
/* group base index += 16 */
"vag %%v4,%%v4,%%v3 \n\t"
/* second half: bytes 128..255, same sequence as above */
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
/* advance 256 bytes and loop while the decremented count is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* reduce four lanes to two: compare lanes 0,2 against lanes 1,3 (shifted into place in v3); */
/* keep the even lane when it is greater, or when equal and the odd lane's index is larger */
"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v0,%%v3 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
/* reduce two to one: v2 = value from word 2, v3 = its index; compare against element 0 */
"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
/* values equal: store the value and return the smaller of the two indices */
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
/* values differ: select whichever is higher together with its index, then store and return */
"wfchsb %%v4,%%v2,%%v0 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamax;
}
/*
 * ICAMAX driver: returns the 1-based position of the first complex element
 * of x whose |re| + |im| is largest, or 0 when n <= 0 or inc_x <= 0.
 *
 *   n      number of complex elements
 *   x      interleaved (re, im) values
 *   inc_x  stride between consecutive complex elements
 *
 * With unit stride the leading n & -32 elements are handed to
 * icamax_kernel_32; everything else is scanned with CABS1. A later element
 * replaces the current best only when it is strictly greater.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;      /* 0-based position of the current maximum */
    FLOAT best_val = 0;     /* |re| + |im| at that position */
    BLASLONG pos;           /* next complex element to examine */
    BLASLONG off;           /* offset of that element's real part in x */
    BLASLONG step;          /* distance in FLOATs between examined elements */
    BLASLONG vec_n;         /* elements covered by the vector kernel */

    if (n <= 0 || inc_x <= 0)
        return best;

    step = 2 * inc_x;
    vec_n = (inc_x == 1) ? (n & -32) : 0;

    if (vec_n > 0) {
        /* Vector kernel covers the first vec_n elements. */
        best = icamax_kernel_32(vec_n, x, &best_val);
        pos = vec_n;
        off = vec_n * 2;
    } else {
        /* Seed the scan with element 0. */
        best_val = CABS1(x,0);
        pos = 1;
        off = step;
    }

    for (; pos < n; pos++, off += step) {
        if (CABS1(x,off) > best_val) {
            best = pos;
            best_val = CABS1(x,off);
        }
    }

    return (best + 1);
}

311
kernel/zarch/icamin.c Normal file
View File

@ -0,0 +1,311 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS: absolute-value function chosen by build precision (fabs when DOUBLE is defined, fabsf otherwise). */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x,i): |x[i]| + |x[i + 1]|, the sum of the absolute values of two
 * adjacent array entries.
 * The arguments are not parenthesised and i is expanded twice, so pass only
 * simple, side-effect-free expressions.
 */
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
/*
 * Vectorised core of the single-precision complex "index of minimum
 * |re| + |im|" search (unit stride).
 *
 * n     number of complex elements to scan. The loop counter is n >> 5 and
 *       every iteration consumes 256 bytes (32 complex floats); the only
 *       caller in this file passes n & -32 and only when that is > 0.
 * x     interleaved (re, im) float pairs.
 * amin  out: the minimum |re| + |im| found.
 *
 * Returns the zero-based element index of that minimum (the caller adds 1).
 *
 * NOTE(review): the per-instruction notes below reflect my reading of the
 * z/Architecture vector mnemonics; confirm against the Principles of
 * Operation before changing the instruction sequence.
 */
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;
__asm__ volatile (
/* Seed the running minimum with complex elements 0..3:
   v0 <- the four real parts, v1 <- the four imaginary parts. */
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v0,8(%3),1 \n\t"
"vlef %%v1,12(%3),1 \n\t"
"vlef %%v0,16(%3),2 \n\t"
"vlef %%v1,20(%3),2 \n\t"
"vlef %%v0,24(%3),3 \n\t"
"vlef %%v1,28(%3),3 \n\t"
/* v0 = |re| + |im| per lane: the running minimum (4 float lanes). */
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v1,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
/* Running 64-bit indices, split over two registers because four 64-bit
   values do not fit in one: v1 = {0,2} for float lanes 0 and 2,
   v2 = {1,3} for float lanes 1 and 3. */
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
/* v3 = 16 (elements handled per half-iteration), v4 = running base index. */
"vrepig %%v3,16 \n\t"
"vzero %%v4 \n\t"
/* v9 = byte-permute pattern picking 32-bit words 0 and 2 of each of the two
   source registers, i.e. the real parts of four consecutive complex values. */
"vleib %%v9,0,0 \n\t"
"vleib %%v9,1,1 \n\t"
"vleib %%v9,2,2 \n\t"
"vleib %%v9,3,3 \n\t"
"vleib %%v9,8,4 \n\t"
"vleib %%v9,9,5 \n\t"
"vleib %%v9,10,6 \n\t"
"vleib %%v9,11,7 \n\t"
"vleib %%v9,16,8 \n\t"
"vleib %%v9,17,9 \n\t"
"vleib %%v9,18,10 \n\t"
"vleib %%v9,19,11 \n\t"
"vleib %%v9,24,12 \n\t"
"vleib %%v9,25,13 \n\t"
"vleib %%v9,26,14 \n\t"
"vleib %%v9,27,15 \n\t"
/* v24..v27 = 32-bit local indices 0..15 of the elements inside a 16-element
   half-iteration, four per register. */
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
/* r0 = n / 32 loop trips, r1 = byte offset into x. */
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
/* ---- first half: complex elements 0..15 of this iteration ----
   Each register pair holds four complex values; vpkg gathers the odd words
   (imaginary parts), vperm with v9 gathers the even words (real parts). */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v28,16(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v29,48(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"
/* Absolute values of all real and imaginary parts. */
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
/* v16..v19 = |re| + |im| for elements 0-3, 4-7, 8-11, 12-15. */
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
/* Lane-wise tournament 16 -> 4. The mask is "later >= earlier", and vsel
   keeps the earlier operand where the mask is set, so ties keep the lower
   index. The value select and the index select share each mask. */
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
/* Widen the four 32-bit winner indices to 64 bits (v6 <- lanes 1,3;
   v5 <- lanes 0,2) and add the running base index. */
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
/* Merge into the running minimum: where candidate >= running, keep the
   running value and index. The 32-bit mask is widened the same way as the
   indices (v8 for lanes 1,3; v7 for lanes 0,2). */
"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
/* Advance the base index by 16. */
"vag %%v4,%%v4,%%v3 \n\t"
/* ---- second half: complex elements 16..31 (same sequence, +128 bytes) ---- */
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
/* Next 256 bytes; loop while the trip counter is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Horizontal reduction, step 1: fold lanes 1,3 onto lanes 0,2.
   v3 = v0 with the odd lanes moved into the even positions. The even-lane
   value is kept when the odd one is strictly greater, or when both are
   equal and the odd-lane index (v2) is larger than the even-lane index
   (v1), so the smaller index wins a tie. */
"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v3,%%v0 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
/* Step 2: compare the two remaining candidates (float lane 0 against
   float lane 2, with their indices in doublewords 0 and 1 of v1). */
"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
/* Equal values: store the value and return the smaller of the two indices. */
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
/* Different values: where lane 0 > lane 2, take lane 2's value and index. */
"wfchsb %%v4,%%v0,%%v2 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
/* NOTE(review): this relies on %f0 overlaying the leftmost part of %v0. */
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
/* %0 = iamin (result index), %1 = *amin (result value) */
:"=r"(iamin),"=m"(*amin)
/* %2 = n, %3 = x.
   NOTE(review): "ZR" appears to be an s390 address constraint; the
   array-pointer cast presumably documents the extent read, while the
   "memory" clobber is what actually orders the loads. Confirm. */
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamin;
}
/*
 * Index (1-based) of the complex element of x with the smallest
 * |re| + |im|; the first such element wins on ties.
 *
 * n      number of complex elements
 * x      interleaved (re, im) pairs
 * inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;      /* zero-based position of the current minimum */
    FLOAT best_abs = 0;     /* its |re| + |im| */
    BLASLONG pos;           /* element counter */
    BLASLONG off;           /* offset of element `pos` in FLOAT units */
    BLASLONG step;          /* FLOAT units between consecutive elements */

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x != 1) {
        /* Strided vector: plain scalar scan seeded with element 0. */
        best_abs = CABS1(x,0);
        step = 2 * inc_x;
        for (pos = 1, off = step; pos < n; pos++, off += step) {
            if (CABS1(x,off) < best_abs) {
                best = pos;
                best_abs = CABS1(x,off);
            }
        }
        return (best + 1);
    }

    /* Contiguous vector: the assembly kernel handles the leading multiple
       of 32 elements; the scalar loop below finishes the remainder. */
    BLASLONG bulk = n & -32;
    if (bulk > 0) {
        best = icamin_kernel_32(bulk, x, &best_abs);
        pos = bulk;
    } else {
        /* Fewer than 32 elements: seed with element 0 and scan from 1. */
        best_abs = CABS1(x,0);
        pos = 1;
    }

    for (off = 2 * pos; pos < n; pos++, off += 2) {
        if (CABS1(x,off) < best_abs) {
            best = pos;
            best_abs = CABS1(x,off);
        }
    }
    return (best + 1);
}

View File

@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else #else
#define ABS fabsf #define ABS fabsf
#endif #endif
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
/** {
* Find maximum index BLASLONG iamax;
* Warning: requirements n>0 and n % 32 == 0
* @param n
* @param x pointer to the vector
* @param maxf (out) maximum absolute value .( only for output )
* @return index
*/
static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
"vleig %%v21,3,1 \n\t"
"vleig %%v22,4,0 \n\t"
"vleig %%v22,5,1 \n\t"
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vzero %%v5 \n\t"
"vzero %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v29,%%v28 \n\t "
"vfchdb %%v17,%%v31,%%v30 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v28, %%v3,%%v0 \n\t" __asm__ volatile (
"vfchdb %%v29,%%v27, %%v25 \n\t" "vl %%v0,0(%3) \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t" "vflpdb %%v0,%%v0 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t" "vleig %%v1,0,0 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vleig %%v1,1,1 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t" "vrepig %%v2,16 \n\t"
"vag %%v1,%%v1,%%v5 \n\t" "vzero %%v3 \n\t"
"vag %%v24,%%v24,%%v5 \n\t" "vleig %%v24,0,0 \n\t"
"vag %%v24,%%v24,%%v4 \n\t" "vleig %%v24,1,1 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t" "vleig %%v25,2,0 \n\t"
"vag %%v5,%%v5,%%v4 \n\t" "vleig %%v25,3,1 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t" "vleig %%v26,4,0 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t" "vleig %%v26,5,1 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t" "vleig %%v27,6,0 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vleig %%v27,7,1 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t" "vleig %%v28,8,0 \n\t"
"vag %%v5,%%v5,%%v4 \n\t" "vleig %%v28,9,1 \n\t"
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" "vleig %%v29,10,0 \n\t"
"vflpdb %%v24, %%v24 \n\t" "vleig %%v29,11,1 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vleig %%v30,12,0 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vleig %%v30,13,1 \n\t"
"vflpdb %%v27, %%v27 \n\t" "vleig %%v31,14,0 \n\t"
"vflpdb %%v28, %%v28 \n\t" "vleig %%v31,15,1 \n\t"
"vflpdb %%v29, %%v29 \n\t" "srlg %%r0,%2,5 \n\t"
"vflpdb %%v30, %%v30 \n\t" "xgr %%r1,%%r1 \n\t"
"vflpdb %%v31, %%v31 \n\t" "0: \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t " "pfd 1, 1024(%%r1,%3) \n\t"
"vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v29,%%v28 \n\t "
"vfchdb %%v17,%%v31,%%v30 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v28, %%v3,%%v0 \n\t" "vl %%v16,0(%%r1,%3) \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t" "vl %%v17,16(%%r1,%3) \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t" "vl %%v18,32(%%r1,%3) \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t" "vl %%v19,48(%%r1,%3) \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vl %%v20,64(%%r1,%3) \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t" "vl %%v21,80(%%r1,%3) \n\t"
"vag %%v1,%%v1,%%v5 \n\t" "vl %%v22,96(%%r1,%3) \n\t"
"vag %%v24,%%v24,%%v5 \n\t" "vl %%v23,112(%%r1,%3) \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t" "vflpdb %%v16, %%v16 \n\t"
"vag %%v24,%%v24,%%v4 \n\t" "vflpdb %%v17, %%v17 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t" "vflpdb %%v18, %%v18 \n\t"
"vag %%v5,%%v5,%%v4 \n\t" "vflpdb %%v19, %%v19 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t" "vflpdb %%v20, %%v20 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t" "vflpdb %%v21, %%v21 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t" "vflpdb %%v22, %%v22 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vflpdb %%v23, %%v23 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t" "vfchedb %%v4,%%v16,%%v17 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t" "vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vrepg %%v26,%%v18,1 \n\t" "vfchedb %%v20,%%v16,%%v17 \n\t"
"vrepg %%v5,%%v19,1 \n\t" "vfchedb %%v21,%%v18,%%v19 \n\t"
"wfcdb %%v26,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"jne 2f \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsteg %%v18,%[maxf],0 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vmnlg %%v1,%%v5,%%v19 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"j 3f \n\t"
"2: \n\t" "vfchedb %%v18,%%v16,%%v17 \n\t"
"wfchdb %%v16,%%v26,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v1,%%v5,%%v19,%%v16 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t" "vag %%v4,%%v4,%%v3 \n\t"
"std %%f0,%[maxf] \n\t"
"3: \n\t"
"vlgvg %[index],%%v1,0 \n\t"
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return index;
"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG max = 0; BLASLONG max = 0;
@ -191,10 +200,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
max = diamax_kernel_32_TUNED(n1, x, &maxf); max = idamax_kernel_32(n1, x, &maxf);
i = n1; i = n1;
} }
else
{
maxf = ABS(x[0]);
i++;
}
while (i < n) { while (i < n) {
if (ABS(x[i]) > maxf) { if (ABS(x[i]) > maxf) {
@ -207,6 +221,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} else { } else {
max = 0;
maxf = ABS(x[0]);
BLASLONG n1 = n & -4; BLASLONG n1 = n & -4;
while (j < n1) { while (j < n1) {

View File

@ -23,194 +23,192 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else #else
#define ABS fabsf #define ABS fabsf
#endif #endif
/** static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
* Find minimum index {
* Warning: requirements n>0 and n % 32 == 0 BLASLONG iamin;
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value .( only for output )
* @return minimum index
*/
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
"vleig %%v21,3,1 \n\t"
"vleig %%v22,4,0 \n\t"
"vleig %%v22,5,1 \n\t"
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vlrepg %%v18,0(%[ptr_x]) \n\t"
"vzero %%v5 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t" __asm__ volatile (
"vflpdb %%v25, %%v25 \n\t" "vl %%v0,0(%3) \n\t"
"vflpdb %%v26, %%v26 \n\t" "vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v27, %%v27 \n\t" "vleig %%v1,0,0 \n\t"
"vflpdb %%v28, %%v28 \n\t" "vleig %%v1,1,1 \n\t"
"vflpdb %%v29, %%v29 \n\t" "vrepig %%v2,16 \n\t"
"vflpdb %%v30, %%v30 \n\t" "vzero %%v3 \n\t"
"vflpdb %%v31, %%v31 \n\t" "vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vfchdb %%v16,%%v24,%%v25 \n\t " "vl %%v16,0(%%r1,%3) \n\t"
"vfchdb %%v17,%%v26 ,%%v27 \n\t " "vl %%v17,16(%%r1,%3) \n\t"
"vsel %%v1,%%v21,%%v20,%%v16 \n\t" "vl %%v18,32(%%r1,%3) \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t" "vl %%v19,48(%%r1,%3) \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t" "vl %%v20,64(%%r1,%3) \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t" "vl %%v21,80(%%r1,%3) \n\t"
"vfchdb %%v16,%%v28, %%v29 \n\t " "vl %%v22,96(%%r1,%3) \n\t"
"vfchdb %%v17,%%v30,%%v31 \n\t" "vl %%v23,112(%%r1,%3) \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t" "vflpdb %%v16, %%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t" "vflpdb %%v17, %%v17 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t" "vflpdb %%v18, %%v18 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t" "vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchdb %%v28,%%v0 , %%v3 \n\t" "vfchedb %%v18,%%v17,%%v16 \n\t"
"vfchdb %%v29, %%v25,%%v27 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t" "vag %%v4,%%v4,%%v3 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t" "vfchedb %%v5,%%v16,%%v0 \n\t"
"vag %%v24,%%v24,%%v5 \n\t" "vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vag %%v24,%%v24,%%v4 \n\t" "vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t" "vl %%v16,128(%%r1,%3) \n\t"
"vag %%v5,%%v5,%%v4 \n\t" "vl %%v17,144(%%r1,%3) \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t" "vl %%v18,160(%%r1,%3) \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t" "vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v17,%%v18, %%v29 \n\t" "vfchedb %%v4,%%v17,%%v16 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vfchedb %%v5,%%v19,%%v18 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t" "vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vag %%v5,%%v5,%%v4 \n\t" "vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" "vfchedb %%v18,%%v17,%%v16 \n\t"
"vflpdb %%v24, %%v24 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vag %%v4,%%v4,%%v3 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v24,%%v25 \n\t" "vfchedb %%v5,%%v16,%%v0 \n\t"
"vfchdb %%v17,%%v26 ,%%v27 \n\t" "vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v21,%%v20,%%v16 \n\t" "vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t" "vag %%v3,%%v3,%%v2 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v28 ,%%v29 \n\t"
"vfchdb %%v17,%%v30,%%v31 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vfchdb %%v28,%%v0 , %%v3 \n\t" "vrepg %%v2,%%v0,1 \n\t"
"vfchdb %%v29, %%v25,%%v27 \n\t" "vrepg %%v3,%%v1,1 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t" "wfcdb %%v2,%%v0 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t" "jne 1f \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vsteg %%v0,%1,0 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t" "vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"vag %%v1,%%v1,%%v5 \n\t" "j 2f \n\t"
"vag %%v24,%%v24,%%v5 \n\t" "1: \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t" "wfchdb %%v4,%%v0,%%v2 \n\t"
"vag %%v24,%%v24,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t" "std %%f0,%1 \n\t"
"vag %%v5,%%v5,%%v4 \n\t" "vlgvg %0,%%v1,0 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t" "2: \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t" "nop "
:"=r"(iamin),"=m"(*amin)
"vfchdb %%v17,%%v18, %%v29 \n\t" :"r"(n),"ZR"((const FLOAT (*)[n])x)
"vsel %%v19,%%v28,%%v19,%%v17 \n\t" :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t" );
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
"vsteg %%v18,%[minf],0 \n\t"
"vmnlg %%v1,%%v5,%%v19 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v18 ,%%v26 \n\t "
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"std %%f0,%[minf] \n\t"
"3: \n\t"
"vlgvg %[index],%%v1,0 \n\t"
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return index;
return iamin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
BLASLONG ix = 0;
BLASLONG min = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0) return (min);
minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
min = diamin_kernel_32(n1, x, &minf); min = idamin_kernel_32(n1, x, &minf);
i = n1; i = n1;
} }
else
{
minf = ABS(x[0]);
i++;
}
while (i < n) { while (i < n) {
if (ABS(x[i]) < minf) { if (ABS(x[i]) < minf) {
@ -223,6 +221,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} else { } else {
min = 0;
minf = ABS(x[0]);
BLASLONG n1 = n & -4; BLASLONG n1 = n & -4;
while (j < n1) { while (j < n1) {

240
kernel/zarch/idmax.c Normal file
View File

@ -0,0 +1,240 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vectorised core of the double-precision "index of maximum value" search
 * (unit stride, signed comparison: no absolute value is taken here).
 *
 * n    number of elements to scan. The loop counter is n >> 5 and every
 *      iteration consumes 256 bytes (32 doubles); the only caller in this
 *      file passes n & -32 and only when that is > 0.
 * x    input vector.
 * max  out: the maximum value found.
 *
 * Returns the zero-based index of that maximum (the caller adds 1).
 *
 * NOTE(review): the per-instruction notes below reflect my reading of the
 * z/Architecture vector mnemonics; confirm against the Principles of
 * Operation before changing the instruction sequence.
 */
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
{
BLASLONG imax;
__asm__ volatile (
/* Running state, two 64-bit lanes each:
   v0 = running maximum, seeded with x[0], x[1]
   v1 = running indices, seeded with {0,1}
   v2 = {16,16} index step per half-iteration
   v3 = base index of the current half-iteration */
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
/* v24..v31 = local indices 0..15 of the elements in a 16-element half. */
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
/* r0 = n / 32 loop trips, r1 = byte offset into x. */
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
/* ---- first half: elements 0..15 of this iteration ---- */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* Lane-wise tournament 8 -> 4 registers. The mask is "earlier >= later",
   and vsel keeps the earlier operand where it is set, so ties keep the
   lower index. The value select and the index select share each mask. */
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
/* 4 -> 2 */
"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
/* 2 -> 1, then turn the local index into a global one. */
"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
/* Merge into the running maximum: keep the running value and index where
   running >= candidate, then advance the base index by 16. */
"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
/* ---- second half: elements 16..31 (same sequence, +128 bytes) ---- */
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
/* Next 256 bytes; loop while the trip counter is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Horizontal reduction of the two lanes: v2/v3 = lane 1 value/index
   replicated, compared against lane 0 of v0/v1. */
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
/* Equal values: store the value and return the smaller of the two indices. */
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
/* Different values: where lane 1 > lane 0, take lane 1's value and index. */
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
/* NOTE(review): this relies on %f0 overlaying the leftmost doubleword of %v0. */
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
/* %0 = imax (result index), %1 = *max (result value) */
:"=r"(imax),"=m"(*max)
/* %2 = n, %3 = x.
   NOTE(review): "ZR" appears to be an s390 address constraint; the
   array-pointer cast presumably documents the extent read, while the
   "memory" clobber is what actually orders the loads. Confirm. */
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return imax;
}
/*
 * Index (1-based) of the largest element of x (signed comparison); the
 * first such element wins on ties.
 *
 * n      number of elements
 * x      input vector
 * inc_x  stride between consecutive elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG off = 0;       /* offset into x (strided path) or position (unit stride) */
    BLASLONG pos = 0;       /* logical element position on the strided path */
    FLOAT best_val = 0.0;   /* largest value seen so far */
    BLASLONG best = 0;      /* its zero-based position */

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        /* Contiguous vector: the assembly kernel handles the leading
           multiple of 32 elements; the scalar loop finishes the rest. */
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = idmax_kernel_32(bulk, x, &best_val);
            off = bulk;
        } else {
            /* Fewer than 32 elements: seed with x[0] and scan from 1. */
            best_val = x[0];
            off = 1;
        }

        for (; off < n; off++) {
            if (x[off] > best_val) {
                best = off;
                best_val = x[off];
            }
        }
        return (best + 1);
    }

    /* Strided vector: scalar scan seeded with x[0], unrolled four ways. */
    best_val = x[0];
    BLASLONG unrolled = n & -4;

    for (; pos < unrolled; pos += 4, off += inc_x * 4) {
        if (x[off] > best_val) {
            best = pos;
            best_val = x[off];
        }
        if (x[off + inc_x] > best_val) {
            best = pos + 1;
            best_val = x[off + inc_x];
        }
        if (x[off + 2 * inc_x] > best_val) {
            best = pos + 2;
            best_val = x[off + 2 * inc_x];
        }
        if (x[off + 3 * inc_x] > best_val) {
            best = pos + 3;
            best_val = x[off + 3 * inc_x];
        }
    }

    /* Remaining 0..3 elements. */
    for (; pos < n; pos++, off += inc_x) {
        if (x[off] > best_val) {
            best = pos;
            best_val = x[off];
        }
    }
    return (best + 1);
}

240
kernel/zarch/idmin.c Normal file
View File

@ -0,0 +1,240 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector kernel: find the smallest element of a contiguous vector of
 * doubles and its position.
 *
 *   n    number of elements to scan. The trip count is n >> 5 and the loop
 *        is bottom-tested (brctg), so n must be a positive multiple of 32;
 *        the caller in this file passes (n & -32) only when it is > 0.
 *   x    input vector, unit stride.
 *   min  receives the smallest value found.
 *
 * Returns the 0-based position of that value; the caller adds 1.  Ties are
 * resolved towards the lower position (every compare is "later >= earlier
 * keeps earlier", and the final equal case takes the smaller index).
 *
 * Register roles inside the asm:
 *   v0        running minimum, one candidate per 64-bit lane
 *   v1        positions belonging to the candidates in v0
 *   v2        position step per 16-element half iteration (16)
 *   v3        base position of the half iteration being processed
 *   v16-v23   the 16 doubles of the current half iteration
 *   v24-v31   constant block-local positions 0..15
 *   r0 / r1   loop counter / byte offset into x
 */
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
{
BLASLONG imin;
__asm__ volatile (
/* Seed the running minimum with x[0..1] and their positions {0,1}. */
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
/* v2 = 16 in both lanes (position step), v3 = 0 (running base position). */
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
/* v24..v31 hold the block-local positions 0..15, two per register. */
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
/* r0 = n / 32 loop iterations, r1 = 0 byte offset. */
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch ahead, then load the first 16 doubles of this iteration. */
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* Tournament level 1: mask = (later >= earlier), so the earlier register
   wins ties; values and their local positions are selected in lockstep. */
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
/* Tournament level 2. */
"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
/* Tournament level 3: v16 = block minimum per lane, v4 = its local position. */
"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
/* Turn the local position into a global one by adding the base. */
"vag %%v4,%%v4,%%v3 \n\t"
/* Fold into the running minimum; mask = (block >= running) keeps the
   running (earlier) candidate on ties. */
"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
/* Advance the base position by 16 elements. */
"vag %%v3,%%v3,%%v2 \n\t"
/* Second half of the iteration: same tournament on the next 16 doubles. */
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
/* Step 256 bytes (32 doubles) and loop while r0 is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* Horizontal reduction: bring lane 1's value/position next to lane 0's. */
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
/* Both lanes hold the same value: store it and report the smaller position. */
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
/* Values differ: if lane 0 > lane 1, take lane 1's value and position. */
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(imin),"=m"(*min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return imin;
}
/*
 * Index of the smallest element of x (1-based).
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * multiple-of-32 part is handed to idmin_kernel_32 and the remainder is
 * scanned in scalar code; any other stride is scanned in scalar code,
 * four elements per outer step.  Only a strictly smaller value replaces the
 * current candidate, so the first occurrence wins.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;
    FLOAT best_val = 0.0;

    if (n <= 0 || inc_x <= 0)
        return 0;

    if (inc_x != 1) {
        /* Strided vector: k counts elements, off is the offset into x. */
        const BLASLONG unrolled = n & -4;
        BLASLONG k = 0;
        BLASLONG off = 0;

        best_val = x[0];

        for (; k < unrolled; k += 4, off += inc_x * 4) {
            for (BLASLONG lane = 0; lane < 4; lane++) {
                const FLOAT v = x[off + lane * inc_x];
                if (v < best_val) {
                    best = k + lane;
                    best_val = v;
                }
            }
        }

        for (; k < n; k++, off += inc_x) {
            if (x[off] < best_val) {
                best = k;
                best_val = x[off];
            }
        }

        return best + 1;
    }

    /* Contiguous vector: vector kernel on the leading block, scalar tail. */
    const BLASLONG head = n & -32;
    BLASLONG pos;

    if (head > 0) {
        best = idmin_kernel_32(head, x, &best_val);
        pos = head;
    } else {
        best_val = x[0];
        pos = 1;
    }

    for (; pos < n; pos++) {
        if (x[pos] < best_val) {
            best = pos;
            best_val = x[pos];
        }
    }

    return best + 1;
}

309
kernel/zarch/isamax.c Normal file
View File

@ -0,0 +1,309 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine used by the scalar code below: fabs when DOUBLE is
   defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel: find the element of largest absolute value in a contiguous
 * single-precision vector, together with its position.
 *
 *   n     number of elements to scan. The trip count is n >> 6 and the loop
 *         is bottom-tested (brctg), so n must be a positive multiple of 64;
 *         the caller in this file passes (n & -64) only when it is > 0.
 *   x     input vector, unit stride.
 *   amax  receives the largest absolute value found.
 *
 * Returns the 0-based position of that element; the caller adds 1.  Ties
 * are resolved towards the lower position.
 *
 * Register roles inside the asm:
 *   v0        running maximum, one candidate per 32-bit lane
 *   v1 / v2   64-bit positions of the candidates in the even / odd lanes
 *   v3        position step per 32-element half iteration (32); reused as
 *             scratch in the final reduction
 *   v4        base position of the half iteration being processed
 *   v16-v23   the 32 floats of the current half iteration
 *   v24-v31   constant block-local positions 0..31 (32-bit words)
 *   r0 / r1   loop counter / byte offset into x
 *
 * v3 is written by the asm and is therefore listed in the clobbers (it was
 * missing from the list before).
 */
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* Seed the running maximum with |x[0..3]|; positions {0,2} / {1,3}. */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = 32 (position step), v4 = 0 (running base position). */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 hold the block-local positions 0..31, four per register. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 loop iterations, r1 = 0 byte offset. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* Prefetch ahead, load the first 32 floats and take absolute values. */
        "pfd 1, 1024(%%r1,%3) \n\t"
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        /* Tournament level 1: mask = (earlier >= later), so the earlier
           register wins ties; values and local positions move in lockstep. */
        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vfchesb %%v7,%%v20,%%v21 \n\t"
        "vfchesb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        /* Tournament level 2. */
        "vfchesb %%v20,%%v16,%%v17 \n\t"
        "vfchesb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        /* Tournament level 3: v16 = block maximum per lane, v5 = its position. */
        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* Widen the 32-bit local positions to 64 bits (v5: even lanes,
           v6: odd lanes) and add the base to make them global. */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* Fold into the running maximum; mask = (running >= block) keeps the
           running (earlier) candidate on ties.  The 32-bit lane mask is
           widened to 64 bits for the even (v7) and odd (v8) position pairs. */
        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        /* Advance the base position by 32 elements. */
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Second half of the iteration: same tournament on the next 32 floats. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vfchesb %%v7,%%v20,%%v21 \n\t"
        "vfchesb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchesb %%v20,%%v16,%%v17 \n\t"
        "vfchesb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Step 256 bytes (64 floats) and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* Reduction step 1: merge odd lanes into even lanes.  v3 = odd-lane
           values shifted into the even positions.  The even lane wins when
           it is larger, or equal with the smaller position. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* Reduction step 2: compare the two remaining candidates
           (word 0 vs word 2 of v0, positions in v1[0] vs v1[1]). */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Equal values: store the value and report the smaller position. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Different values: take the second candidate if it is larger. */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamax),"=m"(*amax)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamax;
}
/*
 * Index of the element with the largest absolute value in x (1-based).
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * multiple-of-64 part is handed to isamax_kernel_64 and the remainder is
 * scanned in scalar code; any other stride is scanned in scalar code,
 * four elements per outer step.  Only a strictly larger magnitude replaces
 * the current candidate, so the first occurrence wins.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;
    FLOAT best_abs = 0.0;

    if (n <= 0 || inc_x <= 0)
        return 0;

    if (inc_x != 1) {
        /* Strided vector: k counts elements, off is the offset into x. */
        const BLASLONG unrolled = n & -4;
        BLASLONG k = 0;
        BLASLONG off = 0;

        best_abs = ABS(x[0]);

        for (; k < unrolled; k += 4, off += inc_x * 4) {
            for (BLASLONG lane = 0; lane < 4; lane++) {
                const FLOAT a = ABS(x[off + lane * inc_x]);
                if (a > best_abs) {
                    best = k + lane;
                    best_abs = a;
                }
            }
        }

        for (; k < n; k++, off += inc_x) {
            const FLOAT a = ABS(x[off]);
            if (a > best_abs) {
                best = k;
                best_abs = a;
            }
        }

        return best + 1;
    }

    /* Contiguous vector: vector kernel on the leading block, scalar tail. */
    const BLASLONG head = n & -64;
    BLASLONG pos;

    if (head > 0) {
        best = isamax_kernel_64(head, x, &best_abs);
        pos = head;
    } else {
        best_abs = ABS(x[0]);
        pos = 1;
    }

    for (; pos < n; pos++) {
        const FLOAT a = ABS(x[pos]);
        if (a > best_abs) {
            best = pos;
            best_abs = a;
        }
    }

    return best + 1;
}

309
kernel/zarch/isamin.c Normal file
View File

@ -0,0 +1,309 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine used by the scalar code below: fabs when DOUBLE is
   defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel: find the element of smallest absolute value in a
 * contiguous single-precision vector, together with its position.
 *
 *   n     number of elements to scan. The trip count is n >> 6 and the loop
 *         is bottom-tested (brctg), so n must be a positive multiple of 64;
 *         the caller in this file passes (n & -64) only when it is > 0.
 *   x     input vector, unit stride.
 *   amin  receives the smallest absolute value found.
 *
 * Returns the 0-based position of that element; the caller adds 1.  Ties
 * are resolved towards the lower position.
 *
 * Register roles inside the asm:
 *   v0        running minimum, one candidate per 32-bit lane
 *   v1 / v2   64-bit positions of the candidates in the even / odd lanes
 *   v3        position step per 32-element half iteration (32); reused as
 *             scratch in the final reduction
 *   v4        base position of the half iteration being processed
 *   v16-v23   the 32 floats of the current half iteration
 *   v24-v31   constant block-local positions 0..31 (32-bit words)
 *   r0 / r1   loop counter / byte offset into x
 *
 * v3 is written by the asm and is therefore listed in the clobbers (it was
 * missing from the list before).
 */
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* Seed the running minimum with |x[0..3]|; positions {0,2} / {1,3}. */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = 32 (position step), v4 = 0 (running base position). */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 hold the block-local positions 0..31, four per register. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 loop iterations, r1 = 0 byte offset. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* Prefetch ahead, load the first 32 floats and take absolute values. */
        "pfd 1, 1024(%%r1,%3) \n\t"
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        /* Tournament level 1: mask = (later >= earlier), so the earlier
           register wins ties; values and local positions move in lockstep. */
        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vfchesb %%v7,%%v21,%%v20 \n\t"
        "vfchesb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        /* Tournament level 2. */
        "vfchesb %%v20,%%v17,%%v16 \n\t"
        "vfchesb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        /* Tournament level 3: v16 = block minimum per lane, v5 = its position. */
        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* Widen the 32-bit local positions to 64 bits (v5: even lanes,
           v6: odd lanes) and add the base to make them global. */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* Fold into the running minimum; mask = (block >= running) keeps the
           running (earlier) candidate on ties.  The 32-bit lane mask is
           widened to 64 bits for the even (v7) and odd (v8) position pairs. */
        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        /* Advance the base position by 32 elements. */
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Second half of the iteration: same tournament on the next 32 floats. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vfchesb %%v7,%%v21,%%v20 \n\t"
        "vfchesb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchesb %%v20,%%v17,%%v16 \n\t"
        "vfchesb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Step 256 bytes (64 floats) and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* Reduction step 1: merge odd lanes into even lanes.  v3 = odd-lane
           values shifted into the even positions.  The even lane wins when
           it is smaller, or equal with the smaller position. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* Reduction step 2: compare the two remaining candidates
           (word 0 vs word 2 of v0, positions in v1[0] vs v1[1]). */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Equal values: store the value and report the smaller position. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Different values: take the second candidate if it is smaller. */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamin),"=m"(*amin)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamin;
}
/*
 * Index of the element with the smallest absolute value in x (1-based).
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * multiple-of-64 part is handed to isamin_kernel_64 and the remainder is
 * scanned in scalar code; any other stride is scanned in scalar code,
 * four elements per outer step.  Only a strictly smaller magnitude replaces
 * the current candidate, so the first occurrence wins.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;
    FLOAT best_abs = 0.0;

    if (n <= 0 || inc_x <= 0)
        return 0;

    if (inc_x != 1) {
        /* Strided vector: k counts elements, off is the offset into x. */
        const BLASLONG unrolled = n & -4;
        BLASLONG k = 0;
        BLASLONG off = 0;

        best_abs = ABS(x[0]);

        for (; k < unrolled; k += 4, off += inc_x * 4) {
            for (BLASLONG lane = 0; lane < 4; lane++) {
                const FLOAT a = ABS(x[off + lane * inc_x]);
                if (a < best_abs) {
                    best = k + lane;
                    best_abs = a;
                }
            }
        }

        for (; k < n; k++, off += inc_x) {
            const FLOAT a = ABS(x[off]);
            if (a < best_abs) {
                best = k;
                best_abs = a;
            }
        }

        return best + 1;
    }

    /* Contiguous vector: vector kernel on the leading block, scalar tail. */
    const BLASLONG head = n & -64;
    BLASLONG pos;

    if (head > 0) {
        best = isamin_kernel_64(head, x, &best_abs);
        pos = head;
    } else {
        best_abs = ABS(x[0]);
        pos = 1;
    }

    for (; pos < n; pos++) {
        const FLOAT a = ABS(x[pos]);
        if (a < best_abs) {
            best = pos;
            best_abs = a;
        }
    }

    return best + 1;
}

285
kernel/zarch/ismax.c Normal file
View File

@ -0,0 +1,285 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector kernel: find the largest element of a contiguous single-precision
 * vector, together with its position.
 *
 *   n    number of elements to scan. The trip count is n >> 6 and the loop
 *        is bottom-tested (brctg), so n must be a positive multiple of 64;
 *        the caller in this file passes (n & -64) only when it is > 0.
 *   x    input vector, unit stride.
 *   max  receives the largest value found.
 *
 * Returns the 0-based position of that element; the caller adds 1.  Ties
 * are resolved towards the lower position.
 *
 * Register roles inside the asm:
 *   v0        running maximum, one candidate per 32-bit lane
 *   v1 / v2   64-bit positions of the candidates in the even / odd lanes
 *   v3        position step per 32-element half iteration (32); reused as
 *             scratch in the final reduction
 *   v4        base position of the half iteration being processed
 *   v16-v23   the 32 floats of the current half iteration
 *   v24-v31   constant block-local positions 0..31 (32-bit words)
 *   r0 / r1   loop counter / byte offset into x
 *
 * v3 is written by the asm and is therefore listed in the clobbers (it was
 * missing from the list before).
 */
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
{
    BLASLONG imax;

    __asm__ volatile (
        /* Seed the running maximum with x[0..3]; positions {0,2} / {1,3}. */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = 32 (position step), v4 = 0 (running base position). */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 hold the block-local positions 0..31, four per register. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 loop iterations, r1 = 0 byte offset. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* Prefetch ahead, then load the first 32 floats of this iteration. */
        "pfd 1, 1024(%%r1,%3) \n\t"
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        /* Tournament level 1: mask = (earlier >= later), so the earlier
           register wins ties; values and local positions move in lockstep. */
        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vfchesb %%v7,%%v20,%%v21 \n\t"
        "vfchesb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        /* Tournament level 2. */
        "vfchesb %%v20,%%v16,%%v17 \n\t"
        "vfchesb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        /* Tournament level 3: v16 = block maximum per lane, v5 = its position. */
        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* Widen the 32-bit local positions to 64 bits (v5: even lanes,
           v6: odd lanes) and add the base to make them global. */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* Fold into the running maximum; mask = (running >= block) keeps the
           running (earlier) candidate on ties.  The 32-bit lane mask is
           widened to 64 bits for the even (v7) and odd (v8) position pairs. */
        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        /* Advance the base position by 32 elements. */
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Second half of the iteration: same tournament on the next 32 floats. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vfchesb %%v7,%%v20,%%v21 \n\t"
        "vfchesb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchesb %%v20,%%v16,%%v17 \n\t"
        "vfchesb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Step 256 bytes (64 floats) and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* Reduction step 1: merge odd lanes into even lanes.  v3 = odd-lane
           values shifted into the even positions.  The even lane wins when
           it is larger, or equal with the smaller position. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* Reduction step 2: compare the two remaining candidates
           (word 0 vs word 2 of v0, positions in v1[0] vs v1[1]). */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Equal values: store the value and report the smaller position. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Different values: take the second candidate if it is larger. */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imax),"=m"(*max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imax;
}
/*
 * Index of the largest element of x (1-based).
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * multiple-of-64 part is handed to ismax_kernel_64 and the remainder is
 * scanned in scalar code; any other stride is scanned in scalar code,
 * four elements per outer step.  Only a strictly larger value replaces the
 * current candidate, so the first occurrence wins.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;
    FLOAT best_val = 0.0;

    if (n <= 0 || inc_x <= 0)
        return 0;

    if (inc_x != 1) {
        /* Strided vector: k counts elements, off is the offset into x. */
        const BLASLONG unrolled = n & -4;
        BLASLONG k = 0;
        BLASLONG off = 0;

        best_val = x[0];

        for (; k < unrolled; k += 4, off += inc_x * 4) {
            for (BLASLONG lane = 0; lane < 4; lane++) {
                const FLOAT v = x[off + lane * inc_x];
                if (v > best_val) {
                    best = k + lane;
                    best_val = v;
                }
            }
        }

        for (; k < n; k++, off += inc_x) {
            if (x[off] > best_val) {
                best = k;
                best_val = x[off];
            }
        }

        return best + 1;
    }

    /* Contiguous vector: vector kernel on the leading block, scalar tail. */
    const BLASLONG head = n & -64;
    BLASLONG pos;

    if (head > 0) {
        best = ismax_kernel_64(head, x, &best_val);
        pos = head;
    } else {
        best_val = x[0];
        pos = 1;
    }

    for (; pos < n; pos++) {
        if (x[pos] > best_val) {
            best = pos;
            best_val = x[pos];
        }
    }

    return best + 1;
}

285
kernel/zarch/ismin.c Normal file
View File

@ -0,0 +1,285 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Vector kernel: find the smallest element of a contiguous single-precision
 * vector, together with its position.
 *
 *   n    number of elements to scan. The trip count is n >> 6 and the loop
 *        is bottom-tested (brctg), so n must be a positive multiple of 64;
 *        the caller in this file passes (n & -64) only when it is > 0.
 *   x    input vector, unit stride.
 *   min  receives the smallest value found.
 *
 * Returns the 0-based position of that element; the caller adds 1.  Ties
 * are resolved towards the lower position.
 *
 * Register roles inside the asm:
 *   v0        running minimum, one candidate per 32-bit lane
 *   v1 / v2   64-bit positions of the candidates in the even / odd lanes
 *   v3        position step per 32-element half iteration (32); reused as
 *             scratch in the final reduction
 *   v4        base position of the half iteration being processed
 *   v16-v23   the 32 floats of the current half iteration
 *   v24-v31   constant block-local positions 0..31 (32-bit words)
 *   r0 / r1   loop counter / byte offset into x
 *
 * v3 is written by the asm and is therefore listed in the clobbers (it was
 * missing from the list before).
 */
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
{
    BLASLONG imin;

    __asm__ volatile (
        /* Seed the running minimum with x[0..3]; positions {0,2} / {1,3}. */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = 32 (position step), v4 = 0 (running base position). */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 hold the block-local positions 0..31, four per register. */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 loop iterations, r1 = 0 byte offset. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* Prefetch ahead, then load the first 32 floats of this iteration. */
        "pfd 1, 1024(%%r1,%3) \n\t"
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        /* Tournament level 1: mask = (later >= earlier), so the earlier
           register wins ties; values and local positions move in lockstep. */
        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vfchesb %%v7,%%v21,%%v20 \n\t"
        "vfchesb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        /* Tournament level 2. */
        "vfchesb %%v20,%%v17,%%v16 \n\t"
        "vfchesb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        /* Tournament level 3: v16 = block minimum per lane, v5 = its position. */
        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* Widen the 32-bit local positions to 64 bits (v5: even lanes,
           v6: odd lanes) and add the base to make them global. */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        /* Fold into the running minimum; mask = (block >= running) keeps the
           running (earlier) candidate on ties.  The 32-bit lane mask is
           widened to 64 bits for the even (v7) and odd (v8) position pairs. */
        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        /* Advance the base position by 32 elements. */
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Second half of the iteration: same tournament on the next 32 floats. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vfchesb %%v7,%%v21,%%v20 \n\t"
        "vfchesb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"
        "vfchesb %%v20,%%v17,%%v16 \n\t"
        "vfchesb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"
        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"
        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"
        /* Step 256 bytes (64 floats) and loop while r0 is non-zero. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* Reduction step 1: merge odd lanes into even lanes.  v3 = odd-lane
           values shifted into the even positions.  The even lane wins when
           it is smaller, or equal with the smaller position. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"
        /* Reduction step 2: compare the two remaining candidates
           (word 0 vs word 2 of v0, positions in v1[0] vs v1[1]). */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Equal values: store the value and report the smaller position. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Different values: take the second candidate if it is smaller. */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imin),"=m"(*min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imin;
}
/*
 * Index of the smallest element of x (1-based).
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * multiple-of-64 part is handed to ismin_kernel_64 and the remainder is
 * scanned in scalar code; any other stride is scanned in scalar code,
 * four elements per outer step.  Only a strictly smaller value replaces the
 * current candidate, so the first occurrence wins.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;
    FLOAT best_val = 0.0;

    if (n <= 0 || inc_x <= 0)
        return 0;

    if (inc_x != 1) {
        /* Strided vector: k counts elements, off is the offset into x. */
        const BLASLONG unrolled = n & -4;
        BLASLONG k = 0;
        BLASLONG off = 0;

        best_val = x[0];

        for (; k < unrolled; k += 4, off += inc_x * 4) {
            for (BLASLONG lane = 0; lane < 4; lane++) {
                const FLOAT v = x[off + lane * inc_x];
                if (v < best_val) {
                    best = k + lane;
                    best_val = v;
                }
            }
        }

        for (; k < n; k++, off += inc_x) {
            if (x[off] < best_val) {
                best = k;
                best_val = x[off];
            }
        }

        return best + 1;
    }

    /* Contiguous vector: vector kernel on the leading block, scalar tail. */
    const BLASLONG head = n & -64;
    BLASLONG pos;

    if (head > 0) {
        best = ismin_kernel_64(head, x, &best_val);
        pos = head;
    } else {
        best_val = x[0];
        pos = 1;
    }

    for (; pos < n; pos++) {
        if (x[pos] < best_val) {
            best = pos;
            best_val = x[pos];
        }
    }

    return best + 1;
}

View File

@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#define ABS fabs #if defined(DOUBLE)
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) #define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;
__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v1,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,8 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"srlg %%r0,%2,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vleg %%v16,0(%%r1,%3),0 \n\t"
/** "vleg %%v17,8(%%r1,%3),0 \n\t"
* Find maximum index "vleg %%v16,16(%%r1,%3),1 \n\t"
* Warning: requirements n>0 and n % 16 == 0 "vleg %%v17,24(%%r1,%3),1 \n\t"
* @param n "vleg %%v18,32(%%r1,%3),0 \n\t"
* @param x pointer to the vector "vleg %%v19,40(%%r1,%3),0 \n\t"
* @param maxf (out) maximum absolute value .( only for output ) "vleg %%v18,48(%%r1,%3),1 \n\t"
* @return index "vleg %%v19,56(%%r1,%3),1 \n\t"
*/ "vleg %%v20,64(%%r1,%3),0 \n\t"
static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vleg %%v21,72(%%r1,%3),0 \n\t"
BLASLONG index; "vleg %%v20,80(%%r1,%3),1 \n\t"
__asm__( "vleg %%v21,88(%%r1,%3),1 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t" "vleg %%v22,96(%%r1,%3),0 \n\t"
"vleig %%v16,0,0 \n\t" "vleg %%v23,104(%%r1,%3),0 \n\t"
"vleig %%v16,1,1 \n\t" "vleg %%v22,112(%%r1,%3),1 \n\t"
"vleig %%v17,2,0 \n\t" "vleg %%v23,120(%%r1,%3),1 \n\t"
"vleig %%v17,3,1 \n\t" "vflpdb %%v16, %%v16 \n\t"
"vleig %%v18,4,0 \n\t" "vflpdb %%v17, %%v17 \n\t"
"vleig %%v18,5,1 \n\t" "vflpdb %%v18, %%v18 \n\t"
"vleig %%v19,6,0 \n\t" "vflpdb %%v19, %%v19 \n\t"
"vleig %%v19,7,1 \n\t" "vflpdb %%v20, %%v20 \n\t"
"vleig %%v20,8,0 \n\t" "vflpdb %%v21, %%v21 \n\t"
"vleig %%v20,9,1 \n\t" "vflpdb %%v22, %%v22 \n\t"
"vleig %%v21,10,0 \n\t" "vflpdb %%v23, %%v23 \n\t"
"vleig %%v21,11,1 \n\t" "vfadb %%v16,%%v16,%%v17 \n\t"
"vleig %%v22,12,0 \n\t" "vfadb %%v17,%%v18,%%v19 \n\t"
"vleig %%v22,13,1 \n\t" "vfadb %%v18,%%v20,%%v21 \n\t"
"vleig %%v23,14,0 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t"
"vleig %%v23,15,1 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v6 \n\t"
"vzero %%v7 \n\t"
"vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" "vfchedb %%v4,%%v16,%%v17 \n\t"
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" "vfchedb %%v5,%%v18,%%v19 \n\t"
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v24,%%v25 \n\t"
"vfadb %%v1,%%v26,%%v27 \n\t"
"vfadb %%v2,%%v28,%%v29 \n\t"
"vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 , 128(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 136(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 144(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 152(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 160(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 168(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 176(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 184(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 192(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 200(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 208(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 216(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 224(%[ptr_tmp]),0 \n\t"
"vleg %%v31 , 232(%[ptr_tmp]),0 \n\t"
"vleg %%v30 , 240(%[ptr_tmp]),1 \n\t"
"vleg %%v31 , 248(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfchdb %%v25,%%v1,%%v0 \n\t"
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
"vfchdb %%v27,%%v3,%%v2 \n\t "
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
"vfchdb %%v25,%%v26,%%v24 \n\t"
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
"vfchdb %%v27,%%v30,%%v28 \n\t"
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
"vfchdb %%v24, %%v1,%%v31 \n\t"
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
"vfchdb %%v30, %%v27,%%v3 \n\t"
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfchdb %%v0, %%v31,%%v28 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
"vag %%v25,%%v25,%%v5 \n\t"
//cmp with previous
"vfchdb %%v30, %%v27,%%v6 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
//xtract index "vfchedb %%v18,%%v16,%%v17 \n\t"
"vrepg %%v26,%%v6,1 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vrepg %%v5,%%v7,1 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"wfcdb %%v26,%%v6 \n\t" "vag %%v4,%%v4,%%v3 \n\t"
"jne 2f \n\t"
"vsteg %%v6,%[maxf],0 \n\t"
"vmnlg %%v1,%%v5,%%v7 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"j 3 \n\t"
"2: \n\t"
"wfchdb %%v16,%%v26,%%v6 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"std %%f0,%[maxf] \n\t"
"3: \n\t"
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
); "vfchedb %%v5,%%v0,%%v16 \n\t"
return index; "vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return iamax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i = 0; BLASLONG i = 0;
@ -223,10 +198,16 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if (n1 > 0) { if (n1 > 0) {
max = ziamax_kernel_16_TUNED(n1, x, &maxf); max = izamax_kernel_16(n1, x, &maxf);
ix = n1 * 2;
i = n1; i = n1;
ix = n1 << 1;
} }
else
{
maxf = CABS1(x,0);
ix += 2;
i++;
}
while(i < n) while(i < n)
{ {
@ -242,9 +223,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
} else { } else {
inc_x2 = 2 * inc_x; max = 0;
maxf = CABS1(x,0); maxf = CABS1(x,0);
inc_x2 = 2 * inc_x;
ix += inc_x2; ix += inc_x2;
i++; i++;
@ -260,7 +241,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
} }
return (max + 1); return (max + 1);
} }
} }

View File

@ -24,253 +24,223 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#define ABS fabs #if defined(DOUBLE)
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) #define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
/** {
* Find minimum index BLASLONG iamin;
* Warning: requirements n>0 and n % 16 == 0
* @param n __asm__ volatile (
* @param x pointer to the vector "vleg %%v0,0(%3),0 \n\t"
* @param minf (out) minimum absolute value .( only for output ) "vleg %%v1,8(%3),0 \n\t"
* @return minimum index "vleg %%v0,16(%3),1 \n\t"
*/ "vleg %%v1,24(%3),1 \n\t"
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "vflpdb %%v0,%%v0 \n\t"
BLASLONG index ; "vflpdb %%v1,%%v1 \n\t"
__asm__( "vfadb %%v0,%%v0,%%v1 \n\t"
"pfd 1, 0(%[ptr_x]) \n\t" "vleig %%v1,0,0 \n\t"
"vleig %%v16,0,0 \n\t" "vleig %%v1,1,1 \n\t"
"vleig %%v16,1,1 \n\t" "vrepig %%v2,8 \n\t"
"vleig %%v17,2,0 \n\t" "vzero %%v3 \n\t"
"vleig %%v17,3,1 \n\t" "vleig %%v24,0,0 \n\t"
"vleig %%v18,4,0 \n\t" "vleig %%v24,1,1 \n\t"
"vleig %%v18,5,1 \n\t" "vleig %%v25,2,0 \n\t"
"vleig %%v19,6,0 \n\t" "vleig %%v25,3,1 \n\t"
"vleig %%v19,7,1 \n\t" "vleig %%v26,4,0 \n\t"
"vleig %%v20,8,0 \n\t" "vleig %%v26,5,1 \n\t"
"vleig %%v20,9,1 \n\t" "vleig %%v27,6,0 \n\t"
"vleig %%v21,10,0 \n\t" "vleig %%v27,7,1 \n\t"
"vleig %%v21,11,1 \n\t" "srlg %%r0,%2,4 \n\t"
"vleig %%v22,12,0 \n\t" "xgr %%r1,%%r1 \n\t"
"vleig %%v22,13,1 \n\t" "0: \n\t"
"vleig %%v23,14,0 \n\t" "pfd 1, 1024(%%r1,%3) \n\t"
"vleig %%v23,15,1 \n\t"
"ld %%f6,0(%[ptr_x]) \n\t" "vleg %%v16,0(%%r1,%3),0 \n\t"
"lpdbr %%f6,%%f6 \n\t" "vleg %%v17,8(%%r1,%3),0 \n\t"
"ld %%f7,8(%[ptr_x]) \n\t" "vleg %%v16,16(%%r1,%3),1 \n\t"
"lpdbr %%f7,%%f7 \n\t" "vleg %%v17,24(%%r1,%3),1 \n\t"
"adbr %%f6,%%f7 \n\t" "vleg %%v18,32(%%r1,%3),0 \n\t"
"sllg %%r0,%[n],4 \n\t" "vleg %%v19,40(%%r1,%3),0 \n\t"
"agr %%r0,%[ptr_x] \n\t" "vleg %%v18,48(%%r1,%3),1 \n\t"
"vrepg %%v6,%%v6,0 \n\t" "vleg %%v19,56(%%r1,%3),1 \n\t"
"vzero %%v7 \n\t" "vleg %%v20,64(%%r1,%3),0 \n\t"
"vrepig %%v4,16 \n\t" "vleg %%v21,72(%%r1,%3),0 \n\t"
"vzero %%v5 \n\t" "vleg %%v20,80(%%r1,%3),1 \n\t"
".align 16 \n\t" "vleg %%v21,88(%%r1,%3),1 \n\t"
"1: \n\t" "vleg %%v22,96(%%r1,%3),0 \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t" "vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" "vfchedb %%v4,%%v17,%%v16 \n\t"
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" "vfchedb %%v5,%%v19,%%v18 \n\t"
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v24,%%v25 \n\t"
"vfadb %%v1,%%v26,%%v27 \n\t"
"vfadb %%v2,%%v28,%%v29 \n\t"
"vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 ,128(%[ptr_tmp]),0 \n\t"
"vleg %%v25 ,136(%[ptr_tmp]),0 \n\t"
"vleg %%v24 ,144(%[ptr_tmp]),1 \n\t"
"vleg %%v25 ,152(%[ptr_tmp]),1 \n\t"
"vleg %%v26 ,160(%[ptr_tmp]),0 \n\t"
"vleg %%v27 ,168(%[ptr_tmp]),0 \n\t"
"vleg %%v26 ,176(%[ptr_tmp]),1 \n\t"
"vleg %%v27 ,184(%[ptr_tmp]),1 \n\t"
"vleg %%v28 ,192(%[ptr_tmp]),0 \n\t"
"vleg %%v29 ,200(%[ptr_tmp]),0 \n\t"
"vleg %%v28 ,208(%[ptr_tmp]),1 \n\t"
"vleg %%v29 ,216(%[ptr_tmp]),1 \n\t"
"vleg %%v30 ,224(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,232(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,240(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,248(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfchdb %%v25,%%v0 ,%%v1 \n\t"
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
"vfchdb %%v27,%%v2,%%v3 \n\t"
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
"vfchdb %%v25,%%v24,%%v26 \n\t"
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
"vfchdb %%v27,%%v28,%%v30 \n\t"
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
"vfchdb %%v24,%%v31, %%v1 \n\t"
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
"vfchdb %%v30,%%v3, %%v27 \n\t"
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfchdb %%v0,%%v28, %%v31 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
"vag %%v25,%%v25,%%v5 \n\t"
//cmp with previous
"vfchdb %%v30,%%v6 , %%v27 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
//xtract index "vfchedb %%v18,%%v17,%%v16 \n\t"
"vrepg %%v26,%%v6,1 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vrepg %%v5,%%v7,1 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"wfcdb %%v26,%%v6 \n\t" "vag %%v4,%%v4,%%v3 \n\t"
"jne 2f \n\t"
"vsteg %%v6,%[minf],0 \n\t"
"vmnlg %%v1,%%v5,%%v7 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"std %%f0,%[minf] \n\t"
"3: \n\t"
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) "vfchedb %%v5,%%v16,%%v0 \n\t"
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) "vsel %%v0,%%v0,%%v16,%%v5 \n\t"
: "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", "vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "vag %%v3,%%v3,%%v2 \n\t"
); "vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
return index; "vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return iamin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i = 0;
BLASLONG ix=0; BLASLONG ix = 0;
FLOAT minf; FLOAT minf = 0;
BLASLONG min=0; BLASLONG min = 0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(min); if (n <= 0 || inc_x <= 0) return(min);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if (n1 > 0) { if (n1 > 0) {
min = ziamin_kernel_16_TUNED(n1, x, &minf); min = izamin_kernel_16(n1, x, &minf);
ix = n1 * 2;
i = n1; i = n1;
ix = n1 << 1; }
} else
else {
//assign minf
minf = CABS1(x,0);
ix += 2;
i++;
}
while(i < n)
{ {
if( CABS1(x,ix) < minf ) minf = CABS1(x,0);
{
min = i;
minf = CABS1(x,ix);
}
ix += 2; ix += 2;
i++; i++;
} }
while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (min + 1); return (min + 1);
} else { } else {
inc_x2 = 2 * inc_x; min = 0;
minf = CABS1(x,0);
inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
minf = CABS1(x,0); while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2; ix += inc_x2;
i++; i++;
}
while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (min + 1); return (min + 1);
} }
} }

169
kernel/zarch/samax.c Normal file
View File

@ -0,0 +1,169 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vectorised maximum-magnitude kernel (single precision, z/Architecture
 * vector instructions).
 *
 * n  element count.  The loop count is n >> 6 and brctg decrements before
 *    testing, so n is expected to be a positive multiple of 64.
 * x  contiguous input vector.
 *
 * Returns the reduced value after "lper" has forced its sign positive.
 * NOTE(review): vfmaxsb/wfmaxsb are used with mode 8, which presumably
 * selects by magnitude -- confirm against the Principles of Operation.
 */
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT amax;
    __asm__ volatile (
        /* seed the running result with the first four elements */
        "vl %%v0,0(%2) \n\t"
        /* r0 = number of 64-element iterations, r1 = byte offset */
        "srlg %%r0,%1,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"
        /* load 16 vectors = 256 bytes */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"
        /* pairwise tree reduction: 16 -> 8 -> 4 -> 2 -> 1 vectors */
        "vfmaxsb %%v16,%%v16,%%v24,8 \n\t"
        "vfmaxsb %%v17,%%v17,%%v25,8 \n\t"
        "vfmaxsb %%v18,%%v18,%%v26,8 \n\t"
        "vfmaxsb %%v19,%%v19,%%v27,8 \n\t"
        "vfmaxsb %%v20,%%v20,%%v28,8 \n\t"
        "vfmaxsb %%v21,%%v21,%%v29,8 \n\t"
        "vfmaxsb %%v22,%%v22,%%v30,8 \n\t"
        "vfmaxsb %%v23,%%v23,%%v31,8 \n\t"
        "vfmaxsb %%v16,%%v16,%%v20,8 \n\t"
        "vfmaxsb %%v17,%%v17,%%v21,8 \n\t"
        "vfmaxsb %%v18,%%v18,%%v22,8 \n\t"
        "vfmaxsb %%v19,%%v19,%%v23,8 \n\t"
        "vfmaxsb %%v16,%%v16,%%v18,8 \n\t"
        "vfmaxsb %%v17,%%v17,%%v19,8 \n\t"
        "vfmaxsb %%v16,%%v16,%%v17,8 \n\t"
        /* fold this block into the running result; the operand must be
           spelled %%v16 like every other vector register reference */
        "vfmaxsb %%v0,%%v0,%%v16,8 \n\t"
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* horizontal reduction of the four lanes of v0 into lane 0 */
        "veslg %%v16,%%v0,32 \n\t"
        "vfmaxsb %%v0,%%v0,%%v16,8 \n\t"
        "vrepf %%v16,%%v0,2 \n\t"
        "wfmaxsb %%v0,%%v0,%%v16,8 \n\t"
        /* copy lane 0 to the output with its sign forced positive */
        "lper %0,%%f0 "
        :"=f"(amax)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
    return amax;
}
/*
 * Maximum-absolute-value driver.
 *
 * Returns the largest ABS(x[k]) over the n elements of x taken with stride
 * inc_x, or 0.0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-64 part is reduced by
 * samax_kernel_64; the remaining tail is scanned here.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT mag;
    BLASLONG k;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        BLASLONG blocked = n & -64;

        if (blocked > 0) {
            best = samax_kernel_64(blocked, x);
            k = blocked;
        } else {
            /* fewer than 64 elements: seed the scan with element 0 */
            best = ABS(x[0]);
            k = 1;
        }

        for (; k < n; k++) {
            mag = ABS(x[k]);
            if (mag > best)
                best = mag;
        }
        return best;
    }

    /* Strided access: four elements per pass, then the leftovers. */
    BLASLONG quads = n & -4;
    BLASLONG off = 0;

    best = ABS(x[0]);
    for (k = 0; k < quads; k += 4, off += inc_x * 4) {
        mag = ABS(x[off]);
        if (mag > best)
            best = mag;
        mag = ABS(x[off + inc_x]);
        if (mag > best)
            best = mag;
        mag = ABS(x[off + 2 * inc_x]);
        if (mag > best)
            best = mag;
        mag = ABS(x[off + 3 * inc_x]);
        if (mag > best)
            best = mag;
    }
    for (; k < n; k++, off += inc_x) {
        mag = ABS(x[off]);
        if (mag > best)
            best = mag;
    }
    return best;
}

169
kernel/zarch/samin.c Normal file
View File

@ -0,0 +1,169 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vectorised minimum-magnitude kernel (single precision, z/Architecture
 * vector instructions).
 *
 * n  element count.  The loop count is n >> 6 and brctg decrements before
 *    testing, so n is expected to be a positive multiple of 64.
 * x  contiguous input vector.
 *
 * Returns the reduced value after "lper" has forced its sign positive.
 * NOTE(review): vfminsb/wfminsb are used with mode 8, which presumably
 * selects by magnitude -- confirm against the Principles of Operation.
 */
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT amin;
    __asm__ volatile (
        /* seed the running result with the first four elements */
        "vl %%v0,0(%2) \n\t"
        /* r0 = number of 64-element iterations, r1 = byte offset */
        "srlg %%r0,%1,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"
        /* load 16 vectors = 256 bytes */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"
        /* pairwise tree reduction: 16 -> 8 -> 4 -> 2 -> 1 vectors */
        "vfminsb %%v16,%%v16,%%v24,8 \n\t"
        "vfminsb %%v17,%%v17,%%v25,8 \n\t"
        "vfminsb %%v18,%%v18,%%v26,8 \n\t"
        "vfminsb %%v19,%%v19,%%v27,8 \n\t"
        "vfminsb %%v20,%%v20,%%v28,8 \n\t"
        "vfminsb %%v21,%%v21,%%v29,8 \n\t"
        "vfminsb %%v22,%%v22,%%v30,8 \n\t"
        "vfminsb %%v23,%%v23,%%v31,8 \n\t"
        "vfminsb %%v16,%%v16,%%v20,8 \n\t"
        "vfminsb %%v17,%%v17,%%v21,8 \n\t"
        "vfminsb %%v18,%%v18,%%v22,8 \n\t"
        "vfminsb %%v19,%%v19,%%v23,8 \n\t"
        "vfminsb %%v16,%%v16,%%v18,8 \n\t"
        "vfminsb %%v17,%%v17,%%v19,8 \n\t"
        "vfminsb %%v16,%%v16,%%v17,8 \n\t"
        /* fold this block into the running result; the operand must be
           spelled %%v16 like every other vector register reference */
        "vfminsb %%v0,%%v0,%%v16,8 \n\t"
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /* horizontal reduction of the four lanes of v0 into lane 0 */
        "veslg %%v16,%%v0,32 \n\t"
        "vfminsb %%v0,%%v0,%%v16,8 \n\t"
        "vrepf %%v16,%%v0,2 \n\t"
        "wfminsb %%v0,%%v0,%%v16,8 \n\t"
        /* copy lane 0 to the output with its sign forced positive */
        "lper %0,%%f0 "
        :"=f"(amin)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
    return amin;
}
/*
 * Minimum-absolute-value driver.
 *
 * Returns the smallest ABS(x[k]) over the n elements of x taken with stride
 * inc_x, or 0.0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-64 part is reduced by
 * samin_kernel_64; the remaining tail is scanned here.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT mag;
    BLASLONG k;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        BLASLONG blocked = n & -64;

        if (blocked > 0) {
            best = samin_kernel_64(blocked, x);
            k = blocked;
        } else {
            /* fewer than 64 elements: seed the scan with element 0 */
            best = ABS(x[0]);
            k = 1;
        }

        for (; k < n; k++) {
            mag = ABS(x[k]);
            if (mag < best)
                best = mag;
        }
        return best;
    }

    /* Strided access: four elements per pass, then the leftovers. */
    BLASLONG quads = n & -4;
    BLASLONG off = 0;

    best = ABS(x[0]);
    for (k = 0; k < quads; k += 4, off += inc_x * 4) {
        mag = ABS(x[off]);
        if (mag < best)
            best = mag;
        mag = ABS(x[off + inc_x]);
        if (mag < best)
            best = mag;
        mag = ABS(x[off + 2 * inc_x]);
        if (mag < best)
            best = mag;
        mag = ABS(x[off + 3 * inc_x]);
        if (mag < best)
            best = mag;
    }
    for (; k < n; k++, off += inc_x) {
        mag = ABS(x[off]);
        if (mag < best)
            best = mag;
    }
    return best;
}

174
kernel/zarch/sasum.c Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Sums the absolute values of the first n elements of x using
 * z/Architecture vector instructions and returns the total.
 *
 * The pass count is n >> 6 and every pass consumes 256 bytes, so n is
 * expected to be a positive multiple of 64: the loop body executes once
 * before the count is tested, and any n & 63 remainder is never read.
 *
 * Asm operands: %0 = asum (result), %1 = n, %2 = x.
 * Register use: r0 = remaining pass count, r1 = byte offset into x,
 * v0-v3 = four independent partial-sum vectors, v16-v23 = loaded data.
 */
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
/* clear the four accumulators */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
/* r0 = n / 64 passes, r1 = 0 byte offset */
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (for reading) 1024 bytes ahead of the current position */
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half of the pass: bytes 0..127 */
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
/* take the absolute value of every lane */
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
/* spread the eight vectors over the four accumulators */
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* second half of the pass: bytes 128..255, same sequence */
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* advance 256 bytes and loop while the pass count is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
/* combine the four accumulators into v0 */
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
/* horizontal sum: shift each 64-bit element left by 32 bits so words 1
   and 3 line up with words 0 and 2, then add */
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
/* broadcast word 2 and add it to word 0 through the scalar FP view */
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
/* hand the scalar result to the output operand */
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
}
/*
 * Returns the sum of |x[k * inc_x]| for k = 0 .. n-1.
 *
 * A non-positive n or inc_x yields 0. For unit stride the largest multiple
 * of 64 elements is handed to sasum_kernel_64 and the rest is summed in a
 * scalar loop. For any other stride the elements are summed four at a time
 * into two alternating accumulators, followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG pos = 0;   /* index into x */
    BLASLONG seen = 0;  /* elements consumed on the strided path */

    if (n <= 0 || inc_x <= 0)
        return total;

    if (inc_x == 1) {
        BLASLONG bulk = n & -64;

        /* vector kernel covers the leading multiple of 64 elements */
        if (bulk > 0) {
            total = sasum_kernel_64(bulk, x);
            pos = bulk;
        }

        /* scalar remainder */
        for (; pos < n; pos++)
            total += ABS(x[pos]);

        return total;
    }

    {
        BLASLONG quads = n & -4;
        FLOAT acc_a = 0.0;
        FLOAT acc_b = 0.0;

        /* four elements per pass, alternating between two accumulators */
        for (; seen < quads; seen += 4) {
            acc_a += ABS(x[pos]);
            acc_b += ABS(x[pos + inc_x]);
            acc_a += ABS(x[pos + 2 * inc_x]);
            acc_b += ABS(x[pos + 3 * inc_x]);
            pos += inc_x * 4;
        }
        total = acc_a + acc_b;

        /* up to three leftover elements */
        for (; seen < n; seen++) {
            total += ABS(x[pos]);
            pos += inc_x;
        }
    }

    return total;
}

184
kernel/zarch/saxpy.c Normal file
View File

@ -0,0 +1,184 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Computes y[i] += (*alpha) * x[i] for the first n elements using
 * z/Architecture vector instructions.
 *
 * The pass count is n >> 6 and every pass handles 256 bytes of x and of y,
 * so n is expected to be a positive multiple of 64: the loop body executes
 * once before the count is tested, and any n & 63 remainder is untouched.
 *
 * Asm operands: %0 = n, %1 = x, %2 = y, %3 = *alpha.
 * Register use: r0 = remaining pass count, r1 = byte offset shared by x
 * and y, v0 = alpha replicated into every lane.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
/* v0 = alpha in all lanes; r0 = n / 64 passes; r1 = 0 byte offset */
"vlrepf %%v0,%3 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch x for reading and y for writing, 1024 bytes ahead */
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* bytes 0..63: load x into v16-v19 and y into v20-v23 */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
/* v16..v19 = alpha * x + y */
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"
/* bytes 64..127: load x into v24-v27 and y into v28-v31 */
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
/* v20..v23 = alpha * x + y (v20-v23 are reused as result registers) */
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
/* write the first 128 result bytes back to y */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
/* bytes 128..255: the same sequence repeated */
"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"
"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
/* advance 256 bytes and loop while the pass count is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Performs y[k * inc_y] += da * x[k * inc_x] for k = 0 .. n-1 and returns 0.
 *
 * With unit strides the leading multiple of 64 elements goes through
 * saxpy_kernel_64 and the rest through a scalar loop. Otherwise the update
 * is done four elements per pass (all four products are formed before any
 * element of y is written), followed by a scalar tail.
 * dummy0, dummy1, dummy and dummy2 are not used.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG done;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;
    BLASLONG quads;

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        /* contiguous case: vector kernel first, then the leftovers */
        done = n & -64;
        if (done != 0)
            saxpy_kernel_64(done, x, y, &da);

        for (; done < n; done++) {
            y[done] += da * x[done];
        }
        return 0;
    }

    /* strided case, unrolled by four */
    quads = n & -4;
    for (done = 0; done < quads; done += 4) {
        FLOAT p0 = da * x[xpos];
        FLOAT p1 = da * x[xpos + inc_x];
        FLOAT p2 = da * x[xpos + 2 * inc_x];
        FLOAT p3 = da * x[xpos + 3 * inc_x];

        y[ypos] += p0;
        y[ypos + inc_y] += p1;
        y[ypos + 2 * inc_y] += p2;
        y[ypos + 3 * inc_y] += p3;

        xpos += inc_x * 4;
        ypos += inc_y * 4;
    }

    /* up to three leftover elements */
    for (; done < n; done++) {
        y[ypos] += da * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }

    return 0;
}

85
kernel/zarch/scopy.c Normal file
View File

@ -0,0 +1,85 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Copies the first n elements of x to y in 256-byte chunks using MVC.
 *
 * The pass count is n >> 6 and every pass moves 256 bytes, so n is
 * expected to be a positive multiple of 64: the loop body executes once
 * before the count is tested, and any n & 63 remainder is not copied.
 *
 * Asm operands: %0 = n, %1 = x, %2 = y.
 * Register use: r0 = remaining pass count, r1 = current source address,
 * r2 = current destination address.
 */
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* working copies of the source and destination addresses */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
/* r0 = n / 64 passes */
"srlg %%r0,%0,6 \n\t"
"0: \n\t"
/* prefetch source for reading and destination for writing */
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* move 256 bytes from (r1) to (r2) */
"mvc 0(256,%%r2),0(%%r1) \n\t"
/* advance both addresses by 256 bytes and loop */
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
}
/*
 * Copies n elements: y[k * inc_y] = x[k * inc_x] for k = 0 .. n-1.
 * Always returns 0; does nothing when n <= 0.
 *
 * With unit strides the leading multiple of 64 elements is copied by
 * scopy_kernel_64 and the remainder element by element. Any other stride
 * combination uses a plain scalar loop.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG k = 0;

    if (n <= 0)
        return 0;

    if (inc_x != 1 || inc_y != 1) {
        /* strided copy */
        BLASLONG src = 0;
        BLASLONG dst = 0;

        for (; k < n; k++) {
            y[dst] = x[src];
            src += inc_x;
            dst += inc_y;
        }
        return 0;
    }

    {
        /* contiguous copy: block kernel first, then the leftovers */
        BLASLONG bulk = n & -64;

        if (bulk > 0) {
            scopy_kernel_64(bulk, x, y);
            k = bulk;
        }
    }

    for (; k < n; k++)
        y[k] = x[k];

    return 0;
}

140
kernel/zarch/sdot.c Normal file
View File

@ -0,0 +1,140 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Returns the dot product of the first n elements of x and y, computed
 * with z/Architecture vector fused multiply-add instructions.
 *
 * The pass count is n >> 5 and every pass consumes 128 bytes of each
 * input, so n is expected to be a positive multiple of 32: the loop body
 * executes once before the count is tested, and any n & 31 remainder is
 * never read.
 *
 * Asm operands: %0 = dot (result), %1 = n, %2 = x, %3 = y.
 * Register use: r0 = remaining pass count, r1 = byte offset shared by x
 * and y, v0 = the single vector accumulator.
 */
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
FLOAT dot;
__asm__ volatile (
/* v0 = 0 accumulator; r0 = n / 32 passes; r1 = 0 byte offset */
"vzero %%v0 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch both inputs for reading, 1024 bytes ahead */
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
/* load 128 bytes of x into v16-v23 */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* load y one vector at a time and accumulate v0 += x * y */
"vl %%v24,0(%%r1,%3) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
/* advance 128 bytes and loop while the pass count is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* horizontal sum: broadcast words 1, 2 and 3 of v0 and add each to
   word 0 through the scalar FP view of the registers */
"vrepf %%v1,%%v0,1 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepf %%v3,%%v0,3 \n\t"
"aebr %%f0,%%f1 \n\t"
"aebr %%f0,%%f2 \n\t"
"aebr %%f0,%%f3 \n\t"
/* hand the scalar result to the output operand */
"ler %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return dot;
}
/*
 * Returns the sum over k = 0 .. n-1 of y[k * inc_y] * x[k * inc_x],
 * or 0 when n <= 0.
 *
 * With unit strides the leading multiple of 32 elements is handled by
 * sdot_kernel_32 and the rest by a scalar loop. Otherwise the products are
 * accumulated two per pass, followed by at most one leftover element.
 */
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    FLOAT acc = 0.0;
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;
    BLASLONG pairs;

    if (n <= 0)
        return acc;

    if (inc_x == 1 && inc_y == 1) {
        /* contiguous case: vector kernel first, then the leftovers */
        k = n & -32;
        if (k != 0)
            acc = sdot_kernel_32(k, x, y);

        for (; k < n; k++) {
            acc += y[k] * x[k];
        }
        return acc;
    }

    /* strided case, two products per pass */
    pairs = n & -2;
    for (k = 0; k < pairs; k += 2) {
        acc += y[ypos] * x[xpos] + y[ypos + inc_y] * x[xpos + inc_x];
        xpos += inc_x * 2;
        ypos += inc_y * 2;
    }

    /* at most one leftover element */
    for (; k < n; k++) {
        acc += y[ypos] * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }

    return acc;
}

668
kernel/zarch/sgemv_n_4.c Normal file
View File

@ -0,0 +1,668 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
/*
 * Four-column update for the non-transposed GEMV:
 *   y[i] += (*alpha) * (xo[0]*ap[0][i] + xo[1]*ap[1][i]
 *                     + xo[2]*ap[2][i] + xo[3]*ap[3][i])
 * for the first n elements of y.
 *
 * Elements are processed 32 at a time ((n & -32) >> 5 passes of 128 bytes)
 * and then 4 at a time ((n & 28) >> 2 passes of 16 bytes); any n & 3
 * remainder is not touched, so callers are expected to pass a multiple
 * of 4.
 *
 * Asm operands: %0 = n, %1..%4 = ap[0]..ap[3], %5 = xo, %6 = y,
 * %7 = *alpha.
 * Register use: r0 = remaining pass count, r1 = byte offset shared by the
 * columns and y, v0-v3 = alpha*xo[0..3] replicated into every lane,
 * v4 = alpha during setup and the y working vector afterwards.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* broadcast xo[0..3] and alpha, then pre-scale: v0..v3 = xo[k] * alpha */
"vlrepf %%v0,0(%5) \n\t"
"vlrepf %%v1,4(%5) \n\t"
"vlrepf %%v2,8(%5) \n\t"
"vlrepf %%v3,12(%5) \n\t"
"vlrepf %%v4,%7 \n\t"
"vfmsb %%v0,%%v0,%%v4 \n\t"
"vfmsb %%v1,%%v1,%%v4 \n\t"
"vfmsb %%v2,%%v2,%%v4 \n\t"
"vfmsb %%v3,%%v3,%%v4 \n\t"
/* r1 = 0 byte offset; r0 = n & -32, skip the main loop when that is 0 */
"xgr %%r1,%%r1 \n\t"
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
/* main loop: 32 elements per pass, done as two 64-byte halves */
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
/* first half: four vectors from each of the four columns */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
/* for each y vector: load, add the four scaled columns, store */
"vl %%v4,0(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"vl %%v4,16(%%r1,%6) \n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"vl %%v4,32(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,32(%%r1,%6) \n\t"
"vl %%v4,48(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,48(%%r1,%6) \n\t"
/* second half: bytes 64..127, same sequence */
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,64(%%r1,%2) \n\t"
"vl %%v18,64(%%r1,%3) \n\t"
"vl %%v19,64(%%r1,%4) \n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vl %%v4,64(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,64(%%r1,%6) \n\t"
"vl %%v4,80(%%r1,%6) \n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,80(%%r1,%6) \n\t"
"vl %%v4,96(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"
"vl %%v4,112(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* remainder loop: r0 = (n & 28) / 4 passes of one vector each */
"1: \n\t"
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Two-column update for the non-transposed GEMV:
 *   y[i] += (*alpha) * (xo[0]*ap[0][i] + xo[1]*ap[1][i])
 * for the first n elements of y.
 *
 * Elements are processed 32 at a time ((n & -32) >> 5 passes of 128 bytes)
 * and then 4 at a time ((n & 28) >> 2 passes of 16 bytes); any n & 3
 * remainder is not touched, so callers are expected to pass a multiple
 * of 4.
 *
 * Asm operands: %0 = n, %1 = ap[0], %2 = ap[1], %3 = xo, %4 = y,
 * %5 = *alpha.
 * Register use: r0 = remaining pass count, r1 = byte offset shared by the
 * columns and y, v0/v1 = alpha*xo[0..1] replicated into every lane,
 * v2 = alpha during setup and the y working vector afterwards.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* broadcast xo[0..1] and alpha, then pre-scale: v0/v1 = xo[k] * alpha */
"vlrepf %%v0,0(%3) \n\t"
"vlrepf %%v1,4(%3) \n\t"
"vlrepf %%v2,%5 \n\t"
"vfmsb %%v0,%%v0,%%v2 \n\t"
"vfmsb %%v1,%%v1,%%v2 \n\t"
/* r1 = 0 byte offset; r0 = n & -32, skip the main loop when that is 0 */
"xgr %%r1,%%r1 \n\t"
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
/* main loop: 32 elements per pass */
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"
/* load 128 bytes from each column, interleaved (even regs = column 0,
   odd regs = column 1) */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v20,32(%%r1,%1) \n\t"
"vl %%v21,32(%%r1,%2) \n\t"
"vl %%v22,48(%%r1,%1) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
/* for each y vector: load, add the two scaled columns, store */
"vl %%v2,0(%%r1,%4) \n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmasb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"vl %%v2,32(%%r1,%4) \n\t"
"vfmasb %%v2,%%v20,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v21,%%v1,%%v2 \n\t"
"vst %%v2,32(%%r1,%4) \n\t"
"vl %%v2,48(%%r1,%4) \n\t"
"vfmasb %%v2,%%v22,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v23,%%v1,%%v2 \n\t"
"vst %%v2,48(%%r1,%4) \n\t"
"vl %%v2,64(%%r1,%4) \n\t"
"vfmasb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"
"vl %%v2,80(%%r1,%4) \n\t"
"vfmasb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"
"vl %%v2,96(%%r1,%4) \n\t"
"vfmasb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"
"vl %%v2,112(%%r1,%4) \n\t"
"vfmasb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* remainder loop: r0 = (n & 28) / 4 passes of one vector each */
"1: \n\t"
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Single-column update for the non-transposed GEMV:
 *   y[i] += (*alpha) * xo[0] * a0[i]
 * for the first n elements of y.
 *
 * Elements are processed 32 at a time ((n & -32) >> 5 passes of 128 bytes)
 * and then 4 at a time ((n & 28) >> 2 passes of 16 bytes); any n & 3
 * remainder is not touched, so callers are expected to pass a multiple
 * of 4.
 *
 * Asm operands: %0 = n, %1 = a0, %2 = xo, %3 = y, %4 = *alpha.
 * Register use: r0 = remaining pass count, r1 = byte offset shared by a0
 * and y, v0 = alpha*xo[0] replicated into every lane, v1 = alpha during
 * setup and the y working vector afterwards.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* broadcast xo[0] and alpha, then pre-scale: v0 = xo[0] * alpha */
"vlrepf %%v0,0(%2) \n\t"
"vlrepf %%v1,%4 \n\t"
"vfmsb %%v0,%%v0,%%v1 \n\t"
/* r1 = 0 byte offset; r0 = n & -32, skip the main loop when that is 0 */
"xgr %%r1,%%r1 \n\t"
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
/* main loop: 32 elements per pass */
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* load 128 bytes of the column */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%1) \n\t"
"vl %%v22,96(%%r1,%1) \n\t"
"vl %%v23,112(%%r1,%1) \n\t"
/* for each y vector: load, add the scaled column, store */
"vl %%v1,0(%%r1,%3) \n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmasb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"vl %%v1,32(%%r1,%3) \n\t"
"vfmasb %%v1,%%v18,%%v0,%%v1 \n\t"
"vst %%v1,32(%%r1,%3) \n\t"
"vl %%v1,48(%%r1,%3) \n\t"
"vfmasb %%v1,%%v19,%%v0,%%v1 \n\t"
"vst %%v1,48(%%r1,%3) \n\t"
"vl %%v1,64(%%r1,%3) \n\t"
"vfmasb %%v1,%%v20,%%v0,%%v1 \n\t"
"vst %%v1,64(%%r1,%3) \n\t"
"vl %%v1,80(%%r1,%3) \n\t"
"vfmasb %%v1,%%v21,%%v0,%%v1 \n\t"
"vst %%v1,80(%%r1,%3) \n\t"
"vl %%v1,96(%%r1,%3) \n\t"
"vfmasb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"
"vl %%v1,112(%%r1,%3) \n\t"
"vfmasb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* remainder loop: r0 = (n & 28) / 4 passes of one vector each */
"1: \n\t"
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
/* NOTE(review): v24-v31 are listed as clobbered although the code above
   never writes them; harmless, but could be trimmed. */
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Adds the n contiguous values src[0..n-1] onto a strided destination:
 * the k-th value is added to the element inc_dest * k positions past dest.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k = 0;

    while (k < n) {
        dest[0] += src[k];
        dest += inc_dest;
        k++;
    }
}
/*
 * Non-transposed matrix-vector update: y += alpha * A * x, where A has m
 * rows and n columns and column j starts at a + j * lda.
 *
 * m, n    : matrix dimensions; the function returns immediately if either
 *           is less than 1.
 * dummy1  : not used.
 * alpha   : scalar applied to A * x.
 * a, lda  : matrix data and distance (in elements) between columns.
 * x, inc_x: input vector and its stride.
 * y, inc_y: vector updated in place and its stride.
 * buffer  : scratch space used to accumulate a row block when inc_y != 1;
 *           it must hold at least NBMAX elements in that case.
 *
 * Rows are handled in blocks of up to NBMAX through the 4x4 / 4x2 / 4x1
 * kernels (which work on multiples of four rows); the final m & 3 rows are
 * computed by the scalar code at the bottom. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
/* n1 = number of full groups of four columns, n2 = leftover columns */
n1 = n >> 2 ;
n2 = n & 3 ;
/* m3 = leftover rows for the scalar tail, m1 = rows for the kernels,
   m2 = size of the final partial row block (a multiple of 4) */
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
/* One pass per row block: full NBMAX blocks first, then at most one
   partial block of m2 rows, which also ends the loop (NB != NBMAX). */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
/* restart at the first column of the current row block */
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
/* Strided y: accumulate into a zeroed scratch block and scatter it
   afterwards. NB*4 is a byte count, i.e. 4 bytes per element
   (assumes sizeof(FLOAT) == 4 - TODO confirm). Contiguous y is updated
   in place. */
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
/* contiguous x: the kernels read x directly */
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
/* two leftover columns: ap[0]/ap[1] already point at them */
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
/* one leftover column */
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
/* a_ptr += lda;
x_ptr += 1; */
}
}
else
{
/* strided x: gather four values into xbuffer for each kernel call */
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
/* leftover columns are handled one at a time on this path */
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
/* move down to the next row block */
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
/* Scalar tail: a and y_ptr now point at the first of the m3 remaining
   rows. Each case forms the row dot products and adds alpha * sum to y. */
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
/* lda == 3 means the three-row columns are packed back to back, so four
   columns can be consumed per pass with fixed offsets */
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
/* general strides: one column per pass */
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
/* lda == 2: two-row columns packed back to back, four columns per pass */
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
/* general strides: one column per pass */
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
/* lda == 1: the single row is contiguous, so this is a plain dot
   product unrolled by four */
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
/* general strides: one column per pass */
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}

827
kernel/zarch/sgemv_t_4.c Normal file
View File

@ -0,0 +1,827 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
/*
 * Four-column dot-product kernel for the transposed GEMV: stores
 *   y[k] = sum over i of ap[k][i] * x[i],   k = 0 .. 3
 * over the first n elements. y[0..3] are overwritten, not accumulated.
 *
 * Elements are processed 32 at a time ((n & -32) >> 5 passes of 128 bytes)
 * and then 4 at a time ((n & 28) >> 2 passes of 16 bytes); any n & 3
 * remainder is not read, so callers are expected to pass a multiple of 4.
 *
 * Asm operands: %0 = n, %1..%4 = ap[0]..ap[3], %5 = x, %6 = y.
 * Register use: r0 = remaining pass count, r1 = byte offset shared by the
 * columns and x, v0-v3 = one vector accumulator per column, v16-v23 = x
 * data, v24-v31 = column data, v4 = scratch for the final reduction.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* clear the four per-column accumulators */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
/* r1 = 0 byte offset; r0 = n & -32, skip the main loop when that is 0 */
"xgr %%r1,%%r1 \n\t"
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
"srlg %%r0,%%r0,5 \n\t"
/* main loop: 32 elements per pass */
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%5) \n\t"
/* load 128 bytes of x */
"vl %%v16,0(%%r1,%5) \n\t"
"vl %%v17,16(%%r1,%5) \n\t"
"vl %%v18,32(%%r1,%5) \n\t"
"vl %%v19,48(%%r1,%5) \n\t"
"vl %%v20,64(%%r1,%5) \n\t"
"vl %%v21,80(%%r1,%5) \n\t"
"vl %%v22,96(%%r1,%5) \n\t"
"vl %%v23,112(%%r1,%5) \n\t"
/* for each x vector, load the matching vector of every column and
   accumulate acc[k] += x * column[k] */
"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,0(%%r1,%3) \n\t"
"vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
"vl %%v27,0(%%r1,%4) \n\t"
"vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"
"vl %%v28,16(%%r1,%1) \n\t"
"vfmasb %%v0,%%v17,%%v28,%%v0 \n\t"
"vl %%v29,16(%%r1,%2) \n\t"
"vfmasb %%v1,%%v17,%%v29,%%v1 \n\t"
"vl %%v30,16(%%r1,%3) \n\t"
"vfmasb %%v2,%%v17,%%v30,%%v2 \n\t"
"vl %%v31,16(%%r1,%4) \n\t"
"vfmasb %%v3,%%v17,%%v31,%%v3 \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vfmasb %%v0,%%v18,%%v24,%%v0 \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vfmasb %%v1,%%v18,%%v25,%%v1 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2 \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vfmasb %%v3,%%v18,%%v27,%%v3 \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vfmasb %%v0,%%v19,%%v28,%%v0 \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vfmasb %%v1,%%v19,%%v29,%%v1 \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vfmasb %%v2,%%v19,%%v30,%%v2 \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
"vfmasb %%v3,%%v19,%%v31,%%v3 \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"
"vl %%v26,64(%%r1,%3) \n\t"
"vfmasb %%v2,%%v20,%%v26,%%v2 \n\t"
"vl %%v27,64(%%r1,%4) \n\t"
"vfmasb %%v3,%%v20,%%v27,%%v3 \n\t"
"vl %%v28,80(%%r1,%1) \n\t"
"vfmasb %%v0,%%v21,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vfmasb %%v1,%%v21,%%v29,%%v1 \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vfmasb %%v2,%%v21,%%v30,%%v2 \n\t"
"vl %%v31,80(%%r1,%4) \n\t"
"vfmasb %%v3,%%v21,%%v31,%%v3 \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vfmasb %%v0,%%v22,%%v24,%%v0 \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vfmasb %%v1,%%v22,%%v25,%%v1 \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vfmasb %%v2,%%v22,%%v26,%%v2 \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vfmasb %%v3,%%v22,%%v27,%%v3 \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vfmasb %%v0,%%v23,%%v28,%%v0 \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vfmasb %%v1,%%v23,%%v29,%%v1 \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vfmasb %%v2,%%v23,%%v30,%%v2 \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vfmasb %%v3,%%v23,%%v31,%%v3 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* remainder loop: r0 = (n & 28) / 4 passes of one vector each */
"1: \n\t"
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%5) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,0(%%r1,%3) \n\t"
"vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
"vl %%v27,0(%%r1,%4) \n\t"
"vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
/* reduction: for each accumulator, broadcast words 1, 2 and 3 in turn
   into v4 and add them to word 0 through the scalar FP view, then store
   the scalar to y[k] */
"3: \n\t"
"vrepf %%v4,%%v0,1 \n\t"
"aebr %%f0,%%f4 \n\t"
"vrepf %%v4,%%v0,2 \n\t"
"aebr %%f0,%%f4 \n\t"
"vrepf %%v4,%%v0,3 \n\t"
"aebr %%f0,%%f4 \n\t"
"ste %%f0,0(%6) \n\t"
"vrepf %%v4,%%v1,1 \n\t"
"aebr %%f1,%%f4 \n\t"
"vrepf %%v4,%%v1,2 \n\t"
"aebr %%f1,%%f4 \n\t"
"vrepf %%v4,%%v1,3 \n\t"
"aebr %%f1,%%f4 \n\t"
"ste %%f1,4(%6) \n\t"
"vrepf %%v4,%%v2,1 \n\t"
"aebr %%f2,%%f4 \n\t"
"vrepf %%v4,%%v2,2 \n\t"
"aebr %%f2,%%f4 \n\t"
"vrepf %%v4,%%v2,3 \n\t"
"aebr %%f2,%%f4 \n\t"
"ste %%f2,8(%6) \n\t"
"vrepf %%v4,%%v3,1 \n\t"
"aebr %%f3,%%f4 \n\t"
"vrepf %%v4,%%v3,2 \n\t"
"aebr %%f3,%%f4 \n\t"
"vrepf %%v4,%%v3,3 \n\t"
"aebr %%f3,%%f4 \n\t"
"ste %%f3,12(%6) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Dot-product kernel for two matrix columns against one vector.
 *
 * Computes y[0] = sum(ap[0][i] * x[i]) and y[1] = sum(ap[1][i] * x[i]) and
 * stores (does not accumulate) the two sums into y.
 *
 *   n   element count. The main loop consumes n & -32 elements and the tail
 *       loop consumes n & 28, so a remainder of n & 3 elements is not read.
 *   ap  only ap[0] and ap[1] are used.
 *   x   vector of n floats.
 *   y   output, two floats.
 *
 * Asm operands: %0 = n, %1 = ap[0], %2 = ap[1], %3 = x, %4 = y.
 * r0 is the loop counter, r1 the running byte offset shared by all arrays.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* v0 / v1: four-lane partial sums for ap[0] / ap[1]; r1 = 0 byte offset */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"xgr %%r1,%%r1 \n\t"
/* r0 = n & -32; skip the unrolled loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
/* r0 = number of 32-element (128-byte) iterations */
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
/* prefetch all three input streams ahead of the current offset */
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
/* v16..v23 = 32 floats of x */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* for each 16-byte slice: v0 += x * ap[0], v1 += x * ap[1] */
"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,16(%%r1,%1) \n\t"
"vfmasb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmasb %%v1,%%v17,%%v27,%%v1 \n\t"
"vl %%v28,32(%%r1,%1) \n\t"
"vfmasb %%v0,%%v18,%%v28,%%v0 \n\t"
"vl %%v29,32(%%r1,%2) \n\t"
"vfmasb %%v1,%%v18,%%v29,%%v1 \n\t"
"vl %%v30,48(%%r1,%1) \n\t"
"vfmasb %%v0,%%v19,%%v30,%%v0 \n\t"
"vl %%v31,48(%%r1,%2) \n\t"
"vfmasb %%v1,%%v19,%%v31,%%v1 \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vfmasb %%v0,%%v21,%%v26,%%v0 \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vfmasb %%v1,%%v21,%%v27,%%v1 \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vfmasb %%v0,%%v22,%%v28,%%v0 \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vfmasb %%v1,%%v22,%%v29,%%v1 \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vfmasb %%v0,%%v23,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmasb %%v1,%%v23,%%v31,%%v1 \n\t"
/* advance 128 bytes; loop while the decremented counter is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* tail: r0 = n & 28, i.e. the remaining whole groups of four floats */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
/* one 16-byte slice per iteration */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
/*
 * Horizontal reduction: broadcast lanes 1..3 of v0 into v2 one at a time and
 * add each to lane 0 through the scalar f0/f2 views, then store y[0].
 */
"vrepf %%v2,%%v0,1 \n\t"
"aebr %%f0,%%f2 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"aebr %%f0,%%f2 \n\t"
"vrepf %%v2,%%v0,3 \n\t"
"aebr %%f0,%%f2 \n\t"
"ste %%f0,0(%4) \n\t"
/* same reduction for v1, stored to y[1] */
"vrepf %%v2,%%v1,1 \n\t"
"aebr %%f1,%%f2 \n\t"
"vrepf %%v2,%%v1,2 \n\t"
"aebr %%f1,%%f2 \n\t"
"vrepf %%v2,%%v1,3 \n\t"
"aebr %%f1,%%f2 \n\t"
"ste %%f1,4(%4) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
/* y is written although it is listed as an input; the "memory" clobber covers the store */
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Dot-product kernel for a single matrix column:
 *   y[0] = sum(a0[i] * x[i])
 * The sum overwrites y[0]; it is not accumulated into it.
 *
 *   n   element count. The main loop consumes n & -32 elements and the tail
 *       loop consumes n & 28, so a remainder of n & 3 elements is not read.
 *   a0  column of n floats.
 *   x   vector of n floats.
 *   y   output, one float.
 *
 * Asm operands: %0 = n, %1 = a0, %2 = x, %3 = y.
 * r0 is the loop counter, r1 the running byte offset shared by a0 and x.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* v0: four-lane partial sum; r1 = 0 byte offset */
"vzero %%v0 \n\t"
"xgr %%r1,%%r1 \n\t"
/* r0 = n & -32; skip the unrolled loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
/* r0 = number of 32-element (128-byte) iterations */
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
/* v16..v23 = 32 floats of x */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* v0 += x * a0, one 16-byte slice at a time */
"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%1) \n\t"
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%1) \n\t"
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%1) \n\t"
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%1) \n\t"
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%1) \n\t"
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%1) \n\t"
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%1) \n\t"
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
/* advance 128 bytes; loop while the decremented counter is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* tail: r0 = n & 28, i.e. the remaining whole groups of four floats */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
/*
 * Horizontal reduction: broadcast lanes 1..3 of v0 into v1 one at a time and
 * add each to lane 0 through the scalar f0/f1 views, then store y[0].
 */
"vrepf %%v1,%%v0,1 \n\t"
"aebr %%f0,%%f1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"vrepf %%v1,%%v0,3 \n\t"
"aebr %%f0,%%f1 \n\t"
"ste %%f0,0(%3) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
/* y is written although it is listed as an input; the "memory" clobber covers the store */
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Packs a strided vector into contiguous storage: dest[k] receives the k-th
 * element of src, where consecutive source elements are inc_src apart.
 * Nothing is copied when n <= 0.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG remaining = n;
    FLOAT *in = src;
    FLOAT *out = dest;

    while (remaining > 0)
    {
        *out++ = *in;
        in += inc_src;
        remaining--;
    }
}
/*
 * Contiguous scaled accumulation: dest[i] += da * src[i].
 *
 *   n     element count. The main loop consumes n & -32 elements and the tail
 *         loop consumes n & 28, so a remainder of n & 3 elements is untouched.
 *   da    scale factor.
 *   src   input, n floats.
 *   dest  updated in place, n floats.
 *
 * Asm operands: %0 = n, %1 = da (in memory), %2 = src, %3 = dest.
 * r0 is the loop counter, r1 the running byte offset shared by src and dest.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
__asm__ volatile (
/* v0 = da replicated into all four lanes; r1 = 0 byte offset */
"vlrepf %%v0,%1 \n\t"
"xgr %%r1,%%r1 \n\t"
/* r0 = n & -32; skip the unrolled loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"
/* r0 = number of 32-element (128-byte) iterations */
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
/* prefetch src for reading (code 1) and dest for writing (code 2) */
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* v16..v23 = 32 floats of src */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* per 16-byte slice: load dest, dest = src * da + dest, store back */
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"
"vl %%v26, 32(%%r1,%3) \n\t"
"vfmasb %%v26,%%v18,%%v0,%%v26 \n\t"
"vst %%v26, 32(%%r1,%3) \n\t"
"vl %%v27, 48(%%r1,%3) \n\t"
"vfmasb %%v27,%%v19,%%v0,%%v27 \n\t"
"vst %%v27, 48(%%r1,%3) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vfmasb %%v28,%%v20,%%v0,%%v28 \n\t"
"vst %%v28, 64(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29 \n\t"
"vst %%v29, 80(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30 \n\t"
"vst %%v30, 96(%%r1,%3) \n\t"
"vl %%v31, 112(%%r1,%3) \n\t"
"vfmasb %%v31,%%v23,%%v0,%%v31 \n\t"
"vst %%v31, 112(%%r1,%3) \n\t"
/* advance 128 bytes; loop while the decremented counter is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"1: \n\t"
/* tail: r0 = n & 28, i.e. the remaining whole groups of four floats */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"
"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"
"3: \n\t"
/* the nop only gives label 3 an instruction to land on */
"nop "
:
:"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
/* dest is written although it is listed as an input; the "memory" clobber covers the stores */
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Adds da * src[k] to the k-th element of dest for k in [0, n), where
 * consecutive dest elements are inc_dest apart. A unit stride is delegated
 * to the vector kernel add_y_kernel_4; any other stride uses a scalar loop.
 */
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;
    FLOAT *out;

    if (inc_dest == 1)
    {
        add_y_kernel_4(n, da, src, dest);
        return;
    }

    out = dest;
    for (k = 0; k < n; k++)
    {
        *out += src[k] * da;
        out += inc_dest;
    }
}
/*
 * Transposed matrix-vector update. For every column j in [0, n):
 *   y[j * inc_y] += alpha * sum_i a[j * lda + i] * x[i * inc_x],  0 <= i < m.
 *
 *   m, n      number of rows (elements of x) and columns (elements of y).
 *   dummy1    unused.
 *   alpha     scale factor applied to every dot product.
 *   a, lda    matrix storage; consecutive columns are lda elements apart.
 *   x, inc_x  input vector and its stride.
 *   y, inc_y  vector updated in place and its stride.
 *   buffer    scratch space: the first min(m, NBMAX) floats hold a packed
 *             copy of x when inc_x != 1, the floats after that hold the
 *             per-block dot products before they are added to y.
 *
 * Always returns 0; nothing is done when m < 1 or n < 1.
 *
 * Rows are processed in blocks of at most NBMAX through the vector kernels;
 * the last m & 3 rows are handled by the scalar code at the end.
 * NOTE(review): NBMAX and sgemv_kernel_4x4 are defined outside this function;
 * the masking with (NBMAX-1) presumes NBMAX is a power of two - confirm.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG register i;
BLASLONG register j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n0;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* receives the one or two results of the 4x1 / 4x2 kernels */
FLOAT ybuffer[2] __attribute__ ((aligned(16)));
FLOAT *xbuffer;
FLOAT *ytemp;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
xbuffer = buffer;
ytemp = buffer + (m < NBMAX ? m : NBMAX);
/* column split: n0 groups of NBMAX columns, n1 groups of four, n2 leftover columns */
n0 = n / NBMAX;
n1 = (n % NBMAX) >> 2 ;
n2 = n & 3 ;
/* row split: m3 leftover rows for the scalar tail, m1 rows for the blocked loop,
   m2 rows in the final partial block */
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
BLASLONG NB = NBMAX;
/* one iteration per row block; NB only drops below NBMAX for the final
   partial block, which also terminates the loop */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* the kernels need contiguous x: use it directly or pack it into the buffer */
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(NB,x_ptr,xbuffer,inc_x);
FLOAT *ap[4];
FLOAT *yp;
BLASLONG register lda4 = 4 * lda;
/* ap[0..3] point at four consecutive columns of the current row block */
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
if ( n0 > 0 )
{
BLASLONG nb1 = NBMAX / 4;
for( j=0; j<n0; j++)
{
yp = ytemp;
/* fill ytemp four results at a time, then fold the whole group into y */
for( i = 0; i < nb1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += nb1 * inc_y * 4;
a_ptr += nb1 * lda4 ;
}
}
/* remaining groups of four columns */
yp = ytemp;
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
if ( n1 > 0 )
{
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += n1 * inc_y * 4;
a_ptr += n1 * lda4 ;
}
/* two leftover columns: ap[0] and ap[1] already point at them */
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer);
a_ptr += lda * 2;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1] * alpha;
y_ptr += inc_y;
}
/* one leftover column, addressed through a_ptr (ap is not advanced above) */
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
// a_ptr += lda;
*y_ptr += ybuffer[0] * alpha;
// y_ptr += inc_y;
}
/* move on to the next row block */
a += NB;
x += NB * inc_x;
}
/* scalar tail: a and x now point at the first of the m3 unprocessed rows */
if ( m3 == 0 ) return(0);
x_ptr = x;
a_ptr = a;
if ( m3 == 3 )
{
/* pre-scale the three x values by alpha once */
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
/* lda == 3: the columns are packed back to back, so index aj directly */
if ( lda == 3 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
}
else
{
/* general lda, contiguous y: four columns per iteration */
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
aj += lda;
}
}
else
{
/* strided y: one column per iteration */
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
if ( m3 == 2 )
{
/* same three layouts as the m3 == 3 case, with two rows */
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 2 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
aj += 8;
}
for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
aj += 2;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
aj += lda4;
}
for ( ; j< n ; j++ )
{
y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}
/* m3 == 1: a single remaining row */
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
/* lda == 1: the row is contiguous, so aj is indexed by j and never advanced */
if ( lda == 1 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[j] * xtemp;
y_ptr[j+1] += aj[j+1] * xtemp;
y_ptr[j+2] += aj[j+2] * xtemp;
y_ptr[j+3] += aj[j+3] * xtemp;
}
for ( ; j<n ; j++ )
{
y_ptr[j] += aj[j] * xtemp;
}
}
else
{
if ( inc_y == 1 )
{
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp;
y_ptr[j+1] += *(aj+lda) * xtemp;
y_ptr[j+2] += *(aj+lda2) * xtemp;
y_ptr[j+3] += *(aj+lda3) * xtemp;
aj += lda4 ;
}
for ( ; j<n; j++ )
{
y_ptr[j] += *aj * xtemp;
aj += lda;
}
}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return(0);
}

162
kernel/zarch/smax.c Normal file
View File

@ -0,0 +1,162 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Returns the largest of the n contiguous floats at x.
 *
 * n must be a positive multiple of 64: the loop runs n >> 6 times over 256
 * bytes each, has no tail handling, and brctg decrements before testing, so a
 * zero count would not end the loop after zero iterations.
 *
 * Asm operands: %0 = result, %1 = n, %2 = x.
 * r0 is the loop counter, r1 the running byte offset, v0 the running maximum.
 *
 * Fix: the instruction folding v16 into v0 named its third operand "%%16";
 * it is now "%%v16", matching every other vector register reference here.
 */
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT max;
    __asm__ volatile (
        /* seed the running maximum with the first four elements */
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"
        /* v16..v31 = 64 floats */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"
        /* pairwise reduction tree: 16 -> 8 -> 4 -> 2 -> 1 vectors */
        "vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
        "vfmaxsb %%v17,%%v17,%%v25,0 \n\t"
        "vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
        "vfmaxsb %%v19,%%v19,%%v27,0 \n\t"
        "vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
        "vfmaxsb %%v21,%%v21,%%v29,0 \n\t"
        "vfmaxsb %%v22,%%v22,%%v30,0 \n\t"
        "vfmaxsb %%v23,%%v23,%%v31,0 \n\t"
        "vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
        "vfmaxsb %%v17,%%v17,%%v21,0 \n\t"
        "vfmaxsb %%v18,%%v18,%%v22,0 \n\t"
        "vfmaxsb %%v19,%%v19,%%v23,0 \n\t"
        "vfmaxsb %%v16,%%v16,%%v18,0 \n\t"
        "vfmaxsb %%v17,%%v17,%%v19,0 \n\t"
        "vfmaxsb %%v16,%%v16,%%v17,0 \n\t"
        /* fold this iteration's result into the running maximum */
        "vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /*
         * Horizontal reduction of v0: shifting each doubleword left by 32 bits
         * lines lane 1 up with lane 0 and lane 3 with lane 2; after the max,
         * lanes 0 and 2 hold the pair maxima. Lane 2 is then broadcast and
         * combined with lane 0 by the scalar max.
         */
        "veslg %%v16,%%v0,32 \n\t"
        "vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
        "vrepf %%v16,%%v0,2 \n\t"
        "wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
        "ler %0,%%f0 "
        :"=f"(max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
    return max;
}
/*
 * Returns the largest of the n elements of x, where consecutive elements are
 * inc_x apart. Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * With a unit stride the first n & -64 elements go through smax_kernel_64 and
 * the rest through a scalar loop; any other stride uses a scalar loop unrolled
 * four times.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG idx = 0;   /* position in x */
    BLASLONG seen = 0;  /* strided elements examined so far */

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x != 1) {
        const BLASLONG quads = n & -4;

        best = x[0];
        for (; seen < quads; seen += 4, idx += inc_x * 4) {
            const FLOAT *q = x + idx;

            if (q[0] > best) {
                best = q[0];
            }
            if (q[inc_x] > best) {
                best = q[inc_x];
            }
            if (q[2 * inc_x] > best) {
                best = q[2 * inc_x];
            }
            if (q[3 * inc_x] > best) {
                best = q[3 * inc_x];
            }
        }
        for (; seen < n; seen++, idx += inc_x) {
            if (x[idx] > best) {
                best = x[idx];
            }
        }
        return best;
    }

    /* contiguous input: vector kernel for the bulk, scalar loop for the rest */
    {
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = smax_kernel_64(bulk, x);
            idx = bulk;
        } else {
            best = x[0];
            idx = 1;
        }
    }
    for (; idx < n; idx++) {
        if (x[idx] > best) {
            best = x[idx];
        }
    }
    return best;
}

162
kernel/zarch/smin.c Normal file
View File

@ -0,0 +1,162 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Returns the smallest of the n contiguous floats at x.
 *
 * n must be a positive multiple of 64: the loop runs n >> 6 times over 256
 * bytes each, has no tail handling, and brctg decrements before testing, so a
 * zero count would not end the loop after zero iterations.
 *
 * Asm operands: %0 = result, %1 = n, %2 = x.
 * r0 is the loop counter, r1 the running byte offset, v0 the running minimum.
 *
 * Fix: the instruction folding v16 into v0 named its third operand "%%16";
 * it is now "%%v16", matching every other vector register reference here.
 */
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT min;
    __asm__ volatile (
        /* seed the running minimum with the first four elements */
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"
        /* v16..v31 = 64 floats */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"
        /* pairwise reduction tree: 16 -> 8 -> 4 -> 2 -> 1 vectors */
        "vfminsb %%v16,%%v16,%%v24,0 \n\t"
        "vfminsb %%v17,%%v17,%%v25,0 \n\t"
        "vfminsb %%v18,%%v18,%%v26,0 \n\t"
        "vfminsb %%v19,%%v19,%%v27,0 \n\t"
        "vfminsb %%v20,%%v20,%%v28,0 \n\t"
        "vfminsb %%v21,%%v21,%%v29,0 \n\t"
        "vfminsb %%v22,%%v22,%%v30,0 \n\t"
        "vfminsb %%v23,%%v23,%%v31,0 \n\t"
        "vfminsb %%v16,%%v16,%%v20,0 \n\t"
        "vfminsb %%v17,%%v17,%%v21,0 \n\t"
        "vfminsb %%v18,%%v18,%%v22,0 \n\t"
        "vfminsb %%v19,%%v19,%%v23,0 \n\t"
        "vfminsb %%v16,%%v16,%%v18,0 \n\t"
        "vfminsb %%v17,%%v17,%%v19,0 \n\t"
        "vfminsb %%v16,%%v16,%%v17,0 \n\t"
        /* fold this iteration's result into the running minimum */
        "vfminsb %%v0,%%v0,%%v16,0 \n\t"
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"
        /*
         * Horizontal reduction of v0: shifting each doubleword left by 32 bits
         * lines lane 1 up with lane 0 and lane 3 with lane 2; after the min,
         * lanes 0 and 2 hold the pair minima. Lane 2 is then broadcast and
         * combined with lane 0 by the scalar min.
         */
        "veslg %%v16,%%v0,32 \n\t"
        "vfminsb %%v0,%%v0,%%v16,0 \n\t"
        "vrepf %%v16,%%v0,2 \n\t"
        "wfminsb %%v0,%%v0,%%v16,0 \n\t"
        "ler %0,%%f0 "
        :"=f"(min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
    return min;
}
/*
 * Returns the smallest of the n elements of x, where consecutive elements are
 * inc_x apart. Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * With a unit stride the first n & -64 elements go through smin_kernel_64 and
 * the rest through a scalar loop; any other stride uses a scalar loop unrolled
 * four times.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG idx = 0;   /* position in x */
    BLASLONG seen = 0;  /* strided elements examined so far */

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x != 1) {
        const BLASLONG quads = n & -4;

        best = x[0];
        for (; seen < quads; seen += 4, idx += inc_x * 4) {
            const FLOAT *q = x + idx;

            if (q[0] < best) {
                best = q[0];
            }
            if (q[inc_x] < best) {
                best = q[inc_x];
            }
            if (q[2 * inc_x] < best) {
                best = q[2 * inc_x];
            }
            if (q[3 * inc_x] < best) {
                best = q[3 * inc_x];
            }
        }
        for (; seen < n; seen++, idx += inc_x) {
            if (x[idx] < best) {
                best = x[idx];
            }
        }
        return best;
    }

    /* contiguous input: vector kernel for the bulk, scalar loop for the rest */
    {
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = smin_kernel_64(bulk, x);
            idx = bulk;
        } else {
            best = x[0];
            idx = 1;
        }
    }
    for (; idx < n; idx++) {
        if (x[idx] < best) {
            best = x[idx];
        }
    }
    return best;
}

246
kernel/zarch/srot.c Normal file
View File

@ -0,0 +1,246 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Applies a plane rotation in place to two contiguous vectors:
 *   x[i] = c * x[i] + s * y[i]
 *   y[i] = c * y[i] - s * x[i]     (using the old x[i])
 *
 *   n     element count. The loop runs n >> 6 times over 64 floats each and
 *         has no tail handling; brctg decrements before testing, so n is
 *         expected to be a positive multiple of 64 (the caller in this file
 *         passes n & -64 and only when that is > 0).
 *   x, y  vectors updated in place.
 *   c, s  pointers to the rotation coefficients, read once at entry.
 *
 * Asm operands: %0 = n, %1 = x, %2 = y, %3 = *c, %4 = *s.
 * r0 is the loop counter, r1 the running byte offset; v0 = c and v1 = s
 * replicated across all four lanes.
 */
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* both vectors are rewritten, so prefetch them for store access */
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* ---- elements 0..15: v24..v27 = x, v16..v19 = y ---- */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
/* first parts: v28..v31 = x*c (new x), v20..v23 = x*s (new y) */
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second parts: new x = y*s + x*c, new y = y*c - x*s */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* ---- elements 16..31: same sequence at byte offsets 64..112 ---- */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second parts: new x = y*s + x*c, new y = y*c - x*s */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* ---- elements 32..47: same sequence at byte offsets 128..176 ---- */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second parts: new x = y*s + x*c, new y = y*c - x*s */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* ---- elements 48..63: same sequence at byte offsets 192..240 ---- */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second parts: new x = y*s + x*c, new y = y*c - x*s */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
/* advance 256 bytes; loop while the decremented counter is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
/* x and y are written although listed as inputs; the "memory" clobber covers the stores */
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Applies a plane rotation in place to n element pairs:
 *   x' = c * x + s * y
 *   y' = c * y - s * x
 * where consecutive elements of x / y are inc_x / inc_y apart.
 *
 * When both strides are 1 the first n & -64 elements go through
 * srot_kernel_64 and the rest through a scalar loop; otherwise everything is
 * done by a scalar loop. Always returns 0; does nothing when n <= 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k = 0;
    FLOAT rotated_x;

    if (n <= 0)
    {
        return 0;
    }

    if (inc_x != 1 || inc_y != 1)
    {
        /* general strides: walk both vectors with independent indices */
        BLASLONG xi = 0;
        BLASLONG yi = 0;

        for (; k < n; k++)
        {
            rotated_x = c*x[xi] + s*y[yi] ;
            y[yi] = c*y[yi] - s*x[xi] ;
            x[xi] = rotated_x ;
            xi += inc_x;
            yi += inc_y;
        }
        return 0;
    }

    /* contiguous vectors: vector kernel for the bulk, scalar loop for the rest */
    {
        const BLASLONG bulk = n & -64;

        if (bulk > 0)
        {
            /* the kernel reads the coefficients through pointers */
            FLOAT cosa = c;
            FLOAT sina = s;

            srot_kernel_64(bulk, x, y, &cosa, &sina);
            k = bulk;
        }
    }
    for (; k < n; k++)
    {
        rotated_x = c*x[k] + s*y[k] ;
        y[k] = c*y[k] - s*x[k] ;
        x[k] = rotated_x ;
    }
    return 0;
}

201
kernel/zarch/sscal.c Normal file
View File

@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Scales a contiguous vector in place: x[i] = x[i] * da.
 *
 *   n   element count. The loop runs n >> 5 times over 32 floats each and has
 *       no tail handling; brctg decrements before testing, so n is expected
 *       to be a positive multiple of 32 (the caller in this file passes
 *       n & -32 and only when that is > 0).
 *   da  scale factor.
 *   x   vector updated in place.
 *
 * Asm operands: %0 = n, %1 = da (in memory), %2 = x.
 * r0 is the loop counter, r1 the running byte offset, v0 = da replicated
 * across all four lanes.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x)
{
__asm__ volatile (
"vlrepf %%v0,%1 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* x is rewritten, so prefetch it for store access */
"pfd 2, 1024(%%r1,%2) \n\t"
/* eight load / multiply / store triples, 16 bytes each; v24..v27 are reused */
"vl %%v24, 0(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
/* advance 128 bytes; loop while the decremented counter is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
/* x is written although it is listed as an input; the "memory" clobber covers the stores */
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
}
/*
 * Fills a contiguous vector with zeros: x[i] = 0. The previous contents of x
 * are never loaded.
 *
 *   n   element count. The loop runs n >> 5 times over 32 floats each and has
 *       no tail handling; brctg decrements before testing, so n is expected
 *       to be a positive multiple of 32 (the caller in this file passes
 *       n & -32 and only when that is > 0).
 *   x   vector to clear.
 *
 * Asm operands: %0 = n, %1 = x.
 * r0 is the loop counter, r1 the running byte offset.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
/* four zeroed registers, each stored twice per iteration */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
/* advance 128 bytes; loop while the decremented counter is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
/* x is written although it is listed as an input; the "memory" clobber covers the stores */
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}
/*
 * Scales the n elements of x (consecutive elements inc_x apart) by da, in
 * place. When da compares equal to zero the elements are set to 0.0 outright
 * instead of being multiplied.
 *
 * With a unit stride the first n & -32 elements go through the vector kernels
 * and the rest through a scalar loop; any other stride uses a scalar loop
 * unrolled twice. dummy0, dummy1, y, inc_y, dummy and dummy2 are unused.
 * Always returns 0; does nothing when n <= 0 or inc_x <= 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG pos = 0;   /* position in x (strided path) */
    BLASLONG done = 0;  /* elements processed so far */
    const int zero_fill = (da == 0.0);

    if (n <= 0 || inc_x <= 0)
    {
        return 0;
    }

    if (inc_x == 1)
    {
        const BLASLONG bulk = n & -32;

        if (bulk > 0)
        {
            if (zero_fill)
            {
                sscal_kernel_32_zero(bulk, x);
            }
            else
            {
                sscal_kernel_32(bulk, da, x);
            }
            done = bulk;
        }

        /* scalar remainder */
        if (zero_fill)
        {
            for (; done < n; done++)
            {
                x[done]=0.0;
            }
        }
        else
        {
            for (; done < n; done++)
            {
                x[done] = da * x[done] ;
            }
        }
        return 0;
    }

    /* strided path: two elements per iteration, then at most one leftover */
    {
        const BLASLONG pairs = n & -2;

        if (zero_fill)
        {
            for (; done < pairs; done += 2, pos += inc_x * 2)
            {
                x[pos]=0.0;
                x[pos + inc_x]=0.0;
            }
            for (; done < n; done++, pos += inc_x)
            {
                x[pos]=0.0;
            }
        }
        else
        {
            for (; done < pairs; done += 2, pos += inc_x * 2)
            {
                x[pos] = da * x[pos] ;
                x[pos + inc_x] = da * x[pos + inc_x];
            }
            for (; done < n; done++, pos += inc_x)
            {
                x[pos] = da * x[pos] ;
            }
        }
    }
    return 0;
}

164
kernel/zarch/sswap.c Normal file
View File

@ -0,0 +1,164 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Exchange the contents of x and y, 256 bytes of each vector per loop pass.
 *
 * Asm operands: %0 = n, %1 = x, %2 = y.
 * The pass count is n >> 6, i.e. one pass per 64 elements (64 elements
 * fill 256 bytes only if FLOAT is 4 bytes wide - presumably the case for
 * this single-precision file; TODO confirm).
 * The visible caller passes n & -64 and only when that value is > 0.
 * The loop decrements the count before testing it (brctg), so n must be
 * at least 64 on entry.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
/* r0 = n >> 6 (number of passes), r1 = 0 (running byte offset) */
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (code 2: store access) 1024 bytes ahead in both vectors */
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* v16..v31 <- 256 bytes of x at the current offset */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
/* v0..v7 <- first 128 bytes of y, then written over the same range of x */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
/* v0..v7 reused for the second 128 bytes of y, again stored into x */
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
/* the saved x data (v16..v31) goes to y, completing the exchange */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
/* advance the offset by 256 bytes; decrement r0 and loop while non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
/* no output operands: the stores to x and y are declared through the "memory" clobber */
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Swap the n elements of x with the n elements of y.
 *
 * x is walked with stride inc_x and y with stride inc_y.  When both strides
 * are 1 the leading multiple-of-64 part is handed to sswap_kernel_64 and
 * the remaining elements are swapped one by one.  The dummy* parameters
 * are not used.  Always returns 0; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG done;
    FLOAT saved;

    if (n < 1)
        return 0;

    if (inc_x != 1 || inc_y != 1) {
        /* Strided vectors: plain element-wise exchange. */
        BLASLONG xpos = 0;
        BLASLONG ypos = 0;

        for (done = 0; done < n; done++) {
            saved = y[ypos];
            y[ypos] = x[xpos];
            x[xpos] = saved;
            xpos += inc_x;
            ypos += inc_y;
        }
        return 0;
    }

    /* Contiguous vectors: vector kernel first, scalar loop for the rest. */
    done = n & -64;
    if (done > 0)
        sswap_kernel_64(done, x, y);

    for (; done < n; done++) {
        saved = y[done];
        y[done] = x[done];
        x[done] = saved;
    }
    return 0;
}

211
kernel/zarch/zamax.c Normal file
View File

@ -0,0 +1,211 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS is fabs when DOUBLE is defined and fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x,i): |x[i]| + |x[i+1]|, i.e. the sum of the absolute values of
 * two adjacent array entries starting at index i.
 * Both arguments are parenthesised so that expression arguments such as
 * a conditional index or an offset pointer expand as intended.
 * Each argument is evaluated twice, so it must be free of side effects.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * Return the largest |re| + |im| over the first n complex entries of x,
 * where x holds interleaved (re, im) pairs of 8-byte values.
 *
 * Asm operands: %0 = amax (result), %1 = n, %2 = x.
 * Each loop pass consumes 256 bytes (16 complex entries) and the pass
 * count is n >> 4.  The visible caller passes n & -16 and only when that
 * value is > 0; the loop decrements before testing (brctg), so n must be
 * at least 16.
 * Uses vfmaxdb/wfmaxdb; the sibling *_z13.c kernel computes the same
 * reduction with compare-and-select instead.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amax;
__asm__ volatile (
/* seed: v0 = { |re0|+|im0|, |re1|+|im1| } from the first two entries */
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
/* r0 = n >> 4 (number of passes), r1 = 0 (running byte offset) */
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* gather: even registers v16,v18,..,v30 receive two real parts each,
   odd registers v17,v19,..,v31 the matching imaginary parts */
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"
/* absolute value of every gathered component */
"vflpdb %%v16,%%v16 \n\t"
"vflpdb %%v17,%%v17 \n\t"
"vflpdb %%v18,%%v18 \n\t"
"vflpdb %%v19,%%v19 \n\t"
"vflpdb %%v20,%%v20 \n\t"
"vflpdb %%v21,%%v21 \n\t"
"vflpdb %%v22,%%v22 \n\t"
"vflpdb %%v23,%%v23 \n\t"
"vflpdb %%v24,%%v24 \n\t"
"vflpdb %%v25,%%v25 \n\t"
"vflpdb %%v26,%%v26 \n\t"
"vflpdb %%v27,%%v27 \n\t"
"vflpdb %%v28,%%v28 \n\t"
"vflpdb %%v29,%%v29 \n\t"
"vflpdb %%v30,%%v30 \n\t"
"vflpdb %%v31,%%v31 \n\t"
/* |re| + |im| for each entry: results land in the even registers */
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v18,%%v18,%%v19 \n\t"
"vfadb %%v20,%%v20,%%v21 \n\t"
"vfadb %%v22,%%v22,%%v23 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
/* pairwise max tree: eight vectors -> v16, then folded into v0 */
"vfmaxdb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxdb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxdb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxdb %%v22,%%v22,%%v30,0 \n\t"
"vfmaxdb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxdb %%v18,%%v18,%%v22,0 \n\t"
"vfmaxdb %%v16,%%v16,%%v18,0 \n\t"
"vfmaxdb %%v0,%%v0,%%v16,0 \n\t"
/* advance the offset by 256 bytes; decrement r0 and loop while non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: max of the two lanes of v0, result copied out of f0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfmaxdb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
/* NOTE(review): the cast names n FLOATs although 2*n are read; the
   "memory" clobber below appears to be what covers the rest - confirm */
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amax;
}
/*
 * Largest |re| + |im| over n complex entries of x, taking every inc_x-th
 * entry (x stores interleaved re/im values).
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For inc_x == 1 the leading
 * multiple-of-16 part is reduced by zamax_kernel_16 and the remainder is
 * scanned in C; other strides are scanned in C in groups of four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT cand;
    BLASLONG seen = 0;  /* complex entries examined so far */
    BLASLONG pos = 0;   /* index in x of the current real part */

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x != 1) {
        const BLASLONG step = 2 * inc_x;
        const BLASLONG quads = n & -4;
        BLASLONG k;

        best = CABS1(x, 0);

        /* groups of four entries */
        for (; seen < quads; seen += 4, pos += step * 4) {
            for (k = 0; k < 4; k++) {
                cand = CABS1(x, pos + step * k);
                if (cand > best)
                    best = cand;
            }
        }

        /* up to three leftover entries */
        for (; seen < n; seen++, pos += step) {
            cand = CABS1(x, pos);
            if (cand > best)
                best = cand;
        }
        return best;
    }

    /* unit stride */
    {
        const BLASLONG bulk = n & -16;

        if (bulk > 0) {
            best = zamax_kernel_16(bulk, x);
            seen = bulk;
            pos = bulk * 2;
        } else {
            /* fewer than 16 entries: start from the first one */
            best = CABS1(x, 0);
            seen = 1;
            pos = 2;
        }
    }

    for (; seen < n; seen++, pos += 2) {
        cand = CABS1(x, pos);
        if (cand > best)
            best = cand;
    }
    return best;
}

221
kernel/zarch/zamax_z13.c Normal file
View File

@ -0,0 +1,221 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS is fabs when DOUBLE is defined and fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x,i): |x[i]| + |x[i+1]|, i.e. the sum of the absolute values of
 * two adjacent array entries starting at index i.
 * Both arguments are parenthesised so that expression arguments such as
 * a conditional index or an offset pointer expand as intended.
 * Each argument is evaluated twice, so it must be free of side effects.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * Return the largest |re| + |im| over the first n complex entries of x,
 * where x holds interleaved (re, im) pairs of 8-byte values.
 *
 * Asm operands: %0 = amax (result), %1 = n, %2 = x.
 * Each loop pass consumes 256 bytes (16 complex entries) in two halves of
 * 128 bytes, and the pass count is n >> 4.  The visible caller passes
 * n & -16 and only when that value is > 0; the loop decrements before
 * testing (brctg), so n must be at least 16.
 * The maximum is formed with vfchdb (compare "high") followed by vsel
 * rather than with a dedicated max instruction.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amax;
__asm__ volatile (
/* seed: v0 = { |re0|+|im0|, |re1|+|im1| } from the first two entries */
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
/* r0 = n >> 4 (number of passes), r1 = 0 (running byte offset) */
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half (bytes 0..127): even registers take real parts,
   odd registers the matching imaginary parts */
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
/* absolute values */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* |re| + |im|, compacted into v16..v19 */
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
/* max tree: mask = (a > b), vsel keeps a where the mask is set, else b;
   the last pair folds the result into the running maximum v0 */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
/* second half (bytes 128..255): same gather / abs / add / max sequence */
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
/* advance the offset by 256 bytes; decrement r0 and loop while non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: compare lane 0 with a copy of lane 1, keep the larger,
   then copy the result out of f0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
/* NOTE(review): the cast names n FLOATs although 2*n are read; the
   "memory" clobber below appears to be what covers the rest - confirm */
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return amax;
}
/*
 * Largest |re| + |im| over n complex entries of x, taking every inc_x-th
 * entry (x stores interleaved re/im values).
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For inc_x == 1 the leading
 * multiple-of-16 part is reduced by zamax_kernel_16 and the remainder is
 * scanned in C; other strides are scanned in C in groups of four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT cand;
    BLASLONG seen = 0;  /* complex entries examined so far */
    BLASLONG pos = 0;   /* index in x of the current real part */

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x != 1) {
        const BLASLONG step = 2 * inc_x;
        const BLASLONG quads = n & -4;
        BLASLONG k;

        best = CABS1(x, 0);

        /* groups of four entries */
        for (; seen < quads; seen += 4, pos += step * 4) {
            for (k = 0; k < 4; k++) {
                cand = CABS1(x, pos + step * k);
                if (cand > best)
                    best = cand;
            }
        }

        /* up to three leftover entries */
        for (; seen < n; seen++, pos += step) {
            cand = CABS1(x, pos);
            if (cand > best)
                best = cand;
        }
        return best;
    }

    /* unit stride */
    {
        const BLASLONG bulk = n & -16;

        if (bulk > 0) {
            best = zamax_kernel_16(bulk, x);
            seen = bulk;
            pos = bulk * 2;
        } else {
            /* fewer than 16 entries: start from the first one */
            best = CABS1(x, 0);
            seen = 1;
            pos = 2;
        }
    }

    for (; seen < n; seen++, pos += 2) {
        cand = CABS1(x, pos);
        if (cand > best)
            best = cand;
    }
    return best;
}

211
kernel/zarch/zamin.c Normal file
View File

@ -0,0 +1,211 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS is fabs when DOUBLE is defined and fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x,i): |x[i]| + |x[i+1]|, i.e. the sum of the absolute values of
 * two adjacent array entries starting at index i.
 * Both arguments are parenthesised so that expression arguments such as
 * a conditional index or an offset pointer expand as intended.
 * Each argument is evaluated twice, so it must be free of side effects.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * Return the smallest |re| + |im| over the first n complex entries of x,
 * where x holds interleaved (re, im) pairs of 8-byte values.
 *
 * Asm operands: %0 = amin (result), %1 = n, %2 = x.
 * Each loop pass consumes 256 bytes (16 complex entries) and the pass
 * count is n >> 4.  The visible caller passes n & -16 and only when that
 * value is > 0; the loop decrements before testing (brctg), so n must be
 * at least 16.
 * Uses vfmindb/wfmindb; the sibling *_z13.c kernel computes the same
 * reduction with compare-and-select instead.
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amin;
__asm__ volatile (
/* seed: v0 = { |re0|+|im0|, |re1|+|im1| } from the first two entries */
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
/* r0 = n >> 4 (number of passes), r1 = 0 (running byte offset) */
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* gather: even registers v16,v18,..,v30 receive two real parts each,
   odd registers v17,v19,..,v31 the matching imaginary parts */
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"
/* absolute value of every gathered component */
"vflpdb %%v16,%%v16 \n\t"
"vflpdb %%v17,%%v17 \n\t"
"vflpdb %%v18,%%v18 \n\t"
"vflpdb %%v19,%%v19 \n\t"
"vflpdb %%v20,%%v20 \n\t"
"vflpdb %%v21,%%v21 \n\t"
"vflpdb %%v22,%%v22 \n\t"
"vflpdb %%v23,%%v23 \n\t"
"vflpdb %%v24,%%v24 \n\t"
"vflpdb %%v25,%%v25 \n\t"
"vflpdb %%v26,%%v26 \n\t"
"vflpdb %%v27,%%v27 \n\t"
"vflpdb %%v28,%%v28 \n\t"
"vflpdb %%v29,%%v29 \n\t"
"vflpdb %%v30,%%v30 \n\t"
"vflpdb %%v31,%%v31 \n\t"
/* |re| + |im| for each entry: results land in the even registers */
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v18,%%v18,%%v19 \n\t"
"vfadb %%v20,%%v20,%%v21 \n\t"
"vfadb %%v22,%%v22,%%v23 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
/* pairwise min tree: eight vectors -> v16, then folded into v0 */
"vfmindb %%v16,%%v16,%%v24,0 \n\t"
"vfmindb %%v18,%%v18,%%v26,0 \n\t"
"vfmindb %%v20,%%v20,%%v28,0 \n\t"
"vfmindb %%v22,%%v22,%%v30,0 \n\t"
"vfmindb %%v16,%%v16,%%v20,0 \n\t"
"vfmindb %%v18,%%v18,%%v22,0 \n\t"
"vfmindb %%v16,%%v16,%%v18,0 \n\t"
"vfmindb %%v0,%%v0,%%v16,0 \n\t"
/* advance the offset by 256 bytes; decrement r0 and loop while non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: min of the two lanes of v0, result copied out of f0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfmindb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
/* NOTE(review): the cast names n FLOATs although 2*n are read; the
   "memory" clobber below appears to be what covers the rest - confirm */
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amin;
}
/*
 * Smallest |re| + |im| over n complex entries of x, taking every inc_x-th
 * entry (x stores interleaved re/im values).
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For inc_x == 1 the leading
 * multiple-of-16 part is reduced by zamin_kernel_16 and the remainder is
 * scanned in C; other strides are scanned in C in groups of four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT cand;
    BLASLONG seen = 0;  /* complex entries examined so far */
    BLASLONG pos = 0;   /* index in x of the current real part */

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x != 1) {
        const BLASLONG step = 2 * inc_x;
        const BLASLONG quads = n & -4;
        BLASLONG k;

        best = CABS1(x, 0);

        /* groups of four entries */
        for (; seen < quads; seen += 4, pos += step * 4) {
            for (k = 0; k < 4; k++) {
                cand = CABS1(x, pos + step * k);
                if (cand < best)
                    best = cand;
            }
        }

        /* up to three leftover entries */
        for (; seen < n; seen++, pos += step) {
            cand = CABS1(x, pos);
            if (cand < best)
                best = cand;
        }
        return best;
    }

    /* unit stride */
    {
        const BLASLONG bulk = n & -16;

        if (bulk > 0) {
            best = zamin_kernel_16(bulk, x);
            seen = bulk;
            pos = bulk * 2;
        } else {
            /* fewer than 16 entries: start from the first one */
            best = CABS1(x, 0);
            seen = 1;
            pos = 2;
        }
    }

    for (; seen < n; seen++, pos += 2) {
        cand = CABS1(x, pos);
        if (cand < best)
            best = cand;
    }
    return best;
}

221
kernel/zarch/zamin_z13.c Normal file
View File

@ -0,0 +1,221 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/* ABS is fabs when DOUBLE is defined and fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * CABS1(x,i): |x[i]| + |x[i+1]|, i.e. the sum of the absolute values of
 * two adjacent array entries starting at index i.
 * Both arguments are parenthesised so that expression arguments such as
 * a conditional index or an offset pointer expand as intended.
 * Each argument is evaluated twice, so it must be free of side effects.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/*
 * Return the smallest |re| + |im| over the first n complex entries of x,
 * where x holds interleaved (re, im) pairs of 8-byte values.
 *
 * Asm operands: %0 = amin (result), %1 = n, %2 = x.
 * Each loop pass consumes 256 bytes (16 complex entries) in two halves of
 * 128 bytes, and the pass count is n >> 4.  The visible caller passes
 * n & -16 and only when that value is > 0; the loop decrements before
 * testing (brctg), so n must be at least 16.
 * The minimum is formed with vfchdb (compare "high", operands swapped
 * relative to a max) followed by vsel rather than with a dedicated min
 * instruction.
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amin;
__asm__ volatile (
/* seed: v0 = { |re0|+|im0|, |re1|+|im1| } from the first two entries */
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
/* r0 = n >> 4 (number of passes), r1 = 0 (running byte offset) */
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first half (bytes 0..127): even registers take real parts,
   odd registers the matching imaginary parts */
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
/* absolute values */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* |re| + |im|, compacted into v16..v19 */
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
/* min tree: mask = (b > a), vsel keeps a where the mask is set, else b;
   the last pair folds the result into the running minimum v0 */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
/* second half (bytes 128..255): same gather / abs / add / min sequence */
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
/* advance the offset by 256 bytes; decrement r0 and loop while non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
/* horizontal step: compare a copy of lane 1 with lane 0, keep the smaller,
   then copy the result out of f0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
/* NOTE(review): the cast names n FLOATs although 2*n are read; the
   "memory" clobber below appears to be what covers the rest - confirm */
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return amin;
}
/*
 * Smallest |re| + |im| over n complex entries of x, taking every inc_x-th
 * entry (x stores interleaved re/im values).
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For inc_x == 1 the leading
 * multiple-of-16 part is reduced by zamin_kernel_16 and the remainder is
 * scanned in C; other strides are scanned in C in groups of four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    FLOAT cand;
    BLASLONG seen = 0;  /* complex entries examined so far */
    BLASLONG pos = 0;   /* index in x of the current real part */

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x != 1) {
        const BLASLONG step = 2 * inc_x;
        const BLASLONG quads = n & -4;
        BLASLONG k;

        best = CABS1(x, 0);

        /* groups of four entries */
        for (; seen < quads; seen += 4, pos += step * 4) {
            for (k = 0; k < 4; k++) {
                cand = CABS1(x, pos + step * k);
                if (cand < best)
                    best = cand;
            }
        }

        /* up to three leftover entries */
        for (; seen < n; seen++, pos += step) {
            cand = CABS1(x, pos);
            if (cand < best)
                best = cand;
        }
        return best;
    }

    /* unit stride */
    {
        const BLASLONG bulk = n & -16;

        if (bulk > 0) {
            best = zamin_kernel_16(bulk, x);
            seen = bulk;
            pos = bulk * 2;
        } else {
            /* fewer than 16 entries: start from the first one */
            best = CABS1(x, 0);
            seen = 1;
            pos = 2;
        }
    }

    for (; seen < n; seen++, pos += 2) {
        cand = CABS1(x, pos);
        if (cand < best)
            best = cand;
    }
    return best;
}

View File

@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else #else
#define ABS fabsf #define ABS fabsf
#endif #endif
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { {
FLOAT asum; FLOAT asum;
__asm__ ( __asm__ (
"pfd 1, 0(%[ptr_x]) \n\t" "vzero %%v0 \n\t"
"sllg %%r0,%[n],4 \n\t" "vzero %%v1 \n\t"
"agr %%r0,%[ptr_x] \n\t" "vzero %%v2 \n\t"
"vzero %%v0 \n\t" "vzero %%v3 \n\t"
"vzero %%v1 \n\t" "srlg %%r0,%1,4 \n\t"
"vzero %%v22 \n\t" "xgr %%r1,%%r1 \n\t"
"vzero %%v23 \n\t" "0: \n\t"
".align 16 \n\t" "pfd 1, 1024(%%r1,%2) \n\t"
"1: \n\t" "vl %%v16, 0(%%r1,%2) \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t" "vl %%v17, 16(%%r1,%2) \n\t"
"vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t" "vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vflpdb %%v24, %%v24 \n\t" "vl %%v20, 64(%%r1,%2) \n\t"
"vflpdb %%v25, %%v25 \n\t" "vl %%v21, 80(%%r1,%2) \n\t"
"vflpdb %%v26, %%v26 \n\t" "vl %%v22, 96(%%r1,%2) \n\t"
"vflpdb %%v27, %%v27 \n\t" "vl %%v23, 112(%%r1,%2) \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t" "vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v30, %%v30 \n\t" "vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v31, %%v31 \n\t" "vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t" "vflpdb %%v20, %%v20 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t" "vflpdb %%v21, %%v21 \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t" "vflpdb %%v22, %%v22 \n\t"
"vfadb %%v22,%%v22,%%v27 \n\t" "vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t" "vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v23,%%v23,%%v30 \n\t" "vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t" "vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t" "vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vflpdb %%v24, %%v24 \n\t" "vfadb %%v2,%%v2,%%v22 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vfadb %%v3,%%v3,%%v23 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t" "vl %%v16, 128(%%r1,%2) \n\t"
"vflpdb %%v28, %%v28 \n\t" "vl %%v17, 144(%%r1,%2) \n\t"
"vflpdb %%v29, %%v29 \n\t" "vl %%v18, 160(%%r1,%2) \n\t"
"vflpdb %%v30, %%v30 \n\t" "vl %%v19, 176(%%r1,%2) \n\t"
"vflpdb %%v31, %%v31 \n\t" "vl %%v20, 192(%%r1,%2) \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t" "vl %%v21, 208(%%r1,%2) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t" "vl %%v22, 224(%%r1,%2) \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t" "vl %%v23, 240(%%r1,%2) \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t"
"vfadb %%v22,%%v22,%%v27 \n\t" "vflpdb %%v16, %%v16 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t" "vflpdb %%v17, %%v17 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t" "vflpdb %%v18, %%v18 \n\t"
"vfadb %%v23,%%v23,%%v30 \n\t" "vflpdb %%v19, %%v19 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t" "vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t" "vflpdb %%v22, %%v22 \n\t"
"vfadb %%v24,%%v0,%%v1 \n\t" "vflpdb %%v23, %%v23 \n\t"
"vfadb %%v25,%%v23,%%v22 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t" "vfadb %%v0,%%v0,%%v16 \n\t"
"vrepg %%v1,%%v0,1 \n\t" "vfadb %%v1,%%v1,%%v17 \n\t"
"adbr %%f0,%%f1 \n\t" "vfadb %%v2,%%v2,%%v18 \n\t"
"ldr %[asum] ,%%f0" "vfadb %%v3,%%v3,%%v19 \n\t"
: [asum] "=f"(asum),[ptr_tmp] "+&a"(x) "vfadb %%v0,%%v0,%%v20 \n\t"
: [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x) "vfadb %%v1,%%v1,%%v21 \n\t"
: "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "vfadb %%v2,%%v2,%%v22 \n\t"
); "vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum; return asum;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 ) if ( n1 > 0 )
{ {
sumf=zasum_kernel_16(n1, x ); sumf = zasum_kernel_16(n1, x);
i=n1; i=n1;
ip=2*n1; ip=2*n1;
} }

View File

@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { {
__asm__ volatile(
BLASLONG tempR1 ;
__asm__ ("pfd 1, 0(%[x_tmp]) \n\t"
"pfd 2, 0(%[y_tmp]) \n\t"
#if !defined(CONJ) #if !defined(CONJ)
"lgdr %[t1],%[alpha_r] \n\t" "vlrepg %%v0,0(%3) \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint "vleg %%v1,8(%3),0 \n\t"
"lgdr %[t1],%[alpha_i] \n\t" "wflcdb %%v1,%%v1 \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint "vleg %%v1,8(%3),1 \n\t"
"vflcdb %%v29,%%v29 \n\t" //complement both #else
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} "vleg %%v0,0(%3),1 \n\t"
"vflcdb %%v0,%%v0 \n\t"
"vleg %%v0,0(%3),0 \n\t"
"vlrepg %%v1,8(%3) \n\t"
#endif
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
#else "vl %%v16,0(%%r1,%1) \n\t"
"lgdr %[t1],%[alpha_i] \n\t" "vl %%v17,16(%%r1,%1) \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint "vl %%v18,32(%%r1,%1) \n\t"
"lgdr %[t1],%[alpha_r] \n\t" "vl %%v19,48(%%r1,%1) \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint "vl %%v20,0(%%r1,%2) \n\t"
"vflcdb %%v28,%%v28 \n\t" //complement both "vl %%v21,16(%%r1,%2) \n\t"
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} "vl %%v22,32(%%r1,%2) \n\t"
#endif "vl %%v23,48(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"xgr %[t1],%[t1] \n\t" "vpdi %%v25,%%v17,%%v17,4 \n\t"
"sllg %[tmp],%[tmp],4 \n\t" "vpdi %%v26,%%v18,%%v18,4 \n\t"
"vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" "vpdi %%v27,%%v19,%%v19,4 \n\t"
"vl %%v31 , 16(%[t1],%[y_tmp]) \n\t"
"vl %%v6 , 32(%[t1],%[y_tmp]) \n\t"
"vl %%v7 , 48(%[t1],%[y_tmp]) \n\t"
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
"j 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
"vl %%v30, 64(%[t1],%[y_tmp]) \n\t"
"vl %%v31, 80(%[t1],%[y_tmp]) \n\t"
"vl %%v6 , 96(%[t1],%[y_tmp]) \n\t"
"vl %%v7 , 112(%[t1],%[y_tmp]) \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
"la %[t1],64(%[t1] ) \n\t"
"2: \n\t"
"pfd 1, 256(%[t1],%[x_tmp]) \n\t"
"pfd 2, 256(%[t1],%[y_tmp]) \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
"vl %%v16, 64(%[t1],%[y_tmp]) \n\t"
"vl %%v17, 80(%[t1],%[y_tmp]) \n\t"
"vl %%v18, 96(%[t1],%[y_tmp]) \n\t"
"vl %%v19, 112(%[t1],%[y_tmp]) \n\t"
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" "vst %%v28,0(%%r1,%2) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" "vst %%v29,16(%%r1,%2) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" "vst %%v30,32(%%r1,%2) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" "vst %%v31,48(%%r1,%2) \n\t"
"vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" "vl %%v16,64(%%r1,%1) \n\t"
"vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" "vl %%v17,80(%%r1,%1) \n\t"
"vst %%v6 , 32(%[t1],%[y_tmp]) \n\t" "vl %%v18,96(%%r1,%1) \n\t"
"vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" "vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"la %[t1],64(%[t1] ) \n\t" "vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"clgrjl %[t1],%[tmp],1b \n\t" "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
//---------------------------------------------------------------------- "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
: [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) "vst %%v28,64(%%r1,%2) \n\t"
: [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) "vst %%v29,80(%%r1,%2) \n\t"
: "cc", "v6","v7", "v16", "vst %%v30,96(%%r1,%2) \n\t"
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "vst %%v31,112(%%r1,%2) \n\t"
);
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));
if (n <= 0) return (0); if (n <= 0) return (0);
@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;
if (n1) { if (n1) {
zaxpy_kernel_8(n1, x, y, da_r,da_i); da[0] = da_r;
da[1] = da_i;
zaxpy_kernel_8(n1, x, y, da);
ix = 2 * n1; ix = 2 * n1;
} }
i = n1; i = n1;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n)
: [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
/*
 * Block copy from x to y using the storage-to-storage MVC instruction,
 * 256 bytes per loop iteration.
 *
 * n  element count; the loop runs n >> 4 times, i.e. one iteration per 16
 *    elements. 256 bytes / 16 elements = 16 bytes per element, which
 *    presumably means one (re, im) pair of 8-byte FLOATs -- TODO confirm.
 *    brctg decrements the counter before testing it, so the loop body is
 *    entered at least once; assumes the caller passes n >= 16 (confirm
 *    against call sites).
 * x  source, operand %1
 * y  destination, operand %2
 */
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* Work on private copies of the pointers (r1 = source, r2 = destination)
   so the input operands themselves are never modified. */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
/* r0 = n / 16 = number of 256-byte chunks to copy. */
"srlg %%r0,%0,4 \n\t"
"0: \n\t"
/* Prefetch 1024 bytes ahead: code 1 for the source (fetch access),
   code 2 for the destination (store access). */
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* Copy one 256-byte chunk from (r1) to (r2). */
"mvc 0(256,%%r2),0(%%r1) \n\t"
/* Advance both cursors to the next chunk. */
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
/* Decrement the chunk counter and loop while it is non-zero. */
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
/* r0/r1/r2 are used as scratch above; "memory" covers the writes to y. */
:"memory","cc","r0","r1","r2"
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{ {
BLASLONG i=0; BLASLONG i=0;
@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
} }
} }
return(0);
return(0);
} }

View File

@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#if defined(Z13)
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile( __asm__ volatile(
"pfd 1, 0(%[ptr_x_tmp]) \n\t" "vzero %%v24 \n\t"
"pfd 1, 0(%[ptr_y_tmp]) \n\t" "vzero %%v25 \n\t"
"vzero %%v24 \n\t" "vzero %%v26 \n\t"
"vzero %%v25 \n\t" "vzero %%v27 \n\t"
"vzero %%v26 \n\t" "vzero %%v28 \n\t"
"vzero %%v27 \n\t" "vzero %%v29 \n\t"
"srlg %[n_tmp],%[n_tmp],3 \n\t" "vzero %%v30 \n\t"
"xgr %%r1,%%r1 \n\t" "vzero %%v31 \n\t"
".align 16 \n\t" "srlg %%r0,%0,3 \n\t"
"1: \n\t" "xgr %%r1,%%r1 \n\t"
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" "0: \n\t"
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" "pfd 1, 1024(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" "pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"
"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"
"vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" "agfi %%r1,128 \n\t"
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" "brctg %%r0,0b \n\t"
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" "vfadb %%v24,%%v24,%%v26 \n\t"
"vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" "vfadb %%v24,%%v24,%%v28 \n\t"
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" "vfadb %%v24,%%v24,%%v30 \n\t"
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" "vfadb %%v25,%%v25,%%v27 \n\t"
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" "vfadb %%v25,%%v25,%%v29 \n\t"
"vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" "vfadb %%v25,%%v25,%%v31 \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t" "vsteg %%v24,0(%3),0 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t" "vsteg %%v24,8(%3),1 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t" "vsteg %%v25,16(%3),1 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t" "vsteg %%v25,24(%3),0 "
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" :
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" );
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
"la %%r1,128(%%r1) \n\t"
"brctg %[n_tmp],1b \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t"
"vsteg %%v24, 0(%[ptr_d]),0 \n\t"
"vsteg %%v24, 8(%[ptr_d]),1 \n\t"
"vsteg %%v25,16(%[ptr_d]),1 \n\t"
"vsteg %%v25,24(%[ptr_d]),0 \n\t"
: [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n)
: [mem_x] "m"( *(const double (*)[2*n])x),
[mem_y] "m"( *(const double (*)[2*n])y),
[ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d)
: "cc", "r1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#else
/*
 * Portable C version of the complex dot-product kernel.
 *
 * x and y are read as interleaved pairs (two FLOATs per element). Four
 * partial sums are accumulated over the pairs and written to d:
 *   d[0] = sum of x[2k]   * y[2k]
 *   d[1] = sum of x[2k+1] * y[2k+1]
 *   d[2] = sum of x[2k]   * y[2k+1]
 *   d[3] = sum of x[2k+1] * y[2k]
 * The caller combines these into the final result.
 *
 * Elements are consumed in groups of four, so the number of pairs actually
 * read is n rounded up to a multiple of 4. d is overwritten, not added to.
 */
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
    const FLOAT *px = x;
    const FLOAT *py = y;
    FLOAT sum_first = 0.0;   /* x[2k]   * y[2k]   */
    FLOAT sum_second = 0.0;  /* x[2k+1] * y[2k+1] */
    FLOAT sum_cross_a = 0.0; /* x[2k]   * y[2k+1] */
    FLOAT sum_cross_b = 0.0; /* x[2k+1] * y[2k]   */
    BLASLONG done;
    int k;

    for (done = 0; done < n; done += 4) {
        /* One group of four pairs; the per-accumulator addition order is
           the same as visiting the pairs one after another. */
        for (k = 0; k < 4; k++) {
            sum_first += px[0] * py[0];
            sum_second += px[1] * py[1];
            sum_cross_a += px[0] * py[1];
            sum_cross_b += px[1] * py[0];
            px += 2;
            py += 2;
        }
    }

    d[0] = sum_first;
    d[1] = sum_second;
    d[2] = sum_cross_a;
    d[3] = sum_cross_b;
}
#endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0; BLASLONG i;
BLASLONG ix=0, iy=0; BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result; OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;
BLASLONG j=0;
if (n1){ if (n1)
zdot_kernel_8(n1, x, y, dot); zdot_kernel_8(n1, x, y, dot);
i = n1;
j = n1 <<1; i = n1;
} BLASLONG j = i * 2;
while (i < n) { while (i < n) {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{ {
__asm__ ( __asm__ (
"pfd 2, 0(%[ptr_x]) \n\t" "vlrepg %%v0,%3 \n\t"
"pfd 2, 0(%[ptr_y]) \n\t" "vlrepg %%v1,%4 \n\t"
"lgdr %%r1,%[cos] \n\t" "srlg %%r0,%0,4 \n\t"
"vlvgp %%v0,%%r1,%%r1 \n\t" "xgr %%r1,%%r1 \n\t"
"lgdr %%r1,%[sin] \n\t" "0: \n\t"
"vlvgp %%v1,%%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%1) \n\t"
"sllg %[tmp],%[tmp],4 \n\t" "pfd 2, 1024(%%r1,%2) \n\t"
"xgr %%r1,%%r1 \n\t" "vl %%v24, 0(%%r1,%1) \n\t"
".align 16 \n\t" "vl %%v25, 16(%%r1,%1) \n\t"
"1: \n\t" "vl %%v26, 32(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t" "vl %%v27, 48(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t" "vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t" "vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t" "vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t" "vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t" "vfmdb %%v28,%%v24,%%v0 \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t" "vfmdb %%v29,%%v25,%%v0 \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t" "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t" "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v28, 0(%%r1,%1) \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v29, 16(%%r1,%1) \n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v30, 32(%%r1,%1) \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t" "vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t" "vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t" "vst %%v23, 48(%%r1,%2) \n\t"
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t" "vl %%v24, 64(%%r1,%1) \n\t"
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t" "vl %%v25, 80(%%r1,%1) \n\t"
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t" "vl %%v26, 96(%%r1,%1) \n\t"
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t" "vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t" "vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t" "vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t" "vl %%v19, 112(%%r1,%2) \n\t"
"vl %%v27,112(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t" "vfmdb %%v28,%%v24,%%v0 \n\t"
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t" "vfmdb %%v29,%%v25,%%v0 \n\t"
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t" "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vl %%v19,112(%%r1,%[ptr_y]) \n\t" "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v28, 64(%%r1,%1) \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v29, 80(%%r1,%1) \n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v30, 96(%%r1,%1) \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t" "vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t" "vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t" "vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t" "vl %%v24, 128(%%r1,%1) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t" "vl %%v25, 144(%%r1,%1) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t" "vl %%v26, 160(%%r1,%1) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t" "vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t" "vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t" "vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t" "vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t" "vfmdb %%v28,%%v24,%%v0 \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t" "vfmdb %%v29,%%v25,%%v0 \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t" "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t" "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v28, 128(%%r1,%1) \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v29, 144(%%r1,%1) \n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v30, 160(%%r1,%1) \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t" "vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t" "vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t" "vst %%v23, 176(%%r1,%2) \n\t"
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t" "vl %%v24, 192(%%r1,%1) \n\t"
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t" "vl %%v25, 208(%%r1,%1) \n\t"
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t" "vl %%v26, 224(%%r1,%1) \n\t"
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t" "vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t" "vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t" "vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t" "vl %%v19, 240(%%r1,%2) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t" "vfmdb %%v28,%%v24,%%v0 \n\t"
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t" "vfmdb %%v29,%%v25,%%v0 \n\t"
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t" "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t" "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ /* 2nd parts*/
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v28, 192(%%r1,%1) \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v29, 208(%%r1,%1) \n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v30, 224(%%r1,%1) \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t" "vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t" "vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t" "vst %%v23, 240(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t" "agfi %%r1,256 \n\t"
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t" "brctg %%r0,0b "
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t" :
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t" :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
"la %%r1,256(%%r1) \n\t" );
"clgrjl %%r1,%[tmp],1b \n\t"
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
: "cc","r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
zrot_kernel_16(n1, x, y, c, s); FLOAT cosa,sina;
cosa=c;
sina=s;
zrot_kernel_16(n1, x, y, &cosa, &sina);
i=n1; i=n1;
ix=2*n1; ix=2*n1;
} }
@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }
} }
else else
{ {
@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }

View File

@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
/*
 * In-place complex scaling: every element of x is multiplied by alpha,
 * eight elements (128 bytes) per loop iteration. Each 16-byte vector is
 * treated as one element made of two doublewords, taken here to be the
 * interleaved (re, im) pair.
 *
 * n      element count; the loop runs n >> 3 times. brctg decrements the
 *        counter before testing it, so the loop body is entered at least
 *        once; assumes the caller passes n >= 8 (TODO confirm against
 *        call sites).
 * alpha  two FLOATs, operand %1: alpha[0] is used as the real part and
 *        alpha[1] as the imaginary part of the scale factor.
 * x      data to scale, operand %2; read and written in place.
 *
 * NOTE(review): the arithmetic uses the double-precision vector forms
 * (vfmdb/vfmadb), so this presumably requires FLOAT to be double -- confirm.
 */
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = { alpha[0], alpha[0] } */
"vlrepg %%v0,0(%1) \n\t"
/* v1 = { -alpha[1], alpha[1] }: load alpha[1] into element 0, negate it,
   then load alpha[1] into element 1. */
"vleg %%v1,8(%1),0 \n\t"
"wflcdb %%v1,%%v1 \n\t"
"vleg %%v1,8(%1),1 \n\t"
/* r0 = n / 8 = iteration count, r1 = 0 = byte offset into x. */
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch (code 2, store access) 1024 bytes ahead of the current offset. */
"pfd 2, 1024(%%r1,%2) \n\t"
/* Load eight elements: v16..v23 = { re, im } each. */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* v24..v31 = the same elements with the two doublewords swapped: { im, re }. */
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"vpdi %%v28,%%v20,%%v20,4 \n\t"
"vpdi %%v29,%%v21,%%v21,4 \n\t"
"vpdi %%v30,%%v22,%%v22,4 \n\t"
"vpdi %%v31,%%v23,%%v23,4 \n\t"
/* First half of the product: { re*alpha[0], im*alpha[0] }. */
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
/* Add the swapped half times v1, giving
   { re*alpha[0] - im*alpha[1], im*alpha[0] + re*alpha[1] }. */
"vfmadb %%v16,%%v24,%%v1,%%v16 \n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17 \n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18 \n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19 \n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20 \n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21 \n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22 \n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23 \n\t"
/* Store the eight results back over the inputs. */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
/* Advance the offset by 128 bytes, decrement the counter, loop while non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
/* r0/r1 and the listed vector registers are scratch; "memory" covers the
   in-place writes to x. */
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vleg %%v0,8(%1),0 \n\t"
"wflcdb %%v0,%%v0 \n\t"
"vleg %%v0,8(%1),1 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { "vl %%v16,0(%%r1,%2) \n\t"
BLASLONG tempR1 ; "vl %%v17,16(%%r1,%2) \n\t"
__asm__ ( "vl %%v18,32(%%r1,%2) \n\t"
"pfd 2, 0(%[x_tmp]) \n\t" "vl %%v19,48(%%r1,%2) \n\t"
#if !defined(CONJ) "vl %%v20,64(%%r1,%2) \n\t"
"lgdr %[t1],%[alpha_r] \n\t" "vl %%v21,80(%%r1,%2) \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint "vl %%v22,96(%%r1,%2) \n\t"
"lgdr %[t1],%[alpha_i] \n\t" "vl %%v23,112(%%r1,%2) \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint "vpdi %%v16,%%v16,%%v16,4 \n\t"
"vflcdb %%v29,%%v29 \n\t" //complement both "vpdi %%v17,%%v17,%%v17,4 \n\t"
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} "vpdi %%v18,%%v18,%%v18,4 \n\t"
"vpdi %%v19,%%v19,%%v19,4 \n\t"
"vpdi %%v20,%%v20,%%v20,4 \n\t"
"vpdi %%v21,%%v21,%%v21,4 \n\t"
"vpdi %%v22,%%v22,%%v22,4 \n\t"
"vpdi %%v23,%%v23,%%v23,4 \n\t"
#else "vfmdb %%v16,%%v16,%%v0 \n\t"
"lgdr %[t1],%[alpha_i] \n\t" "vfmdb %%v17,%%v17,%%v0 \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint "vfmdb %%v18,%%v18,%%v0 \n\t"
"lgdr %[t1],%[alpha_r] \n\t" "vfmdb %%v19,%%v19,%%v0 \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint "vfmdb %%v20,%%v20,%%v0 \n\t"
"vflcdb %%v28,%%v28 \n\t" //complement both "vfmdb %%v21,%%v21,%%v0 \n\t"
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} "vfmdb %%v22,%%v22,%%v0 \n\t"
#endif "vfmdb %%v23,%%v23,%%v0 \n\t"
"xgr %[t1],%[t1] \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
"j 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmdb %%v16, %%v20, %%v28 \n\t"
"vfmdb %%v17, %%v21, %%v28 \n\t"
"vfmdb %%v18, %%v22, %%v28 \n\t"
"vfmdb %%v19, %%v23, %%v28 \n\t"
"vl %%v20, 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21, 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22, 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23, 112(%[t1],%[x_tmp]) \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" "agfi %%r1,128 \n\t"
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" "brctg %%r0,0b "
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" :
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
/*
 * In-place scaling of a contiguous vector x by the single value alpha[0].
 *
 * Only the doubleword at offset 0 of alpha is read by the asm (vlrepg);
 * alpha[1] is never referenced here. The value is replicated into both
 * doubleword lanes of v0, so every doubleword of x is multiplied by the
 * same factor. The zscal driver in this file picks this kernel in its
 * `da_i == 0` branch.
 *
 * n     - element count; the loop counter is n >> 3, and each iteration
 *         processes eight 16-byte vectors (128 bytes) of x.
 * alpha - pointer to at least one FLOAT (the cast below describes it as
 *         FLOAT[2]).
 * x     - data to scale, described to the compiler as FLOAT[n * 2].
 *
 * NOTE(review): vfmdb is a double-precision multiply, so this presumably
 * assumes FLOAT is double and x holds interleaved (re, im) pairs - confirm.
 * NOTE(review): brctg decrements r0 before testing it, so a count of zero
 * (n < 8) would not terminate the loop immediately; the visible caller
 * passes n1 = n & -8 only when n1 > 0.
 */
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = { alpha[0], alpha[0] } */
"vlrepg %%v0,0(%1) \n\t"
/* r0 = n / 8 (iteration count), r1 = 0 (running byte offset into x) */
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch hint (code 2) for the data 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%2) \n\t"
/* load 128 bytes of x into v16..v23 */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* multiply every lane by v0 */
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
/* write the scaled values back to the same locations */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
/* advance the offset by 128 bytes; decrement r0 and loop while it is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
/* no outputs: x is modified through the pointer and covered by the "memory" clobber */
:
/* %0 = n, %1 = alpha, %2 = x.
   NOTE(review): "ZQ"/"ZR" are s390 machine constraints; confirm against the GCC
   documentation that they produce operands usable as the base in d(x,b) addressing. */
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
/* r0/r1 are used as scratch (counter and offset); v0 and v16..v23 hold data */
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"la %[t1],64(%[t1] ) \n\t" "agfi %%r1,128 \n\t"
"2: \n\t" "brctg %%r0,0b "
"pfd 2, 256(%[t1],%[x_tmp]) \n\t" :
"vpdi %%v24 , %%v20, %%v20, 4 \n\t" :"r"(n),"ZR"((FLOAT (*)[n * 2])x)
"vpdi %%v25 , %%v21, %%v21, 4 \n\t" :"memory","cc","r0","r1","v24","v25","v26","v27"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t" );
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmdb %%v30, %%v20, %%v28 \n\t"
"vfmdb %%v31, %%v21, %%v28 \n\t"
"vfmdb %%v6, %%v22, %%v28 \n\t"
"vfmdb %%v7, %%v23, %%v28 \n\t"
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"
"vst %%v30 , 0(%[t1],%[x_tmp]) \n\t"
"vst %%v31 , 16(%[t1],%[x_tmp]) \n\t"
"vst %%v6 , 32(%[t1],%[x_tmp]) \n\t"
"vst %%v7 , 48(%[t1],%[x_tmp]) \n\t"
"la %[t1],64(%[t1] ) \n\t"
"clgrjl %[t1],%[tmp],1b \n\t"
//----------------------------------------------------------------------
"vfmdb %%v16, %%v20, %%v28 \n\t"
"vfmdb %%v17, %%v21, %%v28 \n\t"
"vfmdb %%v18, %%v22, %%v28 \n\t"
"vfmdb %%v19, %%v23, %%v28 \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"
: [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
: [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
: "cc", "v6","v7", "v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) {
__asm__ ( "pfd 2, 0(%1) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint
"vflcdb %%v16,%%v16 \n\t" //complement both
"vlvgg %%v16,%%r0,0 \n\t" //restore 1st
"vlr %%v17 ,%%v16 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"vl %%v24, 0(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v16 \n\t"
"vsteg %%v24, 0(%[x_ptr]),1 \n\t"
"vsteg %%v24, 8(%[x_ptr]),0 \n\t"
"vl %%v25, 16(%[x_ptr]) \n\t"
"vfmdb %%v25,%%v25,%%v17 \n\t"
"vsteg %%v25, 16(%[x_ptr]),1 \n\t"
"vsteg %%v25, 24(%[x_ptr]),0 \n\t"
"vl %%v26, 32(%[x_ptr]) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vsteg %%v26, 32(%[x_ptr]),1 \n\t"
"vsteg %%v26, 40(%[x_ptr]),0 \n\t"
"vl %%v27, 48(%[x_ptr]) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vsteg %%v27, 48(%[x_ptr]),1 \n\t"
"vsteg %%v27, 56(%[x_ptr]),0 \n\t"
"vl %%v28, 64(%[x_ptr]) \n\t"
"vfmdb %%v28,%%v28,%%v16 \n\t"
"vsteg %%v28, 64(%[x_ptr]),1 \n\t"
"vsteg %%v28, 72(%[x_ptr]),0 \n\t"
"vl %%v29, 80(%[x_ptr]) \n\t"
"vfmdb %%v29,%%v29,%%v17 \n\t"
"vsteg %%v29, 80(%[x_ptr]),1 \n\t"
"vsteg %%v29, 88(%[x_ptr]),0 \n\t"
"vl %%v30, 96(%[x_ptr]) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vsteg %%v30, 96(%[x_ptr]),1 \n\t"
"vsteg %%v30, 104(%[x_ptr]),0 \n\t"
"vl %%v31, 112(%[x_ptr]) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vsteg %%v31, 112(%[x_ptr]),1 \n\t"
"vsteg %%v31, 120(%[x_ptr]),0 \n\t"
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da_i)
:"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t" {
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v18,%%r0,%%r0 \n\t"
"vlr %%v19,%%v18 \n\t"
"vlr %%v16,%%v18 \n\t"
"vlr %%v17,%%v18 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"vl %%v24, 0(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v18 \n\t"
"vst %%v24, 0(%[x_ptr]) \n\t"
"vl %%v25, 16(%[x_ptr]) \n\t"
"vfmdb %%v25,%%v25,%%v19 \n\t"
"vst %%v25, 16(%[x_ptr]) \n\t"
"vl %%v26, 32(%[x_ptr]) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vst %%v26, 32(%[x_ptr]) \n\t"
"vl %%v27, 48(%[x_ptr]) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vst %%v27, 48(%[x_ptr]) \n\t"
"vl %%v28, 64(%[x_ptr]) \n\t"
"vfmdb %%v28,%%v28,%%v18 \n\t"
"vst %%v28, 64(%[x_ptr]) \n\t"
"vl %%v29, 80(%[x_ptr]) \n\t"
"vfmdb %%v29,%%v29,%%v19 \n\t"
"vst %%v29, 80(%[x_ptr]) \n\t"
"vl %%v30, 96(%[x_ptr]) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vst %%v30, 96(%[x_ptr]) \n\t"
"vl %%v31,112(%[x_ptr]) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vst %%v31,112(%[x_ptr]) \n\t"
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da_r)
: "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Fill a contiguous vector x with zeros, 128 bytes per loop iteration.
 *
 * n - element count; the end address is computed as x + n * 16 bytes.
 * x - destination, described to the compiler as double[2 * n] and
 *     advanced in place by the asm (hence the "+&a" operand).
 *
 * The loop condition is tested at the bottom, so the body executes at
 * least once: 128 bytes are written even when n is 0. The zscal driver in
 * this file calls it with n1 = n & -8 and only when n1 > 0.
 */
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
__asm__ ( "pfd 2, 0(%[x_ptr]) \n\t"
/* v24..v27 = 0; these four registers are stored twice per iteration */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = x_ptr + (n << 4): address one past the last byte to clear */
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch hint (code 2) 256 bytes ahead of the current position */
"pfd 2, 256( %[x_ptr]) \n\t"
/* eight 16-byte stores = 128 bytes of zeros */
"vst %%v24, 0( %[x_ptr]) \n\t"
"vst %%v25, 16( %[x_ptr]) \n\t"
"vst %%v26, 32( %[x_ptr]) \n\t"
"vst %%v27, 48( %[x_ptr]) \n\t"
"vst %%v24, 64( %[x_ptr]) \n\t"
"vst %%v25, 80( %[x_ptr]) \n\t"
"vst %%v26, 96( %[x_ptr]) \n\t"
"vst %%v27,112( %[x_ptr]) \n\t"
/* x_ptr += 128; repeat while x_ptr is (unsigned) below the end address in r0 */
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
/* outputs: the array contents (read-write memory operand) and the moving pointer */
: [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x)
: [n] "r"(n)
/* r0 holds the end address; v24..v27 hold the zero pattern */
:"cc" ,"r0","v24","v25","v26","v27"
);
}
static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) {
BLASLONG i; BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x; BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3; FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) { for (i = 0; i < n; i += 4)
{
t0 = da_r * x[0] - da_i * x[1]; t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS
x[inc_x3] = t3; x[inc_x3] = t3;
x += 4 * inc_x; x += 4 * inc_x;
} }
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0, j = 0; BLASLONG i = 0, j = 0;
FLOAT temp0; FLOAT temp0;
FLOAT temp1; FLOAT temp1;
FLOAT alpha[2] __attribute__ ((aligned(16)));
if (inc_x != 1) { if (inc_x != 1) {
inc_x <<= 1; inc_x <<= 1;
@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
} else { } else {
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;
if (n1 > 0) { if (n1 > 0) {
zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); alpha[0] = da_r;
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1; j = n1;
i = n1 * inc_x; i = n1 * inc_x;
} }
@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;
if (n1 > 0) { if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
if (da_r == 0.0) if (da_r == 0.0)
if (da_i == 0) if (da_i == 0)
zscal_kernel_8_zero(n1, x); zscal_kernel_8_zero(n1, x);
else else
zscal_kernel_8_zero_r(n1, da_i, x); zscal_kernel_8_zero_r(n1, alpha, x);
else else
if (da_i == 0) if (da_i == 0)
zscal_kernel_8_zero_i(n1, da_r, x); zscal_kernel_8_zero_i(n1, alpha, x);
else else
zscal_kernel_8(n1, da_r,da_i, x); zscal_kernel_8(n1, alpha, x);
i = n1 << 1; i = n1 << 1;
j = n1; j = n1;
@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0); return (0);
} }

View File

@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
#if defined(Z13_SWAP_A)
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{ {
__asm__ volatile( __asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t" "srlg %%r0,%0,4 \n\t"
"pfd 2, 0(%[ptr_y]) \n\t" "xgr %%r1,%%r1 \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t" "0: \n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%1) \n\t"
".align 16 \n\t" "pfd 2, 1024(%%r1,%2) \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t" "vl %%v16, 0(%%r1,%1) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t" "vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t" "vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t" "vl %%v20, 64(%%r1,%1) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t" "vl %%v21, 80(%%r1,%1) \n\t"
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t" "vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t" "vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t" "vl %%v1, 16(%%r1,%2) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t" "vl %%v2, 32(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t" "vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t" "vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t" "vl %%v1, 144(%%r1,%2) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t" "vl %%v2, 160(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t" "vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t" "vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t" "vl %%v6, 224(%%r1,%2) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t" "vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t" "vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t" "vst %%v2, 160(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t" "vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t" "vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t" "vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t" "vst %%v7, 240(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#else
/*
 * Exchange the contents of two contiguous vectors x and y, 256 bytes of
 * each per loop iteration.
 *
 * n - element count; the loop counter is n >> 4 (n itself is overwritten
 *     by the asm through the "+&r" operand).
 * x - first vector, described to the compiler as double[2 * n].
 * y - second vector, described to the compiler as double[2 * n].
 *
 * Per iteration: 256 bytes of x are parked in v16..v31, the matching 256
 * bytes of y are copied into x in two 128-byte halves through v0..v7, and
 * finally the parked x data is stored into y.
 *
 * NOTE(review): brctg decrements the counter before testing it, so this
 * presumably requires n >= 16 on entry - confirm against the caller.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
/* prefetch hints (code 2) for the start of both vectors */
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
/* n_tmp = n / 16 (iteration count), r1 = 0 (running byte offset) */
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch hints 256 bytes ahead, i.e. the next iteration's data */
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
/* park 256 bytes of x in v16..v31 */
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
/* first 128 bytes of y -> v0..v7 -> x */
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
/* second 128 bytes of y -> v0..v7 -> x */
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
/* parked x data (v16..v31) -> y */
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
/* offset += 256; decrement the counter and loop while it is non-zero */
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
/* outputs: both arrays are read and written; n is consumed as the loop counter */
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
/* r1 is the byte offset; v0..v7 and v16..v31 hold the data in flight */
: "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
}
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{ {
BLASLONG i=0; BLASLONG i=0;