Merge pull request #1991 from maamountki/z14
[ZARCH] Z14 Support, BLAS 1/2 single precision implementations
This commit is contained in:
commit
42df9efa0c
|
|
@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
|
|||
FCOMMON_OPT += -march=z13 -mzvector
|
||||
endif
|
||||
|
||||
# When CORE is Z14, append -march=z14 and -mzvector to both the C and the
# Fortran common option lists.
ifeq ($(CORE), Z14)
|
||||
CCOMMON_OPT += -march=z14 -mzvector
|
||||
FCOMMON_OPT += -march=z14 -mzvector
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -27,9 +27,9 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
|
|
@ -64,10 +64,8 @@ int detect(void)
|
|||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
/* detect z14, but fall back to z13 */
|
||||
if (strstr(p, "3906")) return CPU_Z13;
|
||||
if (strstr(p, "3907")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,18 @@
|
|||
SAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = damax_z13.c
|
||||
CAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = zamax_z13.c
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = damin_z13.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = zamin_z13.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = dmax_z13.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = dmin_z13.c
|
||||
|
||||
ISAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
|
|
@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c
|
|||
IZAMINKERNEL = izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = idmax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = idmin.c
|
||||
|
||||
SASUMKERNEL = ../arm/asum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
|
|
|
|||
|
|
@ -0,0 +1,146 @@
|
|||
SAMAXKERNEL = samax.c
|
||||
DAMAXKERNEL = damax.c
|
||||
CAMAXKERNEL = camax.c
|
||||
ZAMAXKERNEL = zamax.c
|
||||
|
||||
SAMINKERNEL = samin.c
|
||||
DAMINKERNEL = damin.c
|
||||
CAMINKERNEL = camin.c
|
||||
ZAMINKERNEL = zamin.c
|
||||
|
||||
SMAXKERNEL = smax.c
|
||||
DMAXKERNEL = dmax.c
|
||||
|
||||
SMINKERNEL = smin.c
|
||||
DMINKERNEL = dmin.c
|
||||
|
||||
ISAMAXKERNEL = isamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ICAMAXKERNEL = icamax.c
|
||||
IZAMAXKERNEL = izamax.c
|
||||
|
||||
ISAMINKERNEL = isamin.c
|
||||
IDAMINKERNEL = idamin.c
|
||||
ICAMINKERNEL = icamin.c
|
||||
IZAMINKERNEL = izamin.c
|
||||
|
||||
ISMAXKERNEL = ismax.c
|
||||
IDMAXKERNEL = idmax.c
|
||||
|
||||
ISMINKERNEL = ismin.c
|
||||
IDMINKERNEL = idmin.c
|
||||
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
CDOTKERNEL = cdot.c
|
||||
ZDOTKERNEL = zdot.c
|
||||
DSDOTKERNEL = dsdot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
CSCALKERNEL = cscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
|
||||
SGEMVNKERNEL = sgemv_n_4.c
|
||||
DGEMVNKERNEL = dgemv_n_4.c
|
||||
CGEMVNKERNEL = cgemv_n_4.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
|
||||
SGEMVTKERNEL = sgemv_t_4.c
|
||||
DGEMVTKERNEL = dgemv_t_4.c
|
||||
CGEMVTKERNEL = cgemv_t_4.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
STRMMKERNEL = strmm8x4V.S
|
||||
DTRMMKERNEL = trmm8x4V.S
|
||||
CTRMMKERNEL = ctrmm4x4V.S
|
||||
ZTRMMKERNEL = ztrmm4x4V.S
|
||||
|
||||
# GEMM kernels and their packing (copy) sources for all four precisions.
# Every *COPYOBJ name is spelled with $(TSUFFIX).$(SUFFIX) so that the
# single, double and complex entries follow the same naming scheme as the
# ZGEMM entries instead of hard-coding ".o".
# NOTE(review): assumes TSUFFIX is empty and SUFFIX is "o" in a plain
# single-target build, which keeps the resulting names unchanged - confirm.
SGEMMKERNEL = strmm8x4V.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL = gemm8x4V.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL = ctrmm4x4V.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL = ztrmm4x4V.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,241 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * camax_kernel_32 - vector kernel returning the largest |re| + |im| found in
 * a contiguous run of single-precision complex values.
 *
 * n  number of complex elements. The loop counter is n >> 5 and every pass
 *    advances the byte offset by 256 (32 complex elements), so n has to be a
 *    positive multiple of 32; CNAME only calls this with n & -32 when that
 *    value is greater than zero.
 * x  interleaved (re, im) pairs; 2 * n floats are read, which is why the
 *    operand cast below spans n * 2 elements (same convention as the other
 *    complex kernels in this directory).
 *
 * The value is produced in %f0 and copied to the output operand.
 */
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* Seed the running maximum (v0) with |re| + |im| of elements 0..3:
           first floats of each pair go to v0, second floats to v16. */
        "vlef %%v0,0(%2),0 \n\t"
        "vlef %%v16,4(%2),0 \n\t"
        "vlef %%v0,8(%2),1 \n\t"
        "vlef %%v16,12(%2),1 \n\t"
        "vlef %%v0,16(%2),2 \n\t"
        "vlef %%v16,20(%2),2 \n\t"
        "vlef %%v0,24(%2),3 \n\t"
        "vlef %%v16,28(%2),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v16,%%v16 \n\t"
        "vfasb %%v0,%%v0,%%v16 \n\t"
        /* v1 = byte selector {0-3, 8-11, 16-19, 24-27}; with vperm it picks
           the first float of every pair out of two concatenated vectors. */
        "vleib %%v1,0,0 \n\t"
        "vleib %%v1,1,1 \n\t"
        "vleib %%v1,2,2 \n\t"
        "vleib %%v1,3,3 \n\t"
        "vleib %%v1,8,4 \n\t"
        "vleib %%v1,9,5 \n\t"
        "vleib %%v1,10,6 \n\t"
        "vleib %%v1,11,7 \n\t"
        "vleib %%v1,16,8 \n\t"
        "vleib %%v1,17,9 \n\t"
        "vleib %%v1,18,10 \n\t"
        "vleib %%v1,19,11 \n\t"
        "vleib %%v1,24,12 \n\t"
        "vleib %%v1,25,13 \n\t"
        "vleib %%v1,26,14 \n\t"
        "vleib %%v1,27,15 \n\t"
        /* r0 = n / 32 loop passes, r1 = running byte offset into x. */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* Eight groups of four complex values each: vpkg gathers the second
           float of every pair (odd register), vperm the first (even one). */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v2,16(%%r1,%2) \n\t"
        "vpkg %%v17,%%v16,%%v2 \n\t"
        "vperm %%v16,%%v16,%%v2,%%v1 \n\t"

        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v2,48(%%r1,%2) \n\t"
        "vpkg %%v19,%%v18,%%v2 \n\t"
        "vperm %%v18,%%v18,%%v2,%%v1 \n\t"

        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v2,80(%%r1,%2) \n\t"
        "vpkg %%v21,%%v20,%%v2 \n\t"
        "vperm %%v20,%%v20,%%v2,%%v1 \n\t"

        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v2,112(%%r1,%2) \n\t"
        "vpkg %%v23,%%v22,%%v2 \n\t"
        "vperm %%v22,%%v22,%%v2,%%v1 \n\t"

        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v2,144(%%r1,%2) \n\t"
        "vpkg %%v25,%%v24,%%v2 \n\t"
        "vperm %%v24,%%v24,%%v2,%%v1 \n\t"

        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v2,176(%%r1,%2) \n\t"
        "vpkg %%v27,%%v26,%%v2 \n\t"
        "vperm %%v26,%%v26,%%v2,%%v1 \n\t"

        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v2,208(%%r1,%2) \n\t"
        "vpkg %%v29,%%v28,%%v2 \n\t"
        "vperm %%v28,%%v28,%%v2,%%v1 \n\t"

        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v2,240(%%r1,%2) \n\t"
        "vpkg %%v31,%%v30,%%v2 \n\t"
        "vperm %%v30,%%v30,%%v2,%%v1 \n\t"

        /* Absolute value of every component. */
        "vflpsb %%v16,%%v16 \n\t"
        "vflpsb %%v17,%%v17 \n\t"
        "vflpsb %%v18,%%v18 \n\t"
        "vflpsb %%v19,%%v19 \n\t"
        "vflpsb %%v20,%%v20 \n\t"
        "vflpsb %%v21,%%v21 \n\t"
        "vflpsb %%v22,%%v22 \n\t"
        "vflpsb %%v23,%%v23 \n\t"
        "vflpsb %%v24,%%v24 \n\t"
        "vflpsb %%v25,%%v25 \n\t"
        "vflpsb %%v26,%%v26 \n\t"
        "vflpsb %%v27,%%v27 \n\t"
        "vflpsb %%v28,%%v28 \n\t"
        "vflpsb %%v29,%%v29 \n\t"
        "vflpsb %%v30,%%v30 \n\t"
        "vflpsb %%v31,%%v31 \n\t"

        /* |re| + |im| per element. */
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v18,%%v18,%%v19 \n\t"
        "vfasb %%v20,%%v20,%%v21 \n\t"
        "vfasb %%v22,%%v22,%%v23 \n\t"
        "vfasb %%v24,%%v24,%%v25 \n\t"
        "vfasb %%v26,%%v26,%%v27 \n\t"
        "vfasb %%v28,%%v28,%%v29 \n\t"
        "vfasb %%v30,%%v30,%%v31 \n\t"

        /* Pairwise maximum tree down to v16, then fold into v0. */
        "vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
        "vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
        "vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
        "vfmaxsb %%v22,%%v22,%%v30,0 \n\t"

        "vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
        "vfmaxsb %%v18,%%v18,%%v22,0 \n\t"

        "vfmaxsb %%v16,%%v16,%%v18,0 \n\t"

        "vfmaxsb %%v0,%%v0,%%v16,0 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 into lane 0; only
           lanes 0 and 2 are consumed after the shift step. */
        "veslg %%v16,%%v0,32 \n\t"
        "vfmaxsb %%v0,%%v0,%%v16,0 \n\t"

        "vrepf %%v16,%%v0,2 \n\t"
        "wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
        "ler %0,%%f0 "
        :"=f"(amax)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amax;
}
|
||||
|
||||
/*
 * Maximum of |re| + |im| over n complex elements of x taken with stride
 * inc_x (counted in complex elements).
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. For unit stride the largest
 * multiple-of-32 prefix is handed to camax_kernel_32 and the remainder is
 * scanned here; any other stride is scanned entirely in C.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG seen;   /* number of elements already folded into best */
    BLASLONG pos;    /* index of the real part of the next element */
    BLASLONG step;   /* distance between consecutive elements, in floats */

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        const BLASLONG vec_n = n & -32;

        if (vec_n > 0) {
            best = camax_kernel_32(vec_n, x);
            seen = vec_n;
        } else {
            /* Fewer than 32 elements: start from element 0. */
            best = CABS1(x, 0);
            seen = 1;
        }

        for (pos = 2 * seen; seen < n; seen++, pos += 2) {
            const FLOAT mag = CABS1(x, pos);
            if (mag > best)
                best = mag;
        }
        return best;
    }

    /* Strided input: visit every element in order, starting at element 0. */
    step = 2 * inc_x;
    best = CABS1(x, 0);
    for (seen = 0, pos = 0; seen < n; seen++, pos += step) {
        const FLOAT mag = CABS1(x, pos);
        if (mag > best)
            best = mag;
    }
    return best;
}
|
||||
|
|
@ -0,0 +1,241 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * camin_kernel_32 - vector kernel returning the smallest |re| + |im| found
 * in a contiguous run of single-precision complex values.
 *
 * n  number of complex elements. The loop counter is n >> 5 and every pass
 *    advances the byte offset by 256 (32 complex elements), so n has to be a
 *    positive multiple of 32; CNAME only calls this with n & -32 when that
 *    value is greater than zero.
 * x  interleaved (re, im) pairs; 2 * n floats are read, which is why the
 *    operand cast below spans n * 2 elements (same convention as the other
 *    complex kernels in this directory).
 *
 * The value is produced in %f0 and copied to the output operand.
 */
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* Seed the running minimum (v0) with |re| + |im| of elements 0..3:
           first floats of each pair go to v0, second floats to v16. */
        "vlef %%v0,0(%2),0 \n\t"
        "vlef %%v16,4(%2),0 \n\t"
        "vlef %%v0,8(%2),1 \n\t"
        "vlef %%v16,12(%2),1 \n\t"
        "vlef %%v0,16(%2),2 \n\t"
        "vlef %%v16,20(%2),2 \n\t"
        "vlef %%v0,24(%2),3 \n\t"
        "vlef %%v16,28(%2),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v16,%%v16 \n\t"
        "vfasb %%v0,%%v0,%%v16 \n\t"
        /* v1 = byte selector {0-3, 8-11, 16-19, 24-27}; with vperm it picks
           the first float of every pair out of two concatenated vectors. */
        "vleib %%v1,0,0 \n\t"
        "vleib %%v1,1,1 \n\t"
        "vleib %%v1,2,2 \n\t"
        "vleib %%v1,3,3 \n\t"
        "vleib %%v1,8,4 \n\t"
        "vleib %%v1,9,5 \n\t"
        "vleib %%v1,10,6 \n\t"
        "vleib %%v1,11,7 \n\t"
        "vleib %%v1,16,8 \n\t"
        "vleib %%v1,17,9 \n\t"
        "vleib %%v1,18,10 \n\t"
        "vleib %%v1,19,11 \n\t"
        "vleib %%v1,24,12 \n\t"
        "vleib %%v1,25,13 \n\t"
        "vleib %%v1,26,14 \n\t"
        "vleib %%v1,27,15 \n\t"
        /* r0 = n / 32 loop passes, r1 = running byte offset into x. */
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* Eight groups of four complex values each: vpkg gathers the second
           float of every pair (odd register), vperm the first (even one). */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v2,16(%%r1,%2) \n\t"
        "vpkg %%v17,%%v16,%%v2 \n\t"
        "vperm %%v16,%%v16,%%v2,%%v1 \n\t"

        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v2,48(%%r1,%2) \n\t"
        "vpkg %%v19,%%v18,%%v2 \n\t"
        "vperm %%v18,%%v18,%%v2,%%v1 \n\t"

        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v2,80(%%r1,%2) \n\t"
        "vpkg %%v21,%%v20,%%v2 \n\t"
        "vperm %%v20,%%v20,%%v2,%%v1 \n\t"

        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v2,112(%%r1,%2) \n\t"
        "vpkg %%v23,%%v22,%%v2 \n\t"
        "vperm %%v22,%%v22,%%v2,%%v1 \n\t"

        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v2,144(%%r1,%2) \n\t"
        "vpkg %%v25,%%v24,%%v2 \n\t"
        "vperm %%v24,%%v24,%%v2,%%v1 \n\t"

        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v2,176(%%r1,%2) \n\t"
        "vpkg %%v27,%%v26,%%v2 \n\t"
        "vperm %%v26,%%v26,%%v2,%%v1 \n\t"

        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v2,208(%%r1,%2) \n\t"
        "vpkg %%v29,%%v28,%%v2 \n\t"
        "vperm %%v28,%%v28,%%v2,%%v1 \n\t"

        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v2,240(%%r1,%2) \n\t"
        "vpkg %%v31,%%v30,%%v2 \n\t"
        "vperm %%v30,%%v30,%%v2,%%v1 \n\t"

        /* Absolute value of every component. */
        "vflpsb %%v16,%%v16 \n\t"
        "vflpsb %%v17,%%v17 \n\t"
        "vflpsb %%v18,%%v18 \n\t"
        "vflpsb %%v19,%%v19 \n\t"
        "vflpsb %%v20,%%v20 \n\t"
        "vflpsb %%v21,%%v21 \n\t"
        "vflpsb %%v22,%%v22 \n\t"
        "vflpsb %%v23,%%v23 \n\t"
        "vflpsb %%v24,%%v24 \n\t"
        "vflpsb %%v25,%%v25 \n\t"
        "vflpsb %%v26,%%v26 \n\t"
        "vflpsb %%v27,%%v27 \n\t"
        "vflpsb %%v28,%%v28 \n\t"
        "vflpsb %%v29,%%v29 \n\t"
        "vflpsb %%v30,%%v30 \n\t"
        "vflpsb %%v31,%%v31 \n\t"

        /* |re| + |im| per element. */
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v18,%%v18,%%v19 \n\t"
        "vfasb %%v20,%%v20,%%v21 \n\t"
        "vfasb %%v22,%%v22,%%v23 \n\t"
        "vfasb %%v24,%%v24,%%v25 \n\t"
        "vfasb %%v26,%%v26,%%v27 \n\t"
        "vfasb %%v28,%%v28,%%v29 \n\t"
        "vfasb %%v30,%%v30,%%v31 \n\t"

        /* Pairwise minimum tree down to v16, then fold into v0. */
        "vfminsb %%v16,%%v16,%%v24,0 \n\t"
        "vfminsb %%v18,%%v18,%%v26,0 \n\t"
        "vfminsb %%v20,%%v20,%%v28,0 \n\t"
        "vfminsb %%v22,%%v22,%%v30,0 \n\t"

        "vfminsb %%v16,%%v16,%%v20,0 \n\t"
        "vfminsb %%v18,%%v18,%%v22,0 \n\t"

        "vfminsb %%v16,%%v16,%%v18,0 \n\t"

        "vfminsb %%v0,%%v0,%%v16,0 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 into lane 0; only
           lanes 0 and 2 are consumed after the shift step. */
        "veslg %%v16,%%v0,32 \n\t"
        "vfminsb %%v0,%%v0,%%v16,0 \n\t"

        "vrepf %%v16,%%v0,2 \n\t"
        "wfminsb %%v0,%%v0,%%v16,0 \n\t"
        "ler %0,%%f0 "
        :"=f"(amin)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amin;
}
|
||||
|
||||
/*
 * Minimum of |re| + |im| over n complex elements of x taken with stride
 * inc_x (counted in complex elements).
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. For unit stride the largest
 * multiple-of-32 prefix is handed to camin_kernel_32 and the remainder is
 * scanned here; any other stride is scanned entirely in C.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG seen;   /* number of elements already folded into best */
    BLASLONG pos;    /* index of the real part of the next element */
    BLASLONG step;   /* distance between consecutive elements, in floats */

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        const BLASLONG vec_n = n & -32;

        if (vec_n > 0) {
            best = camin_kernel_32(vec_n, x);
            seen = vec_n;
        } else {
            /* Fewer than 32 elements: start from element 0. */
            best = CABS1(x, 0);
            seen = 1;
        }

        for (pos = 2 * seen; seen < n; seen++, pos += 2) {
            const FLOAT mag = CABS1(x, pos);
            if (mag < best)
                best = mag;
        }
        return best;
    }

    /* Strided input: visit every element in order, starting at element 0. */
    step = 2 * inc_x;
    best = CABS1(x, 0);
    for (seen = 0, pos = 0; seen < n; seen++, pos += step) {
        const FLOAT mag = CABS1(x, pos);
        if (mag < best)
            best = mag;
    }
    return best;
}
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * casum_kernel_32 - vector kernel summing the absolute values of all real
 * and imaginary components of n contiguous single-precision complex values.
 *
 * The loop counter is n >> 5 and each pass consumes 256 bytes (32 complex
 * elements), so n must be a positive multiple of 32; CNAME passes n & -32
 * and only when that is greater than zero.
 *
 * Four vector accumulators (v0..v3) are kept in flight and combined after
 * the loop; the final value is produced in %f0.
 */
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT total;

    __asm__ (
        /* Clear the four partial-sum accumulators. */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v2 \n\t"
        "vzero %%v3 \n\t"
        /* r0 = n / 32 loop passes, r1 = running byte offset into x. */
        "srlg %%r0,%[n],5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%[x]) \n\t"
        /* First 128 bytes of the pass. */
        "vl %%v16, 0(%%r1,%[x]) \n\t"
        "vl %%v17, 16(%%r1,%[x]) \n\t"
        "vl %%v18, 32(%%r1,%[x]) \n\t"
        "vl %%v19, 48(%%r1,%[x]) \n\t"
        "vl %%v20, 64(%%r1,%[x]) \n\t"
        "vl %%v21, 80(%%r1,%[x]) \n\t"
        "vl %%v22, 96(%%r1,%[x]) \n\t"
        "vl %%v23, 112(%%r1,%[x]) \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        "vfasb %%v0,%%v0,%%v16 \n\t"
        "vfasb %%v1,%%v1,%%v17 \n\t"
        "vfasb %%v2,%%v2,%%v18 \n\t"
        "vfasb %%v3,%%v3,%%v19 \n\t"
        "vfasb %%v0,%%v0,%%v20 \n\t"
        "vfasb %%v1,%%v1,%%v21 \n\t"
        "vfasb %%v2,%%v2,%%v22 \n\t"
        "vfasb %%v3,%%v3,%%v23 \n\t"

        /* Second 128 bytes of the pass. */
        "vl %%v16, 128(%%r1,%[x]) \n\t"
        "vl %%v17, 144(%%r1,%[x]) \n\t"
        "vl %%v18, 160(%%r1,%[x]) \n\t"
        "vl %%v19, 176(%%r1,%[x]) \n\t"
        "vl %%v20, 192(%%r1,%[x]) \n\t"
        "vl %%v21, 208(%%r1,%[x]) \n\t"
        "vl %%v22, 224(%%r1,%[x]) \n\t"
        "vl %%v23, 240(%%r1,%[x]) \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        "vfasb %%v0,%%v0,%%v16 \n\t"
        "vfasb %%v1,%%v1,%%v17 \n\t"
        "vfasb %%v2,%%v2,%%v18 \n\t"
        "vfasb %%v3,%%v3,%%v19 \n\t"
        "vfasb %%v0,%%v0,%%v20 \n\t"
        "vfasb %%v1,%%v1,%%v21 \n\t"
        "vfasb %%v2,%%v2,%%v22 \n\t"
        "vfasb %%v3,%%v3,%%v23 \n\t"

        "agfi %%r1,256 \n\t"
        "brctg %%r0,0b \n\t"
        /* Combine the accumulators, then add the four lanes of v0 together. */
        "vfasb %%v0,%%v0,%%v1 \n\t"
        "vfasb %%v0,%%v0,%%v2 \n\t"
        "vfasb %%v0,%%v0,%%v3 \n\t"
        "veslg %%v1,%%v0,32 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        "vrepf %%v1,%%v0,2 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "ler %[total],%%f0 "
        : [total] "=f"(total)
        : [n] "r"(n), [x] "ZR"((const FLOAT (*)[n * 2])x)
        : "memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
    );

    return total;
}
|
||||
|
||||
/*
 * Sum of |re| + |im| over n complex elements of x taken with stride inc_x
 * (counted in complex elements).
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. For unit stride the largest
 * multiple-of-32 prefix is summed by casum_kernel_32; whatever is left, and
 * every element of a strided vector, is accumulated by the scalar loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG done = 0;   /* elements already added to total */
    BLASLONG pos = 0;    /* index of the real part of the next element */
    BLASLONG step;       /* distance between elements, in floats */

    if (n <= 0 || inc_x <= 0)
        return total;

    if (inc_x == 1) {
        const BLASLONG vec_n = n & -32;

        if (vec_n > 0) {
            total = casum_kernel_32(vec_n, x);
            done = vec_n;
            pos = 2 * vec_n;
        }
    }

    /* Equals 2 for the unit-stride tail. */
    step = 2 * inc_x;
    for (; done < n; done++, pos += step)
        total += ABS(x[pos]) + ABS(x[pos+1]);

    return total;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * caxpy_kernel_16 - vector kernel for y += alpha * x (or alpha * conj(x)
 * when CONJ is defined) on contiguous single-precision complex data.
 *
 * n      number of complex elements; the loop counter is n >> 4 and each
 *        pass handles 128 bytes (16 elements) of x and y, so n must be a
 *        positive multiple of 16 (CNAME passes n & -16 when non-zero).
 * x, y   interleaved (re, im) pairs; y is updated in place.
 * alpha  two floats: alpha[0] = real part, alpha[1] = imaginary part.
 *
 * v0 multiplies x as loaded; v1 multiplies x with the two floats of every
 * pair swapped (verllg by 32). The sign pattern placed in v0/v1 selects the
 * plain or the conjugated product.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    __asm__ volatile(
#if !defined(CONJ)
        /* v0 = alpha[0] in every lane; v1 = { -alpha[1], alpha[1], ... }. */
        "vlrepf %%v0,0(%[alpha]) \n\t"
        "vlef %%v1,4(%[alpha]),0 \n\t"
        "vlef %%v1,4(%[alpha]),2 \n\t"
        "vflcsb %%v1,%%v1 \n\t"
        "vlef %%v1,4(%[alpha]),1 \n\t"
        "vlef %%v1,4(%[alpha]),3 \n\t"
#else
        /* v0 = { alpha[0], -alpha[0], ... }; v1 = alpha[1] in every lane. */
        "vlef %%v0,0(%[alpha]),1 \n\t"
        "vlef %%v0,0(%[alpha]),3 \n\t"
        "vflcsb %%v0,%%v0 \n\t"
        "vlef %%v0,0(%[alpha]),0 \n\t"
        "vlef %%v0,0(%[alpha]),2 \n\t"
        "vlrepf %%v1,4(%[alpha]) \n\t"
#endif
        /* r0 = n / 16 loop passes, r1 = running byte offset. */
        "srlg %%r0,%[n],4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%[x]) \n\t"
        "pfd 2, 1024(%%r1,%[y]) \n\t"

        /* First 64 bytes of the pass. */
        "vl %%v16,0(%%r1,%[x]) \n\t"
        "vl %%v17,16(%%r1,%[x]) \n\t"
        "vl %%v18,32(%%r1,%[x]) \n\t"
        "vl %%v19,48(%%r1,%[x]) \n\t"
        "vl %%v20,0(%%r1,%[y]) \n\t"
        "vl %%v21,16(%%r1,%[y]) \n\t"
        "vl %%v22,32(%%r1,%[y]) \n\t"
        "vl %%v23,48(%%r1,%[y]) \n\t"
        "verllg %%v24,%%v16,32 \n\t"
        "verllg %%v25,%%v17,32 \n\t"
        "verllg %%v26,%%v18,32 \n\t"
        "verllg %%v27,%%v19,32 \n\t"

        "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
        "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
        "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
        "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"

        "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
        "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
        "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
        "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"

        "vst %%v28,0(%%r1,%[y]) \n\t"
        "vst %%v29,16(%%r1,%[y]) \n\t"
        "vst %%v30,32(%%r1,%[y]) \n\t"
        "vst %%v31,48(%%r1,%[y]) \n\t"

        /* Second 64 bytes of the pass. */
        "vl %%v16,64(%%r1,%[x]) \n\t"
        "vl %%v17,80(%%r1,%[x]) \n\t"
        "vl %%v18,96(%%r1,%[x]) \n\t"
        "vl %%v19,112(%%r1,%[x]) \n\t"
        "vl %%v20,64(%%r1,%[y]) \n\t"
        "vl %%v21,80(%%r1,%[y]) \n\t"
        "vl %%v22,96(%%r1,%[y]) \n\t"
        "vl %%v23,112(%%r1,%[y]) \n\t"
        "verllg %%v24,%%v16,32 \n\t"
        "verllg %%v25,%%v17,32 \n\t"
        "verllg %%v26,%%v18,32 \n\t"
        "verllg %%v27,%%v19,32 \n\t"

        "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
        "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
        "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
        "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"

        "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
        "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
        "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
        "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"

        "vst %%v28,64(%%r1,%[y]) \n\t"
        "vst %%v29,80(%%r1,%[y]) \n\t"
        "vst %%v30,96(%%r1,%[y]) \n\t"
        "vst %%v31,112(%%r1,%[y]) \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b "
        :
        : [n] "r"(n), [x] "ZR"((const FLOAT (*)[n * 2])x), [y] "ZR"((FLOAT (*)[n * 2])y), [alpha] "ZQ"((const FLOAT (*)[2])alpha)
        : "memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Complex AXPY driver: y += alpha * x, with alpha = da_r + i*da_i.
 * When CONJ is defined the scalar loops add alpha * conj(x) instead.
 *
 * n              number of complex elements; n <= 0 is a no-op
 * da_r, da_i     real and imaginary parts of alpha
 * x, inc_x       source vector and its stride, in complex elements
 * y, inc_y       updated vector and its stride, in complex elements
 * dummy0, dummy1, dummy, dummy2 are never read here.
 *
 * Vectors are stored as interleaved (re, im) FLOAT pairs.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    FLOAT da[2] __attribute__ ((aligned(16)));
    BLASLONG k;
    BLASLONG px = 0, py = 0;

    if (n <= 0) return (0);

    if ((inc_x != 1) || (inc_y != 1)) {
        /* Strided case: pure scalar loop; strides are converted to FLOAT units. */
        BLASLONG step_x = inc_x * 2;
        BLASLONG step_y = inc_y * 2;

        for (k = 0; k < n; k++) {
#if !defined(CONJ)
            y[py] += (da_r * x[px] - da_i * x[px + 1]);
            y[py + 1] += (da_r * x[px + 1] + da_i * x[px]);
#else
            y[py] += (da_r * x[px] + da_i * x[px + 1]);
            y[py + 1] -= (da_r * x[px + 1] - da_i * x[px]);
#endif
            px += step_x;
            py += step_y;
        }
        return (0);
    }

    /* Contiguous case: the assembly kernel takes the largest multiple of 16
       elements, the scalar loop below finishes the remaining n % 16. */
    BLASLONG vec_n = n & -16;

    if (vec_n) {
        da[0] = da_r;
        da[1] = da_i;
        caxpy_kernel_16(vec_n, x, y, da);
    }

    px = 2 * vec_n;
    for (k = vec_n; k < n; k++, px += 2) {
#if !defined(CONJ)
        y[px] += (da_r * x[px] - da_i * x[px + 1]);
        y[px + 1] += (da_r * x[px + 1] + da_i * x[px]);
#else
        y[px] += (da_r * x[px] + da_i * x[px + 1]);
        y[px + 1] -= (da_r * x[px + 1] - da_i * x[px]);
#endif
    }
    return (0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Copy n contiguous complex elements from x to y in 256-byte chunks.
 *
 * The loop count is n >> 5 and each pass moves 256 bytes with one MVC,
 * i.e. 32 elements of 8 bytes (assumes FLOAT is a 4-byte float).
 * The only caller in this file passes a non-zero multiple of 32.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* r1 = running source pointer, r2 = running destination pointer */
        "lgr %%r1,%1 \n\t"
        "lgr %%r2,%2 \n\t"
        /* r0 = n / 32 loop iterations */
        "srlg %%r0,%0,5 \n\t"
        "0: \n\t"
        /* prefetch hints 1024 bytes ahead: code 1 on the source, code 2 on the destination */
        "pfd 1, 1024(%%r1) \n\t"
        "pfd 2, 1024(%%r2) \n\t"
        /* move one 256-byte chunk, then advance both pointers */
        "mvc 0(256,%%r2),0(%%r1) \n\t"
        "agfi %%r1,256 \n\t"
        "agfi %%r2,256 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
        :"memory","cc","r0","r1","r2"
    );
}
|
||||
|
||||
/*
 * Complex copy driver: y[k] = x[k] for k in [0, n).
 *
 * n           number of complex elements; n <= 0 is a no-op
 * x, inc_x    source vector and its stride, in complex elements
 * y, inc_y    destination vector and its stride, in complex elements
 *
 * Vectors are stored as interleaved (re, im) FLOAT pairs.
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i=0;
    BLASLONG ix=0,iy=0;

    if ( n <= 0 ) return(0);

    if ( (inc_x == 1) && (inc_y == 1 ))
    {
        /* Contiguous case: bulk-copy the largest multiple of 32 elements
           with the assembly kernel, then finish the tail one element at a time. */
        BLASLONG n1 = n & -32;
        if ( n1 > 0 )
        {
            ccopy_kernel_32(n1, x, y);
            i=n1;
            ix=n1*2;
            iy=n1*2;
        }

        while(i < n)
        {
            /* Read through the source index; the previous code used x[iy],
               which only worked because ix and iy happen to be equal here. */
            y[iy] = x[ix] ;
            y[iy+1] = x[ix+1] ;
            ix+=2;
            iy+=2;
            i++ ;
        }
    }
    else
    {
        /* Strided case: strides are converted to FLOAT units. */
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;

        while(i < n)
        {
            y[iy] = x[ix] ;
            y[iy+1] = x[ix+1] ;
            ix += inc_x2 ;
            iy += inc_y2 ;
            i++ ;
        }
    }

    return(0);
}
|
||||
|
|
@ -0,0 +1,182 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Partial-product kernel for the complex dot product of contiguous x and y.
 *
 * The loop runs n >> 4 times and consumes 128 bytes of each vector per pass
 * (16 complex elements, assuming FLOAT is a 4-byte float). The only caller in
 * this file passes a non-zero multiple of 16.
 *
 * On return d is overwritten (not accumulated into) with
 *   d[0] = sum x_re * y_re      d[1] = sum x_im * y_im
 *   d[2] = sum x_re * y_im      d[3] = sum x_im * y_re
 * which is the same layout the scalar loops of the driver accumulate.
 */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
    __asm__ volatile(
        /* v24..v31: eight accumulators, cleared */
        "vzero %%v24 \n\t"
        "vzero %%v25 \n\t"
        "vzero %%v26 \n\t"
        "vzero %%v27 \n\t"
        "vzero %%v28 \n\t"
        "vzero %%v29 \n\t"
        "vzero %%v30 \n\t"
        "vzero %%v31 \n\t"
        /* r0 = n / 16 iterations, r1 = byte offset into x and y */
        "srlg %%r0,%0,4 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%1) \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first 64 bytes: v16..v19 = x, v0..v3 = y */
        "vl %%v16, 0(%%r1,%1) \n\t"
        "vl %%v17, 16(%%r1,%1) \n\t"
        "vl %%v18, 32(%%r1,%1) \n\t"
        "vl %%v19, 48(%%r1,%1) \n\t"
        "vl %%v0, 0(%%r1,%2) \n\t"
        "vl %%v1, 16(%%r1,%2) \n\t"
        "vl %%v2, 32(%%r1,%2) \n\t"
        "vl %%v3, 48(%%r1,%2) \n\t"
        /* v20..v23 = x with re and im swapped inside each 64-bit pair */
        "verllg %%v20,%%v16,32 \n\t"
        "verllg %%v21,%%v17,32 \n\t"
        "verllg %%v22,%%v18,32 \n\t"
        "verllg %%v23,%%v19,32 \n\t"

        /* even accumulators take x*y, odd accumulators take swapped(x)*y */
        "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
        "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
        "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
        "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
        "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
        "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
        "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"

        /* second 64 bytes, same pattern */
        "vl %%v16, 64(%%r1,%1) \n\t"
        "vl %%v17, 80(%%r1,%1) \n\t"
        "vl %%v18, 96(%%r1,%1) \n\t"
        "vl %%v19, 112(%%r1,%1) \n\t"
        "vl %%v0, 64(%%r1,%2) \n\t"
        "vl %%v1, 80(%%r1,%2) \n\t"
        "vl %%v2, 96(%%r1,%2) \n\t"
        "vl %%v3, 112(%%r1,%2) \n\t"
        "verllg %%v20,%%v16,32 \n\t"
        "verllg %%v21,%%v17,32 \n\t"
        "verllg %%v22,%%v18,32 \n\t"
        "verllg %%v23,%%v19,32 \n\t"

        "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
        "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
        "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
        "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
        "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
        "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
        "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"
        /* reduce the even accumulators into v24, then fold its upper half onto the lower */
        "vfasb %%v24,%%v24,%%v26 \n\t"
        "vfasb %%v24,%%v24,%%v28 \n\t"
        "vfasb %%v24,%%v24,%%v30 \n\t"
        "vrepg %%v26,%%v24,1 \n\t"
        "vfasb %%v24,%%v24,%%v26 \n\t"
        /* same reduction for the odd accumulators into v25 */
        "vfasb %%v25,%%v25,%%v27 \n\t"
        "vfasb %%v25,%%v25,%%v29 \n\t"
        "vfasb %%v25,%%v25,%%v31 \n\t"
        "vrepg %%v27,%%v25,1 \n\t"
        "vfasb %%v25,%%v25,%%v27 \n\t"
        /* store the four partial sums; note v25's lanes are written in swapped order */
        "vstef %%v24,0(%3),0 \n\t"
        "vstef %%v24,4(%3),1 \n\t"
        "vstef %%v25,8(%3),1 \n\t"
        "vstef %%v25,12(%3),0 "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Complex dot product driver.
 *
 * Accumulates four real partial sums over the n element pairs
 *   dot[0] = sum x_re*y_re, dot[1] = sum x_im*y_im,
 *   dot[2] = sum x_re*y_im, dot[3] = sum x_im*y_re
 * and combines them as
 *   without CONJ: (dot[0] - dot[1]) + i*(dot[2] + dot[3])
 *   with CONJ:    (dot[0] + dot[1]) + i*(dot[2] - dot[3])
 *
 * n           number of complex elements; n <= 0 yields 0 + 0i
 * x, inc_x    first vector and its stride, in complex elements
 * y, inc_y    second vector and its stride, in complex elements
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
    BLASLONG k;

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);
    }

    if ((inc_x == 1) && (inc_y == 1)) {
        /* Contiguous case: the kernel writes the partial sums for the largest
           multiple of 16 elements into dot; the scalar loop adds the tail. */
        BLASLONG vec_n = n & -16;
        BLASLONG p;

        if (vec_n)
            cdot_kernel_16(vec_n, x, y, dot);

        p = vec_n * 2;
        for (k = vec_n; k < n; k++) {
            dot[0] += x[p] * y[p];
            dot[1] += x[p + 1] * y[p + 1];
            dot[2] += x[p] * y[p + 1];
            dot[3] += x[p + 1] * y[p];
            p += 2;
        }
    } else {
        /* Strided case: strides are converted to FLOAT units. */
        BLASLONG px = 0;
        BLASLONG py = 0;

        inc_x <<= 1;
        inc_y <<= 1;

        for (k = 0; k < n; k++) {
            dot[0] += x[px] * y[py];
            dot[1] += x[px + 1] * y[py + 1];
            dot[2] += x[px] * y[py + 1];
            dot[3] += x[px + 1] * y[py];
            px += inc_x;
            py += inc_y;
        }
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];
#endif

    return (result);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,743 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
/*
 * y[0..n) += A[:, 0..4) * x[0..4) for four matrix columns ap[0..3].
 *
 * x holds four complex values (8 FLOATs); y and each column hold n
 * interleaved (re, im) pairs. The loop runs n >> 1 times and updates two
 * complex rows (16 bytes of y) per pass, so n is expected to be even and
 * non-zero. The CONJ / XCONJ macros select which sign pattern is applied to
 * the imaginary-part products.
 */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* v16..v19 = {x_re, x_im, x_re, x_im} for x[0]..x[3] */
        "vlrepg %%v16,0(%5) \n\t"
        "vlrepg %%v17,8(%5) \n\t"
        "vlrepg %%v18,16(%5) \n\t"
        "vlrepg %%v19,24(%5) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        /* v20..v23 = {-x_im, x_re, -x_im, x_re} for x[0]..x[3] */
        "vlef %%v20,4(%5),0 \n\t"
        "vlef %%v20,4(%5),2 \n\t"
        "vflcsb %%v20,%%v20 \n\t"
        "vlef %%v20,0(%5),1 \n\t"
        "vlef %%v20,0(%5),3 \n\t"

        "vlef %%v21,12(%5),0 \n\t"
        "vlef %%v21,12(%5),2 \n\t"
        "vflcsb %%v21,%%v21 \n\t"
        "vlef %%v21,8(%5),1 \n\t"
        "vlef %%v21,8(%5),3 \n\t"

        "vlef %%v22,20(%5),0 \n\t"
        "vlef %%v22,20(%5),2 \n\t"
        "vflcsb %%v22,%%v22 \n\t"
        "vlef %%v22,16(%5),1 \n\t"
        "vlef %%v22,16(%5),3 \n\t"

        "vlef %%v23,28(%5),0 \n\t"
        "vlef %%v23,28(%5),2 \n\t"
        "vflcsb %%v23,%%v23 \n\t"
        "vlef %%v23,24(%5),1 \n\t"
        "vlef %%v23,24(%5),3 \n\t"
#else
        /* v20..v23 = {x_im, -x_re, x_im, -x_re} for x[0]..x[3] */
        "vlef %%v20,0(%5),1 \n\t"
        "vlef %%v20,0(%5),3 \n\t"
        "vflcsb %%v20,%%v20 \n\t"
        "vlef %%v20,4(%5),0 \n\t"
        "vlef %%v20,4(%5),2 \n\t"

        "vlef %%v21,8(%5),1 \n\t"
        "vlef %%v21,8(%5),3 \n\t"
        "vflcsb %%v21,%%v21 \n\t"
        "vlef %%v21,12(%5),0 \n\t"
        "vlef %%v21,12(%5),2 \n\t"

        "vlef %%v22,16(%5),1 \n\t"
        "vlef %%v22,16(%5),3 \n\t"
        "vflcsb %%v22,%%v22 \n\t"
        "vlef %%v22,20(%5),0 \n\t"
        "vlef %%v22,20(%5),2 \n\t"

        "vlef %%v23,24(%5),1 \n\t"
        "vlef %%v23,24(%5),3 \n\t"
        "vflcsb %%v23,%%v23 \n\t"
        "vlef %%v23,28(%5),0 \n\t"
        "vlef %%v23,28(%5),2 \n\t"
#endif
        /* r1 = byte offset into the columns and y, r0 = n / 2 iterations */
        "xgr %%r1,%%r1 \n\t"
        "srlg %%r0,%0,1 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"
        "pfd 1,1024(%%r1,%4) \n\t"
        "pfd 2,1024(%%r1,%6) \n\t"

        /* columns 0 and 1: even register = duplicated real parts of two rows,
           odd register = duplicated imaginary parts of the same two rows */
        "vlef %%v24,0(%%r1,%1),0 \n\t"
        "vlef %%v24,0(%%r1,%1),1 \n\t"
        "vlef %%v24,8(%%r1,%1),2 \n\t"
        "vlef %%v24,8(%%r1,%1),3 \n\t"
        "vlef %%v25,4(%%r1,%1),0 \n\t"
        "vlef %%v25,4(%%r1,%1),1 \n\t"
        "vlef %%v25,12(%%r1,%1),2 \n\t"
        "vlef %%v25,12(%%r1,%1),3 \n\t"
        "vlef %%v26,0(%%r1,%2),0 \n\t"
        "vlef %%v26,0(%%r1,%2),1 \n\t"
        "vlef %%v26,8(%%r1,%2),2 \n\t"
        "vlef %%v26,8(%%r1,%2),3 \n\t"
        "vlef %%v27,4(%%r1,%2),0 \n\t"
        "vlef %%v27,4(%%r1,%2),1 \n\t"
        "vlef %%v27,12(%%r1,%2),2 \n\t"
        "vlef %%v27,12(%%r1,%2),3 \n\t"

        /* v0 = two complex entries of y, accumulated through all four columns */
        "vl %%v0,0(%%r1,%6) \n\t"
        "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t"
        "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t"
        "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t"
        "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t"

        /* columns 2 and 3, same lane layout */
        "vlef %%v28,0(%%r1,%3),0 \n\t"
        "vlef %%v28,0(%%r1,%3),1 \n\t"
        "vlef %%v28,8(%%r1,%3),2 \n\t"
        "vlef %%v28,8(%%r1,%3),3 \n\t"
        "vlef %%v29,4(%%r1,%3),0 \n\t"
        "vlef %%v29,4(%%r1,%3),1 \n\t"
        "vlef %%v29,12(%%r1,%3),2 \n\t"
        "vlef %%v29,12(%%r1,%3),3 \n\t"
        "vlef %%v30,0(%%r1,%4),0 \n\t"
        "vlef %%v30,0(%%r1,%4),1 \n\t"
        "vlef %%v30,8(%%r1,%4),2 \n\t"
        "vlef %%v30,8(%%r1,%4),3 \n\t"
        "vlef %%v31,4(%%r1,%4),0 \n\t"
        "vlef %%v31,4(%%r1,%4),1 \n\t"
        "vlef %%v31,12(%%r1,%4),2 \n\t"
        "vlef %%v31,12(%%r1,%4),3 \n\t"

        "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t"
        "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t"
        "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t"
        "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t"
        "vst %%v0,0(%%r1,%6) \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,0b \n\t"
        :
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * y[0..n) += A[:, 0..2) * x[0..2) for the two matrix columns ap[0], ap[1].
 *
 * x holds two complex values (4 FLOATs). The loop runs n >> 1 times and
 * updates two complex rows of y per pass, so n is expected to be even and
 * non-zero. CONJ / XCONJ choose the sign pattern of the imaginary products.
 */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* v16, v17 = {x_re, x_im, x_re, x_im} for x[0], x[1] */
        "vlrepg %%v16,0(%3) \n\t"
        "vlrepg %%v17,8(%3) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        /* v18, v19 = {-x_im, x_re, -x_im, x_re} for x[0], x[1] */
        "vlef %%v18,4(%3),0 \n\t"
        "vlef %%v18,4(%3),2 \n\t"
        "vflcsb %%v18,%%v18 \n\t"
        "vlef %%v18,0(%3),1 \n\t"
        "vlef %%v18,0(%3),3 \n\t"

        "vlef %%v19,12(%3),0 \n\t"
        "vlef %%v19,12(%3),2 \n\t"
        "vflcsb %%v19,%%v19 \n\t"
        "vlef %%v19,8(%3),1 \n\t"
        "vlef %%v19,8(%3),3 \n\t"
#else
        /* v18, v19 = {x_im, -x_re, x_im, -x_re} for x[0], x[1] */
        "vlef %%v18,0(%3),1 \n\t"
        "vlef %%v18,0(%3),3 \n\t"
        "vflcsb %%v18,%%v18 \n\t"
        "vlef %%v18,4(%3),0 \n\t"
        "vlef %%v18,4(%3),2 \n\t"

        "vlef %%v19,8(%3),1 \n\t"
        "vlef %%v19,8(%3),3 \n\t"
        "vflcsb %%v19,%%v19 \n\t"
        "vlef %%v19,12(%3),0 \n\t"
        "vlef %%v19,12(%3),2 \n\t"
#endif
        /* r1 = byte offset into the columns and y, r0 = n / 2 iterations */
        "xgr %%r1,%%r1 \n\t"
        "srlg %%r0,%0,1 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 2,1024(%%r1,%4) \n\t"

        /* per column: even register = duplicated real parts of two rows,
           odd register = duplicated imaginary parts of the same rows */
        "vlef %%v20,0(%%r1,%1),0 \n\t"
        "vlef %%v20,0(%%r1,%1),1 \n\t"
        "vlef %%v20,8(%%r1,%1),2 \n\t"
        "vlef %%v20,8(%%r1,%1),3 \n\t"
        "vlef %%v21,4(%%r1,%1),0 \n\t"
        "vlef %%v21,4(%%r1,%1),1 \n\t"
        "vlef %%v21,12(%%r1,%1),2 \n\t"
        "vlef %%v21,12(%%r1,%1),3 \n\t"
        "vlef %%v22,0(%%r1,%2),0 \n\t"
        "vlef %%v22,0(%%r1,%2),1 \n\t"
        "vlef %%v22,8(%%r1,%2),2 \n\t"
        "vlef %%v22,8(%%r1,%2),3 \n\t"
        "vlef %%v23,4(%%r1,%2),0 \n\t"
        "vlef %%v23,4(%%r1,%2),1 \n\t"
        "vlef %%v23,12(%%r1,%2),2 \n\t"
        "vlef %%v23,12(%%r1,%2),3 \n\t"

        /* v0 = two complex entries of y, updated with both columns */
        "vl %%v0,0(%%r1,%4) \n\t"
        "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t"
        "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t"
        "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t"
        "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t"
        "vst %%v0,0(%%r1,%4) \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,0b \n\t"
        :
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
    );
}
|
||||
|
||||
/*
 * y[0..n) += a[:] * x[0] for a single matrix column ap and one complex x.
 *
 * The loop runs n >> 1 times and updates two complex rows of y per pass,
 * so n is expected to be even and non-zero. CONJ / XCONJ choose the sign
 * pattern of the imaginary products.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* v16 = {x_re, x_im, x_re, x_im} */
        "vlrepg %%v16,0(%2) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        /* v17 = {-x_im, x_re, -x_im, x_re} */
        "vlef %%v17,4(%2),0 \n\t"
        "vlef %%v17,4(%2),2 \n\t"
        "vflcsb %%v17,%%v17 \n\t"
        "vlef %%v17,0(%2),1 \n\t"
        "vlef %%v17,0(%2),3 \n\t"
#else
        /* v17 = {x_im, -x_re, x_im, -x_re} */
        "vlef %%v17,0(%2),1 \n\t"
        "vlef %%v17,0(%2),3 \n\t"
        "vflcsb %%v17,%%v17 \n\t"
        "vlef %%v17,4(%2),0 \n\t"
        "vlef %%v17,4(%2),2 \n\t"
#endif
        /* r1 = byte offset into the column and y, r0 = n / 2 iterations */
        "xgr %%r1,%%r1 \n\t"
        "srlg %%r0,%0,1 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 2,1024(%%r1,%3) \n\t"

        /* v18 = duplicated real parts of two rows, v19 = duplicated imaginary parts */
        "vlef %%v18,0(%%r1,%1),0 \n\t"
        "vlef %%v18,0(%%r1,%1),1 \n\t"
        "vlef %%v18,8(%%r1,%1),2 \n\t"
        "vlef %%v18,8(%%r1,%1),3 \n\t"
        "vlef %%v19,4(%%r1,%1),0 \n\t"
        "vlef %%v19,4(%%r1,%1),1 \n\t"
        "vlef %%v19,12(%%r1,%1),2 \n\t"
        "vlef %%v19,12(%%r1,%1),3 \n\t"

        /* v0 = two complex entries of y, updated in place */
        "vl %%v0,0(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t"
        "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t"
        "vst %%v0,0(%%r1,%3) \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,0b \n\t"
        :
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19"
    );
}
|
||||
|
||||
/*
 * dest[0..n) += alpha * src[0..n) for contiguous complex vectors, with
 * alpha = alpha_r + i*alpha_i. When XCONJ is defined the imaginary part of
 * src is negated in the product (matching the scalar loop in add_y).
 *
 * The loop runs n >> 2 times and handles four complex elements (32 bytes)
 * per pass, so n is expected to be a non-zero multiple of 4.
 */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i)
{
    __asm__ volatile (
#if !defined(XCONJ)
        /* v0 = alpha_r in every lane, v1 = {-alpha_i, alpha_i, -alpha_i, alpha_i} */
        "vlrepf %%v0,%3 \n\t"
        "vlef %%v1,%4,0 \n\t"
        "vlef %%v1,%4,2 \n\t"
        "vflcsb %%v1,%%v1 \n\t"
        "vlef %%v1,%4,1 \n\t"
        "vlef %%v1,%4,3 \n\t"
#else
        /* v0 = {alpha_r, -alpha_r, alpha_r, -alpha_r}, v1 = alpha_i in every lane */
        "vlef %%v0,%3,1 \n\t"
        "vlef %%v0,%3,3 \n\t"
        "vflcsb %%v0,%%v0 \n\t"
        "vlef %%v0,%3,0 \n\t"
        "vlef %%v0,%3,2 \n\t"
        "vlrepf %%v1,%4 \n\t"
#endif
        /* r1 = byte offset into src and dest, r0 = n / 4 iterations */
        "xgr %%r1,%%r1 \n\t"
        "srlg %%r0,%0,2 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 2,1024(%%r1,%2) \n\t"

        /* v16, v17 = src; v18, v19 = dest; v20, v21 = src with re/im swapped */
        "vl %%v16,0(%%r1,%1) \n\t"
        "vl %%v17,16(%%r1,%1) \n\t"
        "vl %%v18,0(%%r1,%2) \n\t"
        "vl %%v19,16(%%r1,%2) \n\t"
        "verllg %%v20,%%v16,32 \n\t"
        "verllg %%v21,%%v17,32 \n\t"

        /* dest + src * v0 */
        "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t"
        "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t"

        /* ... + swapped(src) * v1 */
        "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t"
        "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t"

        "vst %%v22,0(%%r1,%2) \n\t"
        "vst %%v23,16(%%r1,%2) \n\t"

        "agfi %%r1,32 \n\t"
        "brctg %%r0,0b "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23"
    );
}
|
||||
|
||||
/*
 * dest += alpha * src over n complex elements, alpha = alpha_r + i*alpha_i.
 * With XCONJ defined the product uses the conjugated form written out below.
 *
 * src        contiguous interleaved (re, im) pairs
 * dest       destination; consecutive elements are inc_dest FLOATs apart
 * inc_dest   destination stride in FLOAT units; 2 means contiguous and is
 *            forwarded to the vector kernel add_y_4
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i)
{
    BLASLONG k;

    /* Contiguous destination: let the assembly kernel do the whole update. */
    if ( inc_dest == 2 )
    {
        add_y_4(n, src, dest, alpha_r, alpha_i);
        return;
    }

    /* Strided destination: scalar complex multiply-add per element. */
    for ( k = 0; k < n; k++, src += 2, dest += inc_dest )
    {
        FLOAT prod_re;
        FLOAT prod_im;
#if !defined(XCONJ)
        prod_re = alpha_r * src[0] - alpha_i * src[1];
        prod_im = alpha_r * src[1] + alpha_i * src[0];
#else
        prod_re = alpha_r * src[0] + alpha_i * src[1];
        prod_im = -alpha_r * src[1] + alpha_i * src[0];
#endif

        *dest += prod_re;
        *(dest+1) += prod_im;
    }
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
FLOAT *ap[4];
|
||||
BLASLONG n1;
|
||||
BLASLONG m1;
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
BLASLONG lda4;
|
||||
FLOAT xbuffer[8],*ybuffer;
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
if ( n < 1 ) return(0);
|
||||
|
||||
ybuffer = buffer;
|
||||
|
||||
inc_x *= 2;
|
||||
inc_y *= 2;
|
||||
lda *= 2;
|
||||
lda4 = 4 * lda;
|
||||
|
||||
n1 = n / 4 ;
|
||||
n2 = n % 4 ;
|
||||
|
||||
m3 = m % 4;
|
||||
m1 = m - ( m % 4 );
|
||||
m2 = (m % NBMAX) - (m % 4) ;
|
||||
|
||||
y_ptr = y;
|
||||
|
||||
BLASLONG NB = NBMAX;
|
||||
|
||||
while ( NB == NBMAX )
|
||||
{
|
||||
|
||||
m1 -= NB;
|
||||
if ( m1 < 0)
|
||||
{
|
||||
if ( m2 == 0 ) break;
|
||||
NB = m2;
|
||||
}
|
||||
|
||||
a_ptr = a;
|
||||
ap[0] = a_ptr;
|
||||
ap[1] = a_ptr + lda;
|
||||
ap[2] = ap[1] + lda;
|
||||
ap[3] = ap[2] + lda;
|
||||
x_ptr = x;
|
||||
//zero_y(NB,ybuffer);
|
||||
memset(ybuffer,0,NB*8);
|
||||
|
||||
if ( inc_x == 2 )
|
||||
{
|
||||
|
||||
for( i = 0; i < n1 ; i++)
|
||||
{
|
||||
cgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
|
||||
ap[0] += lda4;
|
||||
ap[1] += lda4;
|
||||
ap[2] += lda4;
|
||||
ap[3] += lda4;
|
||||
a_ptr += lda4;
|
||||
x_ptr += 8;
|
||||
}
|
||||
|
||||
if ( n2 & 2 )
|
||||
{
|
||||
cgemv_kernel_4x2(NB,ap,x_ptr,ybuffer);
|
||||
x_ptr += 4;
|
||||
a_ptr += 2 * lda;
|
||||
|
||||
}
|
||||
|
||||
if ( n2 & 1 )
|
||||
{
|
||||
cgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer);
|
||||
/* x_ptr += 2;
|
||||
a_ptr += lda; */
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n1 ; i++)
|
||||
{
|
||||
|
||||
xbuffer[0] = x_ptr[0];
|
||||
xbuffer[1] = x_ptr[1];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[2] = x_ptr[0];
|
||||
xbuffer[3] = x_ptr[1];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[4] = x_ptr[0];
|
||||
xbuffer[5] = x_ptr[1];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[6] = x_ptr[0];
|
||||
xbuffer[7] = x_ptr[1];
|
||||
x_ptr += inc_x;
|
||||
|
||||
cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer);
|
||||
ap[0] += lda4;
|
||||
ap[1] += lda4;
|
||||
ap[2] += lda4;
|
||||
ap[3] += lda4;
|
||||
a_ptr += lda4;
|
||||
}
|
||||
|
||||
for( i = 0; i < n2 ; i++)
|
||||
{
|
||||
xbuffer[0] = x_ptr[0];
|
||||
xbuffer[1] = x_ptr[1];
|
||||
x_ptr += inc_x;
|
||||
cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
|
||||
a_ptr += 1 * lda;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
|
||||
a += 2 * NB;
|
||||
y_ptr += NB * inc_y;
|
||||
}
|
||||
|
||||
if ( m3 == 0 ) return(0);
|
||||
|
||||
if ( m3 == 1 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp_r = 0.0;
|
||||
FLOAT temp_i = 0.0;
|
||||
|
||||
if ( lda == 2 && inc_x == 2 )
|
||||
{
|
||||
|
||||
|
||||
for( i=0 ; i < (n & -2); i+=2 )
|
||||
{
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||
temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
|
||||
temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
|
||||
#else
|
||||
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
|
||||
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
|
||||
temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
|
||||
temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
|
||||
#endif
|
||||
|
||||
a_ptr += 4;
|
||||
x_ptr += 4;
|
||||
}
|
||||
|
||||
|
||||
|
||||
for( ; i < n; i++ )
|
||||
{
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||
#else
|
||||
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
|
||||
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
|
||||
#endif
|
||||
|
||||
a_ptr += 2;
|
||||
x_ptr += 2;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||
#else
|
||||
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
|
||||
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
|
||||
#endif
|
||||
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
}
|
||||
|
||||
}
|
||||
#if !defined(XCONJ)
|
||||
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
|
||||
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
|
||||
#else
|
||||
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
|
||||
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
|
||||
#endif
|
||||
return(0);
|
||||
}
|
||||
|
||||
if ( m3 == 2 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp_r0 = 0.0;
|
||||
FLOAT temp_i0 = 0.0;
|
||||
FLOAT temp_r1 = 0.0;
|
||||
FLOAT temp_i1 = 0.0;
|
||||
|
||||
if ( lda == 4 && inc_x == 2 )
|
||||
{
|
||||
|
||||
for( i = 0; i < (n & -2); i+=2 )
|
||||
{
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
|
||||
|
||||
temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
|
||||
temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
|
||||
temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
|
||||
temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
|
||||
|
||||
#else
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
|
||||
|
||||
temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
|
||||
temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
|
||||
temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
|
||||
temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
|
||||
|
||||
#endif
|
||||
|
||||
a_ptr += 8;
|
||||
x_ptr += 4;
|
||||
}
|
||||
|
||||
|
||||
for( ; i < n; i++ )
|
||||
{
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
|
||||
#else
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
|
||||
#endif
|
||||
|
||||
a_ptr += 4;
|
||||
x_ptr += 2;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i=0 ; i < n; i++ )
|
||||
{
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
|
||||
#else
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
|
||||
#endif
|
||||
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
#if !defined(XCONJ)
|
||||
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
|
||||
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
|
||||
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
|
||||
#else
|
||||
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
|
||||
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
|
||||
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
|
||||
#endif
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
if ( m3 == 3 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp_r0 = 0.0;
|
||||
FLOAT temp_i0 = 0.0;
|
||||
FLOAT temp_r1 = 0.0;
|
||||
FLOAT temp_i1 = 0.0;
|
||||
FLOAT temp_r2 = 0.0;
|
||||
FLOAT temp_i2 = 0.0;
|
||||
|
||||
if ( lda == 6 && inc_x == 2 )
|
||||
{
|
||||
|
||||
for( i=0 ; i < n; i++ )
|
||||
{
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
|
||||
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
|
||||
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
|
||||
#else
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
|
||||
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
|
||||
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
|
||||
#endif
|
||||
|
||||
a_ptr += 6;
|
||||
x_ptr += 2;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
|
||||
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
|
||||
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
|
||||
#else
|
||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
|
||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
|
||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
|
||||
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
|
||||
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
|
||||
#endif
|
||||
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
}
|
||||
|
||||
}
|
||||
#if !defined(XCONJ)
|
||||
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
|
||||
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
|
||||
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
|
||||
y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
|
||||
#else
|
||||
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
|
||||
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
|
||||
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
|
||||
y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
|
||||
#endif
|
||||
return(0);
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
|
@ -0,0 +1,671 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Row-block size used by CNAME below: at most NBMAX rows of A (and entries
   of x) are handed to the vector kernels per pass.  CNAME derives the last
   partial block with the mask (NBMAX-1), which requires a power of two. */
#define NBMAX 2048
|
||||
|
||||
/*
 * Transposed complex single-precision GEMV micro-kernel for four columns.
 *
 * For k = 0..3 it accumulates the dot product of column ap[k] with x over n
 * complex elements (sign pattern selected by CONJ / XCONJ at compile time),
 * multiplies each sum by the complex alpha and adds it to y[2k], y[2k+1].
 *
 *   n     - number of complex elements; the loop count is n >> 1, i.e. two
 *           elements (16 bytes) per iteration.  The caller in this file
 *           passes a non-zero multiple of 4.
 *   ap    - ap[0..3] point at the four columns (interleaved re/im FLOATs)
 *   x     - contiguous vector of n complex values
 *   y     - 8 FLOATs (4 complex results), read, updated and written back
 *   alpha - alpha[0] = real part, alpha[1] = imaginary part
 */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* v16..v19: one accumulator per column; r1: running byte offset; r0: n/2 loop count */
"vzero %%v16 \n\t"
|
||||
"vzero %%v17 \n\t"
|
||||
"vzero %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"srlg %%r0,%0,1 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch hints 1024 bytes ahead for the four columns and for x */
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
"pfd 1,1024(%%r1,%4) \n\t"
|
||||
"pfd 1,1024(%%r1,%5) \n\t"
|
||||
|
||||
/* v20 = two complex x values as stored: (xr0, xi0, xr1, xi1) */
"vl %%v20,0(%%r1,%5) \n\t"
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
/* v21 = (-xi0, xr0, -xi1, xr1): the negate runs while only the imaginary
   lanes are loaded; the real lanes are filled in afterwards */
"vlef %%v21,4(%%r1,%5),0 \n\t"
|
||||
"vlef %%v21,12(%%r1,%5),2 \n\t"
|
||||
"vflcsb %%v21,%%v21 \n\t"
|
||||
"vlef %%v21,0(%%r1,%5),1 \n\t"
|
||||
"vlef %%v21,8(%%r1,%5),3 \n\t"
|
||||
#else
|
||||
/* v21 = (xi0, -xr0, xi1, -xr1): here the real lanes are the negated ones */
"vlef %%v21,0(%%r1,%5),1 \n\t"
|
||||
"vlef %%v21,8(%%r1,%5),3 \n\t"
|
||||
"vflcsb %%v21,%%v21 \n\t"
|
||||
"vlef %%v21,4(%%r1,%5),0 \n\t"
|
||||
"vlef %%v21,12(%%r1,%5),2 \n\t"
|
||||
#endif
|
||||
|
||||
/* column 0: v22 = real parts, v23 = imaginary parts, each duplicated over a
   lane pair; column 1 likewise in v24 / v25 */
"vlef %%v22,0(%%r1,%1),0 \n\t"
|
||||
"vlef %%v22,0(%%r1,%1),1 \n\t"
|
||||
"vlef %%v22,8(%%r1,%1),2 \n\t"
|
||||
"vlef %%v22,8(%%r1,%1),3 \n\t"
|
||||
"vlef %%v23,4(%%r1,%1),0 \n\t"
|
||||
"vlef %%v23,4(%%r1,%1),1 \n\t"
|
||||
"vlef %%v23,12(%%r1,%1),2 \n\t"
|
||||
"vlef %%v23,12(%%r1,%1),3 \n\t"
|
||||
"vlef %%v24,0(%%r1,%2),0 \n\t"
|
||||
"vlef %%v24,0(%%r1,%2),1 \n\t"
|
||||
"vlef %%v24,8(%%r1,%2),2 \n\t"
|
||||
"vlef %%v24,8(%%r1,%2),3 \n\t"
|
||||
"vlef %%v25,4(%%r1,%2),0 \n\t"
|
||||
"vlef %%v25,4(%%r1,%2),1 \n\t"
|
||||
"vlef %%v25,12(%%r1,%2),2 \n\t"
|
||||
"vlef %%v25,12(%%r1,%2),3 \n\t"
|
||||
|
||||
/* acc += re(a)*x + im(a)*shuffled(x): lanes hold (re, im) of two products */
"vfmasb %%v16,%%v22,%%v20,%%v16 \n\t"
|
||||
"vfmasb %%v16,%%v23,%%v21,%%v16 \n\t"
|
||||
"vfmasb %%v17,%%v24,%%v20,%%v17 \n\t"
|
||||
"vfmasb %%v17,%%v25,%%v21,%%v17 \n\t"
|
||||
|
||||
/* columns 2 and 3: same layout in v26 / v27 and v28 / v29 */
"vlef %%v26,0(%%r1,%3),0 \n\t"
|
||||
"vlef %%v26,0(%%r1,%3),1 \n\t"
|
||||
"vlef %%v26,8(%%r1,%3),2 \n\t"
|
||||
"vlef %%v26,8(%%r1,%3),3 \n\t"
|
||||
"vlef %%v27,4(%%r1,%3),0 \n\t"
|
||||
"vlef %%v27,4(%%r1,%3),1 \n\t"
|
||||
"vlef %%v27,12(%%r1,%3),2 \n\t"
|
||||
"vlef %%v27,12(%%r1,%3),3 \n\t"
|
||||
"vlef %%v28,0(%%r1,%4),0 \n\t"
|
||||
"vlef %%v28,0(%%r1,%4),1 \n\t"
|
||||
"vlef %%v28,8(%%r1,%4),2 \n\t"
|
||||
"vlef %%v28,8(%%r1,%4),3 \n\t"
|
||||
"vlef %%v29,4(%%r1,%4),0 \n\t"
|
||||
"vlef %%v29,4(%%r1,%4),1 \n\t"
|
||||
"vlef %%v29,12(%%r1,%4),2 \n\t"
|
||||
"vlef %%v29,12(%%r1,%4),3 \n\t"
|
||||
|
||||
"vfmasb %%v18,%%v26,%%v20,%%v18 \n\t"
|
||||
"vfmasb %%v18,%%v27,%%v21,%%v18 \n\t"
|
||||
"vfmasb %%v19,%%v28,%%v20,%%v19 \n\t"
|
||||
"vfmasb %%v19,%%v29,%%v21,%%v19 \n\t"
|
||||
|
||||
/* advance 16 bytes (two complex elements) and loop while r0 counts down */
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
/* fold the two complex partial sums held in each accumulator into its
   first doubleword */
"vrepg %%v20,%%v16,1 \n\t"
|
||||
"vrepg %%v21,%%v17,1 \n\t"
|
||||
"vrepg %%v22,%%v18,1 \n\t"
|
||||
"vrepg %%v23,%%v19,1 \n\t"
|
||||
"vfasb %%v16,%%v16,%%v20 \n\t"
|
||||
"vfasb %%v17,%%v17,%%v21 \n\t"
|
||||
"vfasb %%v18,%%v18,%%v22 \n\t"
|
||||
"vfasb %%v19,%%v19,%%v23 \n\t"
|
||||
/* pack: v16 = (sum0, sum1), v17 = (sum2, sum3); v18 / v19 are the same
   sums with real and imaginary words swapped (rotate by 32 bits) */
"vmrhg %%v16,%%v16,%%v17 \n\t"
|
||||
"vmrhg %%v17,%%v18,%%v19 \n\t"
|
||||
"verllg %%v18,%%v16,32 \n\t"
|
||||
"verllg %%v19,%%v17,32 \n\t"
|
||||
#if !defined(XCONJ)
|
||||
/* v20 = alpha_r in every lane, v21 = (-alpha_i, alpha_i, -alpha_i, alpha_i) */
"vlrepf %%v20,0(%7) \n\t"
|
||||
"vlef %%v21,4(%7),0 \n\t"
|
||||
"vlef %%v21,4(%7),2 \n\t"
|
||||
"vflcsb %%v21,%%v21 \n\t"
|
||||
"vlef %%v21,4(%7),1 \n\t"
|
||||
"vlef %%v21,4(%7),3 \n\t"
|
||||
#else
|
||||
/* v20 = (alpha_r, -alpha_r, alpha_r, -alpha_r), v21 = alpha_i in every lane */
"vlef %%v20,0(%7),1 \n\t"
|
||||
"vlef %%v20,0(%7),3 \n\t"
|
||||
"vflcsb %%v20,%%v20 \n\t"
|
||||
"vlef %%v20,0(%7),0 \n\t"
|
||||
"vlef %%v20,0(%7),2 \n\t"
|
||||
"vlrepf %%v21,4(%7) \n\t"
|
||||
#endif
|
||||
/* y[0..7] += sums * v20 + swapped sums * v21, then store back */
"vl %%v22,0(%6) \n\t"
|
||||
"vl %%v23,16(%6) \n\t"
|
||||
"vfmasb %%v22,%%v16,%%v20,%%v22 \n\t"
|
||||
"vfmasb %%v22,%%v18,%%v21,%%v22 \n\t"
|
||||
"vfmasb %%v23,%%v17,%%v20,%%v23 \n\t"
|
||||
"vfmasb %%v23,%%v19,%%v21,%%v23 \n\t"
|
||||
"vst %%v22,0(%6) \n\t"
|
||||
"vst %%v23,16(%6) "
|
||||
:
|
||||
/* operands: %0 = n, %1..%4 = ap[0..3], %5 = x, %6 = y, %7 = alpha */
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha)
|
||||
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Two-column variant of the transposed complex single-precision GEMV
 * micro-kernel.
 *
 * Accumulates the dot products of columns ap[0] and ap[1] with x over n
 * complex elements (sign pattern selected by CONJ / XCONJ), multiplies both
 * sums by the complex alpha and adds them to y[0..3].
 *
 *   n     - number of complex elements; loop count is n >> 1 (two elements
 *           per iteration)
 *   ap    - only ap[0] and ap[1] are used
 *   x     - contiguous vector of n complex values
 *   y     - 4 FLOATs (2 complex results), read, updated and written back
 *   alpha - alpha[0] = real part, alpha[1] = imaginary part
 */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* v16 / v17: accumulators for column 0 / 1; r1: byte offset; r0: n/2 */
"vzero %%v16 \n\t"
|
||||
"vzero %%v17 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"srlg %%r0,%0,1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
|
||||
/* v18 = (xr0, xi0, xr1, xi1) */
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
/* v19 = (-xi0, xr0, -xi1, xr1) */
"vlef %%v19,4(%%r1,%3),0 \n\t"
|
||||
"vlef %%v19,12(%%r1,%3),2 \n\t"
|
||||
"vflcsb %%v19,%%v19 \n\t"
|
||||
"vlef %%v19,0(%%r1,%3),1 \n\t"
|
||||
"vlef %%v19,8(%%r1,%3),3 \n\t"
|
||||
#else
|
||||
/* v19 = (xi0, -xr0, xi1, -xr1) */
"vlef %%v19,0(%%r1,%3),1 \n\t"
|
||||
"vlef %%v19,8(%%r1,%3),3 \n\t"
|
||||
"vflcsb %%v19,%%v19 \n\t"
|
||||
"vlef %%v19,4(%%r1,%3),0 \n\t"
|
||||
"vlef %%v19,12(%%r1,%3),2 \n\t"
|
||||
#endif
|
||||
|
||||
/* v20 / v21 = real / imaginary parts of column 0, each duplicated over a
   lane pair; v22 / v23 likewise for column 1 */
"vlef %%v20,0(%%r1,%1),0 \n\t"
|
||||
"vlef %%v20,0(%%r1,%1),1 \n\t"
|
||||
"vlef %%v20,8(%%r1,%1),2 \n\t"
|
||||
"vlef %%v20,8(%%r1,%1),3 \n\t"
|
||||
"vlef %%v21,4(%%r1,%1),0 \n\t"
|
||||
"vlef %%v21,4(%%r1,%1),1 \n\t"
|
||||
"vlef %%v21,12(%%r1,%1),2 \n\t"
|
||||
"vlef %%v21,12(%%r1,%1),3 \n\t"
|
||||
"vlef %%v22,0(%%r1,%2),0 \n\t"
|
||||
"vlef %%v22,0(%%r1,%2),1 \n\t"
|
||||
"vlef %%v22,8(%%r1,%2),2 \n\t"
|
||||
"vlef %%v22,8(%%r1,%2),3 \n\t"
|
||||
"vlef %%v23,4(%%r1,%2),0 \n\t"
|
||||
"vlef %%v23,4(%%r1,%2),1 \n\t"
|
||||
"vlef %%v23,12(%%r1,%2),2 \n\t"
|
||||
"vlef %%v23,12(%%r1,%2),3 \n\t"
|
||||
|
||||
/* acc += re(a)*x + im(a)*shuffled(x) */
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
|
||||
"vfmasb %%v16,%%v21,%%v19,%%v16 \n\t"
|
||||
"vfmasb %%v17,%%v22,%%v18,%%v17 \n\t"
|
||||
"vfmasb %%v17,%%v23,%%v19,%%v17 \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
/* fold the two partial sums of each accumulator, pack both column sums
   into v16 and keep a re/im-swapped copy in v17 */
"vrepg %%v18,%%v16,1 \n\t"
|
||||
"vrepg %%v19,%%v17,1 \n\t"
|
||||
"vfasb %%v16,%%v16,%%v18 \n\t"
|
||||
"vfasb %%v17,%%v17,%%v19 \n\t"
|
||||
"vmrhg %%v16,%%v16,%%v17 \n\t"
|
||||
"verllg %%v17,%%v16,32 \n\t"
|
||||
#if !defined(XCONJ)
|
||||
/* v18 = alpha_r in every lane, v19 = (-alpha_i, alpha_i, -alpha_i, alpha_i) */
"vlrepf %%v18,0(%5) \n\t"
|
||||
"vlef %%v19,4(%5),0 \n\t"
|
||||
"vlef %%v19,4(%5),2 \n\t"
|
||||
"vflcsb %%v19,%%v19 \n\t"
|
||||
"vlef %%v19,4(%5),1 \n\t"
|
||||
"vlef %%v19,4(%5),3 \n\t"
|
||||
#else
|
||||
/* v18 = (alpha_r, -alpha_r, alpha_r, -alpha_r), v19 = alpha_i in every lane */
"vlef %%v18,0(%5),1 \n\t"
|
||||
"vlef %%v18,0(%5),3 \n\t"
|
||||
"vflcsb %%v18,%%v18 \n\t"
|
||||
"vlef %%v18,0(%5),0 \n\t"
|
||||
"vlef %%v18,0(%5),2 \n\t"
|
||||
"vlrepf %%v19,4(%5) \n\t"
|
||||
#endif
|
||||
/* y[0..3] += sums * v18 + swapped sums * v19 */
"vl %%v20,0(%4) \n\t"
|
||||
"vfmasb %%v20,%%v16,%%v18,%%v20 \n\t"
|
||||
"vfmasb %%v20,%%v17,%%v19,%%v20 \n\t"
|
||||
"vst %%v20,0(%4) "
|
||||
:
|
||||
/* operands: %0 = n, %1 / %2 = ap[0] / ap[1], %3 = x, %4 = y, %5 = alpha */
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha)
|
||||
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Single-column variant of the transposed complex single-precision GEMV
 * micro-kernel.
 *
 * Accumulates the dot product of column ap with x over n complex elements
 * (sign pattern selected by CONJ / XCONJ), multiplies the sum by the complex
 * alpha and adds it to the one complex value at y[0], y[1].
 *
 *   n     - number of complex elements; loop count is n >> 1 (two elements
 *           per iteration)
 *   ap    - the column (note: a plain FLOAT *, not an array of columns)
 *   x     - contiguous vector of n complex values
 *   y     - 2 FLOATs (1 complex result), read, updated and written back
 *   alpha - alpha[0] = real part, alpha[1] = imaginary part
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* v16: accumulator; r1: byte offset; r0: n/2 loop count */
"vzero %%v16 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"srlg %%r0,%0,1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
|
||||
/* v17 = (xr0, xi0, xr1, xi1) */
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
/* v18 = (-xi0, xr0, -xi1, xr1) */
"vlef %%v18,4(%%r1,%2),0 \n\t"
|
||||
"vlef %%v18,12(%%r1,%2),2 \n\t"
|
||||
"vflcsb %%v18,%%v18 \n\t"
|
||||
"vlef %%v18,0(%%r1,%2),1 \n\t"
|
||||
"vlef %%v18,8(%%r1,%2),3 \n\t"
|
||||
#else
|
||||
/* v18 = (xi0, -xr0, xi1, -xr1) */
"vlef %%v18,0(%%r1,%2),1 \n\t"
|
||||
"vlef %%v18,8(%%r1,%2),3 \n\t"
|
||||
"vflcsb %%v18,%%v18 \n\t"
|
||||
"vlef %%v18,4(%%r1,%2),0 \n\t"
|
||||
"vlef %%v18,12(%%r1,%2),2 \n\t"
|
||||
#endif
|
||||
|
||||
/* v19 / v20 = real / imaginary parts of the column, each duplicated over a
   lane pair */
"vlef %%v19,0(%%r1,%1),0 \n\t"
|
||||
"vlef %%v19,0(%%r1,%1),1 \n\t"
|
||||
"vlef %%v19,8(%%r1,%1),2 \n\t"
|
||||
"vlef %%v19,8(%%r1,%1),3 \n\t"
|
||||
"vlef %%v20,4(%%r1,%1),0 \n\t"
|
||||
"vlef %%v20,4(%%r1,%1),1 \n\t"
|
||||
"vlef %%v20,12(%%r1,%1),2 \n\t"
|
||||
"vlef %%v20,12(%%r1,%1),3 \n\t"
|
||||
|
||||
/* acc += re(a)*x + im(a)*shuffled(x) */
"vfmasb %%v16,%%v19,%%v17,%%v16 \n\t"
|
||||
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
/* fold the two partial sums into the first doubleword of v16; v17 is the
   same sum with real and imaginary words swapped */
"vrepg %%v17,%%v16,1 \n\t"
|
||||
"vfasb %%v16,%%v16,%%v17 \n\t"
|
||||
"verllg %%v17,%%v16,32 \n\t"
|
||||
#if !defined(XCONJ)
|
||||
/* only lanes 0 and 1 matter from here on: v18 = alpha_r, v19 = (-alpha_i, alpha_i, ..) */
"vlrepf %%v18,0(%4) \n\t"
|
||||
"vlef %%v19,4(%4),0 \n\t"
|
||||
"vflcsb %%v19,%%v19 \n\t"
|
||||
"vlef %%v19,4(%4),1 \n\t"
|
||||
#else
|
||||
/* only lanes 0 and 1 matter from here on: v18 = (alpha_r, -alpha_r, ..), v19 = alpha_i */
"vlef %%v18,0(%4),1 \n\t"
|
||||
"vflcsb %%v18,%%v18 \n\t"
|
||||
"vlef %%v18,0(%4),0 \n\t"
|
||||
"vlrepf %%v19,4(%4) \n\t"
|
||||
#endif
|
||||
/* load one complex y into the first doubleword, update it, store it back;
   the second doubleword of v20 still holds loop data and is never stored */
"vleg %%v20,0(%3),0 \n\t"
|
||||
"vfmasb %%v20,%%v16,%%v18,%%v20 \n\t"
|
||||
"vfmasb %%v20,%%v17,%%v19,%%v20 \n\t"
|
||||
"vsteg %%v20,0(%3),0 "
|
||||
:
|
||||
/* operands: %0 = n, %1 = ap, %2 = x, %3 = y, %4 = alpha */
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha)
|
||||
/* NOTE(review): v21..v23 are listed as clobbered although the asm body
   above never references them */
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Pack n complex values from a strided vector into a contiguous one.
 *
 * Each element is two FLOATs (real, imaginary).  inc_src is the distance
 * between consecutive source elements counted in FLOATs; dest always
 * advances by two FLOATs per element.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG remaining;

    for (remaining = n; remaining > 0; remaining--) {
        dest[0] = src[0];
        dest[1] = src[1];
        dest += 2;
        src  += inc_src;
    }
}
|
||||
|
||||
/*
 * Element arithmetic of the scalar tail below.  CGEMV_T_RE / CGEMV_T_IM
 * give the real / imaginary part of one matrix-entry-times-x product, with
 * the sign pattern chosen by the CONJ / XCONJ build flags.
 * CGEMV_T_UPDATE_Y adds alpha times an accumulated sum to one complex
 * entry of y.
 */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define CGEMV_T_RE(a_re, a_im, x_re, x_im) ((a_re) * (x_re) - (a_im) * (x_im))
#define CGEMV_T_IM(a_re, a_im, x_re, x_im) ((a_re) * (x_im) + (a_im) * (x_re))
#else
#define CGEMV_T_RE(a_re, a_im, x_re, x_im) ((a_re) * (x_re) + (a_im) * (x_im))
#define CGEMV_T_IM(a_re, a_im, x_re, x_im) ((a_re) * (x_im) - (a_im) * (x_re))
#endif

#if !defined(XCONJ)
#define CGEMV_T_UPDATE_Y(yp, al_re, al_im, t_re, t_im) \
    do { \
        (yp)[0] += (al_re) * (t_re) - (al_im) * (t_im); \
        (yp)[1] += (al_re) * (t_im) + (al_im) * (t_re); \
    } while (0)
#else
#define CGEMV_T_UPDATE_Y(yp, al_re, al_im, t_re, t_im) \
    do { \
        (yp)[0] += (al_re) * (t_re) + (al_im) * (t_im); \
        (yp)[1] -= (al_re) * (t_im) - (al_im) * (t_re); \
    } while (0)
#endif

/*
 * Driver for the transposed complex GEMV: for every one of the n columns of
 * A, alpha times the dot product of that column (m complex entries) with x
 * is added to the matching complex entry of y.
 *
 * lda, inc_x and inc_y arrive in units of complex elements and are doubled
 * on entry so that they count FLOATs.  Rows are consumed in blocks of at
 * most NBMAX through the vector kernels; the last m & 3 rows are handled by
 * the scalar code at the end.  buffer receives a packed copy of the current
 * block of x whenever x is not contiguous.  dummy1 is not used.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    FLOAT *col[4];              /* the columns handed to the 4x4 / 4x2 kernels */
    FLOAT  ytmp[8];             /* contiguous landing area when y is strided   */
    FLOAT  alpha[2];
    FLOAT *xvec = buffer;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    BLASLONG i, j, k;
    BLASLONG col_quads, col_rest;
    BLASLONG tail_rows, rows_left, last_block;
    BLASLONG four_cols;
    BLASLONG NB = NBMAX;

    if ( m < 1 || n < 1 )
        return(0);

    /* from complex-element units to FLOAT units */
    inc_x <<= 1;
    inc_y <<= 1;
    lda   <<= 1;
    four_cols = lda << 2;

    col_quads = n >> 2;
    col_rest  = n & 3;

    tail_rows  = m & 3;
    rows_left  = m - tail_rows;
    last_block = (m & (NBMAX-1)) - tail_rows;

    alpha[0] = alpha_r;
    alpha[1] = alpha_i;

    /* Full NBMAX blocks first; the loop ends after the one partial block
       (NB != NBMAX) or as soon as no rows are left. */
    while ( NB == NBMAX )
    {
        rows_left -= NB;
        if ( rows_left < 0 )
        {
            if ( last_block == 0 )
                break;
            NB = last_block;
        }

        y_ptr  = y;
        a_ptr  = a;
        col[0] = a;
        for ( k = 1; k < 4; k++ )
            col[k] = col[k-1] + lda;

        /* the kernels need x contiguous: use it in place or pack it */
        if ( inc_x == 2 )
            xvec = x;
        else
            copy_x(NB, x, xvec, inc_x);

        if ( inc_y == 2 )
        {
            /* contiguous y: kernels accumulate straight into it */
            for ( i = 0; i < col_quads; i++ )
            {
                cgemv_kernel_4x4(NB, col, xvec, y_ptr, alpha);
                for ( k = 0; k < 4; k++ )
                    col[k] += four_cols;
                a_ptr += four_cols;
                y_ptr += 8;
            }

            if ( col_rest & 2 )
            {
                cgemv_kernel_4x2(NB, col, xvec, y_ptr, alpha);
                a_ptr += lda * 2;
                y_ptr += 4;
            }

            if ( col_rest & 1 )
                cgemv_kernel_4x1(NB, a_ptr, xvec, y_ptr, alpha);
        }
        else
        {
            /* strided y: accumulate into a zeroed scratch, then scatter */
            for ( i = 0; i < col_quads; i++ )
            {
                memset(ytmp, 0, sizeof(ytmp));
                cgemv_kernel_4x4(NB, col, xvec, ytmp, alpha);
                for ( k = 0; k < 4; k++ )
                    col[k] += four_cols;
                a_ptr += four_cols;

                for ( k = 0; k < 4; k++ )
                {
                    y_ptr[0] += ytmp[2 * k];
                    y_ptr[1] += ytmp[2 * k + 1];
                    y_ptr += inc_y;
                }
            }

            for ( i = 0; i < col_rest; i++ )
            {
                memset(ytmp, 0, sizeof(ytmp));
                cgemv_kernel_4x1(NB, a_ptr, xvec, ytmp, alpha);
                a_ptr += lda;
                y_ptr[0] += ytmp[0];
                y_ptr[1] += ytmp[1];
                y_ptr += inc_y;
            }
        }

        /* step down to the next block of rows */
        a += 2 * NB;
        x += NB * inc_x;
    }

    if ( tail_rows == 0 )
        return(0);

    /* Scalar tail: the last one to three rows, one pass over all columns. */
    x_ptr = x;
    a_ptr = a;
    y_ptr = y;

    if ( tail_rows == 3 )
    {
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        FLOAT x2 = x_ptr[inc_x];
        FLOAT x3 = x_ptr[inc_x + 1];
        FLOAT x4 = x_ptr[2 * inc_x];
        FLOAT x5 = x_ptr[2 * inc_x + 1];
        FLOAT temp_r;
        FLOAT temp_i;

        for ( j = 0; j < n; j++ )
        {
            temp_r  = CGEMV_T_RE(a_ptr[0], a_ptr[1], x0, x1);
            temp_i  = CGEMV_T_IM(a_ptr[0], a_ptr[1], x0, x1);
            temp_r += CGEMV_T_RE(a_ptr[2], a_ptr[3], x2, x3);
            temp_i += CGEMV_T_IM(a_ptr[2], a_ptr[3], x2, x3);
            temp_r += CGEMV_T_RE(a_ptr[4], a_ptr[5], x4, x5);
            temp_i += CGEMV_T_IM(a_ptr[4], a_ptr[5], x4, x5);

            CGEMV_T_UPDATE_Y(y_ptr, alpha_r, alpha_i, temp_r, temp_i);

            a_ptr += lda;
            y_ptr += inc_y;
        }
        return(0);
    }

    if ( tail_rows == 2 )
    {
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        FLOAT x2 = x_ptr[inc_x];
        FLOAT x3 = x_ptr[inc_x + 1];
        FLOAT temp_r, temp_i;
        FLOAT temp_r1, temp_i1;

        /* two columns per pass */
        for ( j = 0; j < ( n & -2 ); j += 2 )
        {
            temp_r  = CGEMV_T_RE(a_ptr[0], a_ptr[1], x0, x1);
            temp_i  = CGEMV_T_IM(a_ptr[0], a_ptr[1], x0, x1);
            temp_r += CGEMV_T_RE(a_ptr[2], a_ptr[3], x2, x3);
            temp_i += CGEMV_T_IM(a_ptr[2], a_ptr[3], x2, x3);
            a_ptr += lda;
            temp_r1  = CGEMV_T_RE(a_ptr[0], a_ptr[1], x0, x1);
            temp_i1  = CGEMV_T_IM(a_ptr[0], a_ptr[1], x0, x1);
            temp_r1 += CGEMV_T_RE(a_ptr[2], a_ptr[3], x2, x3);
            temp_i1 += CGEMV_T_IM(a_ptr[2], a_ptr[3], x2, x3);

            CGEMV_T_UPDATE_Y(y_ptr, alpha_r, alpha_i, temp_r, temp_i);
            y_ptr += inc_y;
            CGEMV_T_UPDATE_Y(y_ptr, alpha_r, alpha_i, temp_r1, temp_i1);

            a_ptr += lda;
            y_ptr += inc_y;
        }

        /* odd column left over */
        for ( ; j < n; j++ )
        {
            temp_r  = CGEMV_T_RE(a_ptr[0], a_ptr[1], x0, x1);
            temp_i  = CGEMV_T_IM(a_ptr[0], a_ptr[1], x0, x1);
            temp_r += CGEMV_T_RE(a_ptr[2], a_ptr[3], x2, x3);
            temp_i += CGEMV_T_IM(a_ptr[2], a_ptr[3], x2, x3);

            CGEMV_T_UPDATE_Y(y_ptr, alpha_r, alpha_i, temp_r, temp_i);

            a_ptr += lda;
            y_ptr += inc_y;
        }
        return(0);
    }

    if ( tail_rows == 1 )
    {
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        FLOAT temp_r, temp_i;
        FLOAT temp_r1, temp_i1;

        /* two columns per pass */
        for ( j = 0; j < ( n & -2 ); j += 2 )
        {
            temp_r = CGEMV_T_RE(a_ptr[0], a_ptr[1], x0, x1);
            temp_i = CGEMV_T_IM(a_ptr[0], a_ptr[1], x0, x1);
            a_ptr += lda;
            temp_r1 = CGEMV_T_RE(a_ptr[0], a_ptr[1], x0, x1);
            temp_i1 = CGEMV_T_IM(a_ptr[0], a_ptr[1], x0, x1);

            CGEMV_T_UPDATE_Y(y_ptr, alpha_r, alpha_i, temp_r, temp_i);
            y_ptr += inc_y;
            CGEMV_T_UPDATE_Y(y_ptr, alpha_r, alpha_i, temp_r1, temp_i1);

            a_ptr += lda;
            y_ptr += inc_y;
        }

        /* odd column left over */
        for ( ; j < n; j++ )
        {
            temp_r = CGEMV_T_RE(a_ptr[0], a_ptr[1], x0, x1);
            temp_i = CGEMV_T_IM(a_ptr[0], a_ptr[1], x0, x1);

            CGEMV_T_UPDATE_Y(y_ptr, alpha_r, alpha_i, temp_r, temp_i);

            a_ptr += lda;
            y_ptr += inc_y;
        }
        return(0);
    }

    return(0);
}

#undef CGEMV_T_RE
#undef CGEMV_T_IM
#undef CGEMV_T_UPDATE_Y
|
||||
|
|
@ -0,0 +1,256 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel for a plane rotation of two contiguous complex
 * single-precision vectors with real coefficients.  Every FLOAT lane (real
 * and imaginary parts alike) is updated as
 *     x <- c*x + s*y
 *     y <- c*y - s*x      (using the old x)
 *
 *   n    - number of complex elements; the loop runs n >> 5 times and each
 *          pass covers 256 bytes of x and of y (32 complex elements).  The
 *          caller in this file passes n & -32 and only when that is > 0.
 *   x, y - the vectors, updated in place
 *   c, s - pointers to the coefficients; broadcast into v0 (c) and v1 (s)
 */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||
{
|
||||
__asm__ (
|
||||
/* v0 = c in every lane, v1 = s in every lane; r0 = n/32 passes; r1 = byte offset */
"vlrepf %%v0,%3 \n\t"
|
||||
"vlrepf %%v1,%4 \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch hints for both vectors, 1024 bytes ahead */
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
/* first 64-byte group: v24..v27 = x, v16..v19 = y */
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
|
||||
/* v28..v31 = x*c (start of new x); v20..v23 = x*s (to be subtracted for new y) */
"vfmsb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmsb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
/* new x = y*s + x*c; new y = y*c - x*s */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
/* write the rotated group back over x and y */
"vst %%v28, 0(%%r1,%1) \n\t"
|
||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
||||
|
||||
/* second 64-byte group (offsets 64..127), same sequence */
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmsb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmsb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
/* third 64-byte group (offsets 128..191), same sequence */
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmsb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmsb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
||||
|
||||
/* fourth 64-byte group (offsets 192..255), same sequence */
"vl %%v24, 192(%%r1,%1) \n\t"
|
||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmsb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmsb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmsb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
/* advance 256 bytes and loop while r0 counts down */
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
/* operands: %0 = n, %1 = x, %2 = y, %3 = *c, %4 = *s */
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Plane rotation of two complex vectors stored as interleaved (re, im) pairs:
 *
 *     x[k] <- c * x[k] + s * y[k]
 *     y[k] <- c * y[k] - s * x[k]
 *
 * n             number of complex elements; nothing is done when n <= 0
 * x, inc_x      first vector and its stride in complex elements
 * y, inc_y      second vector and its stride in complex elements
 * c, s          rotation coefficients, applied to both components
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k = 0;
    BLASLONG xoff = 0, yoff = 0;
    FLOAT new_xr, new_xi;

    if (n <= 0)
        return 0;

    if (inc_x != 1 || inc_y != 1) {
        /* Generic strided walk; offsets are in FLOAT units, hence the 2x. */
        const BLASLONG xstep = 2 * inc_x;
        const BLASLONG ystep = 2 * inc_y;

        for (; k < n; k++, xoff += xstep, yoff += ystep) {
            new_xr = c*x[xoff] + s*y[yoff];
            new_xi = c*x[xoff+1] + s*y[yoff+1];
            y[yoff] = c*y[yoff] - s*x[xoff];
            y[yoff+1] = c*y[yoff+1] - s*x[xoff+1];
            x[xoff] = new_xr;
            x[xoff+1] = new_xi;
        }
        return 0;
    }

    /* Unit strides: the largest multiple of 32 elements goes to the kernel. */
    BLASLONG bulk = n & -32;
    if (bulk > 0) {
        FLOAT cosa = c;
        FLOAT sina = s;

        crot_kernel_32(bulk, x, y, &cosa, &sina);
        k = bulk;
        xoff = 2 * bulk;
    }

    /* Scalar remainder; x and y share one offset because both are contiguous. */
    for (; k < n; k++, xoff += 2) {
        new_xr = c*x[xoff] + s*y[xoff];
        new_xi = c*x[xoff+1] + s*y[xoff+1];
        y[xoff] = c*y[xoff] - s*x[xoff];
        y[xoff+1] = c*y[xoff+1] - s*x[xoff+1];
        x[xoff] = new_xr;
        x[xoff+1] = new_xi;
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,456 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013 - 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * In-place complex scaling x[k] *= (alpha[0] + i*alpha[1]) over interleaved
 * (re, im) pairs of 32-bit elements.
 *
 * One loop trip covers 128 bytes of x (eight 16-byte vector registers) and
 * the trip count is n >> 4.  brctg decrements before it tests, so n has to
 * be at least 16; the driver in this file passes (n & -16) only when > 0.
 */
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
    __asm__ volatile(
        /* v0 = {ar, ar, ar, ar} */
        "vlrepf  %%v0,0(%[alpha])            \n\t"
        /* v1 = {-ai, +ai, -ai, +ai}: load lanes 0/2, negate, then lanes 1/3 */
        "vlef    %%v1,4(%[alpha]),0          \n\t"
        "vlef    %%v1,4(%[alpha]),2          \n\t"
        "vflcsb  %%v1,%%v1                   \n\t"
        "vlef    %%v1,4(%[alpha]),1          \n\t"
        "vlef    %%v1,4(%[alpha]),3          \n\t"
        /* r0 = trip count, r1 = byte offset into x */
        "srlg    %%r0,%[n],4                 \n\t"
        "xgr     %%r1,%%r1                   \n\t"
        "0:                                  \n\t"
        "pfd     2, 1024(%%r1,%[x])          \n\t"

        /* v16..v23 = {re, im, re, im} */
        "vl      %%v16,0(%%r1,%[x])          \n\t"
        "vl      %%v17,16(%%r1,%[x])         \n\t"
        "vl      %%v18,32(%%r1,%[x])         \n\t"
        "vl      %%v19,48(%%r1,%[x])         \n\t"
        "vl      %%v20,64(%%r1,%[x])         \n\t"
        "vl      %%v21,80(%%r1,%[x])         \n\t"
        "vl      %%v22,96(%%r1,%[x])         \n\t"
        "vl      %%v23,112(%%r1,%[x])        \n\t"

        /* v24..v31 = {im, re, im, re}: rotate each doubleword by 32 bits */
        "verllg  %%v24,%%v16,32              \n\t"
        "verllg  %%v25,%%v17,32              \n\t"
        "verllg  %%v26,%%v18,32              \n\t"
        "verllg  %%v27,%%v19,32              \n\t"
        "verllg  %%v28,%%v20,32              \n\t"
        "verllg  %%v29,%%v21,32              \n\t"
        "verllg  %%v30,%%v22,32              \n\t"
        "verllg  %%v31,%%v23,32              \n\t"

        /* {re*ar, im*ar, ...} */
        "vfmsb   %%v16,%%v16,%%v0            \n\t"
        "vfmsb   %%v17,%%v17,%%v0            \n\t"
        "vfmsb   %%v18,%%v18,%%v0            \n\t"
        "vfmsb   %%v19,%%v19,%%v0            \n\t"
        "vfmsb   %%v20,%%v20,%%v0            \n\t"
        "vfmsb   %%v21,%%v21,%%v0            \n\t"
        "vfmsb   %%v22,%%v22,%%v0            \n\t"
        "vfmsb   %%v23,%%v23,%%v0            \n\t"

        /* + {-im*ai, re*ai, ...} */
        "vfmasb  %%v16,%%v24,%%v1,%%v16      \n\t"
        "vfmasb  %%v17,%%v25,%%v1,%%v17      \n\t"
        "vfmasb  %%v18,%%v26,%%v1,%%v18      \n\t"
        "vfmasb  %%v19,%%v27,%%v1,%%v19      \n\t"
        "vfmasb  %%v20,%%v28,%%v1,%%v20      \n\t"
        "vfmasb  %%v21,%%v29,%%v1,%%v21      \n\t"
        "vfmasb  %%v22,%%v30,%%v1,%%v22      \n\t"
        "vfmasb  %%v23,%%v31,%%v1,%%v23      \n\t"

        "vst     %%v16,0(%%r1,%[x])          \n\t"
        "vst     %%v17,16(%%r1,%[x])         \n\t"
        "vst     %%v18,32(%%r1,%[x])         \n\t"
        "vst     %%v19,48(%%r1,%[x])         \n\t"
        "vst     %%v20,64(%%r1,%[x])         \n\t"
        "vst     %%v21,80(%%r1,%[x])         \n\t"
        "vst     %%v22,96(%%r1,%[x])         \n\t"
        "vst     %%v23,112(%%r1,%[x])        \n\t"

        "agfi    %%r1,128                    \n\t"
        "brctg   %%r0,0b                         "
        :
        : [n] "r"(n),
          [alpha] "ZQ"((const FLOAT (*)[2])alpha),
          [x] "ZR"((FLOAT (*)[n * 2])x)
        : "memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * In-place complex scaling by a scalar whose real part is treated as zero:
 * only alpha[1] (ai) is read, and each (re, im) pair becomes
 * (-ai * im, ai * re).
 *
 * One loop trip covers 128 bytes of x; the trip count is n >> 4, so n must
 * be at least 16 (brctg decrements before testing).
 */
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
    __asm__ volatile(
        /* v0 = {-ai, +ai, -ai, +ai}: load lanes 0/2, negate, then lanes 1/3 */
        "vlef    %%v0,4(%[alpha]),0          \n\t"
        "vlef    %%v0,4(%[alpha]),2          \n\t"
        "vflcsb  %%v0,%%v0                   \n\t"
        "vlef    %%v0,4(%[alpha]),1          \n\t"
        "vlef    %%v0,4(%[alpha]),3          \n\t"
        /* r0 = trip count, r1 = byte offset into x */
        "srlg    %%r0,%[n],4                 \n\t"
        "xgr     %%r1,%%r1                   \n\t"
        "0:                                  \n\t"
        "pfd     2, 1024(%%r1,%[x])          \n\t"

        "vl      %%v16,0(%%r1,%[x])          \n\t"
        "vl      %%v17,16(%%r1,%[x])         \n\t"
        "vl      %%v18,32(%%r1,%[x])         \n\t"
        "vl      %%v19,48(%%r1,%[x])         \n\t"
        "vl      %%v20,64(%%r1,%[x])         \n\t"
        "vl      %%v21,80(%%r1,%[x])         \n\t"
        "vl      %%v22,96(%%r1,%[x])         \n\t"
        "vl      %%v23,112(%%r1,%[x])        \n\t"

        /* swap re/im inside every pair: {re, im} -> {im, re} */
        "verllg  %%v16,%%v16,32              \n\t"
        "verllg  %%v17,%%v17,32              \n\t"
        "verllg  %%v18,%%v18,32              \n\t"
        "verllg  %%v19,%%v19,32              \n\t"
        "verllg  %%v20,%%v20,32              \n\t"
        "verllg  %%v21,%%v21,32              \n\t"
        "verllg  %%v22,%%v22,32              \n\t"
        "verllg  %%v23,%%v23,32              \n\t"

        /* {im * -ai, re * ai, ...} */
        "vfmsb   %%v16,%%v16,%%v0            \n\t"
        "vfmsb   %%v17,%%v17,%%v0            \n\t"
        "vfmsb   %%v18,%%v18,%%v0            \n\t"
        "vfmsb   %%v19,%%v19,%%v0            \n\t"
        "vfmsb   %%v20,%%v20,%%v0            \n\t"
        "vfmsb   %%v21,%%v21,%%v0            \n\t"
        "vfmsb   %%v22,%%v22,%%v0            \n\t"
        "vfmsb   %%v23,%%v23,%%v0            \n\t"

        "vst     %%v16,0(%%r1,%[x])          \n\t"
        "vst     %%v17,16(%%r1,%[x])         \n\t"
        "vst     %%v18,32(%%r1,%[x])         \n\t"
        "vst     %%v19,48(%%r1,%[x])         \n\t"
        "vst     %%v20,64(%%r1,%[x])         \n\t"
        "vst     %%v21,80(%%r1,%[x])         \n\t"
        "vst     %%v22,96(%%r1,%[x])         \n\t"
        "vst     %%v23,112(%%r1,%[x])        \n\t"

        "agfi    %%r1,128                    \n\t"
        "brctg   %%r0,0b                         "
        :
        : [n] "r"(n),
          [alpha] "ZQ"((const FLOAT (*)[2])alpha),
          [x] "ZR"((FLOAT (*)[n * 2])x)
        : "memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
    );
}
|
||||
|
||||
/*
 * In-place complex scaling by a scalar whose imaginary part is treated as
 * zero: only alpha[0] (ar) is read and every element of x, real or
 * imaginary, is multiplied by it.
 *
 * One loop trip covers 128 bytes of x; the trip count is n >> 4, so n must
 * be at least 16 (brctg decrements before testing).
 */
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
    __asm__ volatile(
        /* v0 = {ar, ar, ar, ar} */
        "vlrepf  %%v0,0(%[alpha])            \n\t"
        /* r0 = trip count, r1 = byte offset into x */
        "srlg    %%r0,%[n],4                 \n\t"
        "xgr     %%r1,%%r1                   \n\t"
        "0:                                  \n\t"
        "pfd     2, 1024(%%r1,%[x])          \n\t"

        "vl      %%v16,0(%%r1,%[x])          \n\t"
        "vl      %%v17,16(%%r1,%[x])         \n\t"
        "vl      %%v18,32(%%r1,%[x])         \n\t"
        "vl      %%v19,48(%%r1,%[x])         \n\t"
        "vl      %%v20,64(%%r1,%[x])         \n\t"
        "vl      %%v21,80(%%r1,%[x])         \n\t"
        "vl      %%v22,96(%%r1,%[x])         \n\t"
        "vl      %%v23,112(%%r1,%[x])        \n\t"

        "vfmsb   %%v16,%%v16,%%v0            \n\t"
        "vfmsb   %%v17,%%v17,%%v0            \n\t"
        "vfmsb   %%v18,%%v18,%%v0            \n\t"
        "vfmsb   %%v19,%%v19,%%v0            \n\t"
        "vfmsb   %%v20,%%v20,%%v0            \n\t"
        "vfmsb   %%v21,%%v21,%%v0            \n\t"
        "vfmsb   %%v22,%%v22,%%v0            \n\t"
        "vfmsb   %%v23,%%v23,%%v0            \n\t"

        "vst     %%v16,0(%%r1,%[x])          \n\t"
        "vst     %%v17,16(%%r1,%[x])         \n\t"
        "vst     %%v18,32(%%r1,%[x])         \n\t"
        "vst     %%v19,48(%%r1,%[x])         \n\t"
        "vst     %%v20,64(%%r1,%[x])         \n\t"
        "vst     %%v21,80(%%r1,%[x])         \n\t"
        "vst     %%v22,96(%%r1,%[x])         \n\t"
        "vst     %%v23,112(%%r1,%[x])        \n\t"

        "agfi    %%r1,128                    \n\t"
        "brctg   %%r0,0b                         "
        :
        : [n] "r"(n),
          [alpha] "ZQ"((const FLOAT (*)[2])alpha),
          [x] "ZR"((FLOAT (*)[n * 2])x)
        : "memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
    );
}
|
||||
|
||||
/*
 * Overwrites x with zero bytes, 128 bytes per loop trip, for n >> 4 trips.
 * x is never read, so existing NaN/Inf contents are simply replaced.
 * n must be at least 16 (brctg decrements before testing).
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
    __asm__ volatile(
        /* four zeroed registers, each stored twice per trip */
        "vzero   %%v24                       \n\t"
        "vzero   %%v25                       \n\t"
        "vzero   %%v26                       \n\t"
        "vzero   %%v27                       \n\t"
        /* r0 = trip count, r1 = byte offset into x */
        "srlg    %%r0,%[n],4                 \n\t"
        "xgr     %%r1,%%r1                   \n\t"
        "0:                                  \n\t"
        "pfd     2, 1024(%%r1,%[x])          \n\t"

        "vst     %%v24,0(%%r1,%[x])          \n\t"
        "vst     %%v25,16(%%r1,%[x])         \n\t"
        "vst     %%v26,32(%%r1,%[x])         \n\t"
        "vst     %%v27,48(%%r1,%[x])         \n\t"
        "vst     %%v24,64(%%r1,%[x])         \n\t"
        "vst     %%v25,80(%%r1,%[x])         \n\t"
        "vst     %%v26,96(%%r1,%[x])         \n\t"
        "vst     %%v27,112(%%r1,%[x])        \n\t"

        "agfi    %%r1,128                    \n\t"
        "brctg   %%r0,0b                         "
        :
        : [n] "r"(n),
          [x] "ZR"((FLOAT (*)[n * 2])x)
        : "memory","cc","r0","r1","v24","v25","v26","v27"
    );
}
|
||||
|
||||
/*
 * Strided in-place complex scaling, four elements per loop trip.
 *
 * n       number of complex elements; the loop advances in steps of four
 * alpha   {real, imaginary} parts of the scalar
 * x       interleaved (re, im) data
 * inc_x   distance between consecutive complex elements in FLOAT units
 *         (the driver in this file passes the already doubled stride)
 *
 * All four new real parts are computed before anything is stored, then the
 * imaginary parts are written, then the real parts.
 */
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
    const BLASLONG off1 = inc_x;
    const BLASLONG off2 = 2 * inc_x;
    const BLASLONG off3 = off2 + inc_x;
    const FLOAT ar = alpha[0];
    const FLOAT ai = alpha[1];
    BLASLONG done;

    for (done = 0; done < n; done += 4, x += 4 * inc_x) {
        /* new real parts, held back until the imaginary parts are stored */
        const FLOAT re0 = ar * x[0] - ai * x[1];
        const FLOAT re1 = ar * x[off1] - ai * x[off1 + 1];
        const FLOAT re2 = ar * x[off2] - ai * x[off2 + 1];
        const FLOAT re3 = ar * x[off3] - ai * x[off3 + 1];

        /* imaginary parts still read the old real parts */
        x[1] = ai * x[0] + ar * x[1];
        x[off1 + 1] = ai * x[off1] + ar * x[off1 + 1];
        x[off2 + 1] = ai * x[off2] + ar * x[off2 + 1];
        x[off3 + 1] = ai * x[off3] + ar * x[off3 + 1];

        x[0] = re0;
        x[off1] = re1;
        x[off2] = re2;
        x[off3] = re3;
    }
}
|
||||
|
||||
/*
 * Scales a complex vector in place: x[k] *= (da_r + i*da_i).
 *
 * n            number of complex elements
 * da_r, da_i   real and imaginary part of the scalar
 * x, inc_x     interleaved (re, im) data and its stride in complex elements
 * dummy0, dummy1, y, inc_y, dummy, dummy2 are not used here.
 *
 * A part of the scalar that compares equal to zero is skipped rather than
 * multiplied; when both parts are zero, x is overwritten with zeros without
 * being read.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG pos = 0;     /* offset into x, in FLOAT units */
    BLASLONG done = 0;    /* complex elements handled so far */
    FLOAT re0;
    FLOAT re1;
    FLOAT alpha[2] __attribute__ ((aligned(16)));

    if (inc_x != 1) {
        /* Strided data: scalar code, with the stride converted to FLOATs. */
        const BLASLONG step = inc_x << 1;
        const BLASLONG pairs = n & -2;

        if (da_r == 0.0 && da_i == 0.0) {

            for (; done < pairs; done += 2, pos += 2 * step) {
                x[pos] = 0.0;
                x[pos + 1] = 0.0;
                x[pos + step] = 0.0;
                x[pos + 1 + step] = 0.0;
            }
            for (; done < n; done++, pos += step) {
                x[pos] = 0.0;
                x[pos + 1] = 0.0;
            }

        } else if (da_r == 0.0) {

            /* purely imaginary scalar: (re, im) -> (-da_i*im, da_i*re) */
            for (; done < pairs; done += 2, pos += 2 * step) {
                re0 = -da_i * x[pos + 1];
                x[pos + 1] = da_i * x[pos];
                x[pos] = re0;
                re1 = -da_i * x[pos + 1 + step];
                x[pos + 1 + step] = da_i * x[pos + step];
                x[pos + step] = re1;
            }
            for (; done < n; done++, pos += step) {
                re0 = -da_i * x[pos + 1];
                x[pos + 1] = da_i * x[pos];
                x[pos] = re0;
            }

        } else if (da_i == 0.0) {

            /* purely real scalar: both components are scaled by da_r */
            for (; done < pairs; done += 2, pos += 2 * step) {
                re0 = da_r * x[pos];
                x[pos + 1] = da_r * x[pos + 1];
                x[pos] = re0;
                re1 = da_r * x[pos + step];
                x[pos + 1 + step] = da_r * x[pos + 1 + step];
                x[pos + step] = re1;
            }
            for (; done < n; done++, pos += step) {
                re0 = da_r * x[pos];
                x[pos + 1] = da_r * x[pos + 1];
                x[pos] = re0;
            }

        } else {

            /* general scalar: multiples of 8 go through the strided helper */
            const BLASLONG bulk = n & -8;
            if (bulk > 0) {
                alpha[0] = da_r;
                alpha[1] = da_i;
                cscal_kernel_inc_8(bulk, alpha, x, step);
                done = bulk;
                pos = bulk * step;
            }
            for (; done < n; done++, pos += step) {
                re0 = da_r * x[pos] - da_i * x[pos + 1];
                x[pos + 1] = da_r * x[pos + 1] + da_i * x[pos];
                x[pos] = re0;
            }

        }

        return (0);
    }

    /* Contiguous data: the largest multiple of 16 goes to a vector kernel. */
    const BLASLONG bulk = n & -16;
    if (bulk > 0) {
        alpha[0] = da_r;
        alpha[1] = da_i;

        if (da_r == 0.0) {
            if (da_i == 0)
                cscal_kernel_16_zero(bulk, x);
            else
                cscal_kernel_16_zero_r(bulk, alpha, x);
        } else if (da_i == 0) {
            cscal_kernel_16_zero_i(bulk, alpha, x);
        } else {
            cscal_kernel_16(bulk, alpha, x);
        }

        pos = bulk << 1;
        done = bulk;
    }

    /* Scalar remainder, using the same case split as the kernels. */
    if (da_r == 0.0 && da_i == 0.0) {

        for (; done < n; done++, pos += 2) {
            x[pos] = 0.0;
            x[pos + 1] = 0.0;
        }

    } else if (da_r == 0.0) {

        for (; done < n; done++, pos += 2) {
            re0 = -da_i * x[pos + 1];
            x[pos + 1] = da_i * x[pos];
            x[pos] = re0;
        }

    } else if (da_i == 0.0) {

        for (; done < n; done++, pos += 2) {
            re0 = da_r * x[pos];
            x[pos + 1] = da_r * x[pos + 1];
            x[pos] = re0;
        }

    } else {

        for (; done < n; done++, pos += 2) {
            re0 = da_r * x[pos] - da_i * x[pos + 1];
            x[pos + 1] = da_r * x[pos + 1] + da_i * x[pos];
            x[pos] = re0;
        }

    }

    return (0);
}
|
||||
|
|
@ -0,0 +1,183 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Exchanges the contents of x and y, 256 bytes of each per loop trip, for
 * n >> 5 trips.  n must be at least 32 (brctg decrements before testing).
 *
 * Per trip: the whole 256-byte x block is parked in v16..v31, the y block is
 * copied into x in two 128-byte halves through v0..v7, and finally the
 * parked x block is stored to y.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile(
        /* r0 = trip count, r1 = byte offset shared by x and y */
        "srlg  %%r0,%[n],5                   \n\t"
        "xgr   %%r1,%%r1                     \n\t"
        "0:                                  \n\t"
        "pfd   2, 1024(%%r1,%[x])            \n\t"
        "pfd   2, 1024(%%r1,%[y])            \n\t"

        /* park x[0..255] */
        "vl    %%v16, 0(%%r1,%[x])           \n\t"
        "vl    %%v17, 16(%%r1,%[x])          \n\t"
        "vl    %%v18, 32(%%r1,%[x])          \n\t"
        "vl    %%v19, 48(%%r1,%[x])          \n\t"
        "vl    %%v20, 64(%%r1,%[x])          \n\t"
        "vl    %%v21, 80(%%r1,%[x])          \n\t"
        "vl    %%v22, 96(%%r1,%[x])          \n\t"
        "vl    %%v23, 112(%%r1,%[x])         \n\t"
        "vl    %%v24, 128(%%r1,%[x])         \n\t"
        "vl    %%v25, 144(%%r1,%[x])         \n\t"
        "vl    %%v26, 160(%%r1,%[x])         \n\t"
        "vl    %%v27, 176(%%r1,%[x])         \n\t"
        "vl    %%v28, 192(%%r1,%[x])         \n\t"
        "vl    %%v29, 208(%%r1,%[x])         \n\t"
        "vl    %%v30, 224(%%r1,%[x])         \n\t"
        "vl    %%v31, 240(%%r1,%[x])         \n\t"

        /* y[0..127] -> x[0..127] */
        "vl    %%v0, 0(%%r1,%[y])            \n\t"
        "vl    %%v1, 16(%%r1,%[y])           \n\t"
        "vl    %%v2, 32(%%r1,%[y])           \n\t"
        "vl    %%v3, 48(%%r1,%[y])           \n\t"
        "vl    %%v4, 64(%%r1,%[y])           \n\t"
        "vl    %%v5, 80(%%r1,%[y])           \n\t"
        "vl    %%v6, 96(%%r1,%[y])           \n\t"
        "vl    %%v7, 112(%%r1,%[y])          \n\t"
        "vst   %%v0, 0(%%r1,%[x])            \n\t"
        "vst   %%v1, 16(%%r1,%[x])           \n\t"
        "vst   %%v2, 32(%%r1,%[x])           \n\t"
        "vst   %%v3, 48(%%r1,%[x])           \n\t"
        "vst   %%v4, 64(%%r1,%[x])           \n\t"
        "vst   %%v5, 80(%%r1,%[x])           \n\t"
        "vst   %%v6, 96(%%r1,%[x])           \n\t"
        "vst   %%v7, 112(%%r1,%[x])          \n\t"

        /* y[128..255] -> x[128..255] */
        "vl    %%v0, 128(%%r1,%[y])          \n\t"
        "vl    %%v1, 144(%%r1,%[y])          \n\t"
        "vl    %%v2, 160(%%r1,%[y])          \n\t"
        "vl    %%v3, 176(%%r1,%[y])          \n\t"
        "vl    %%v4, 192(%%r1,%[y])          \n\t"
        "vl    %%v5, 208(%%r1,%[y])          \n\t"
        "vl    %%v6, 224(%%r1,%[y])          \n\t"
        "vl    %%v7, 240(%%r1,%[y])          \n\t"
        "vst   %%v0, 128(%%r1,%[x])          \n\t"
        "vst   %%v1, 144(%%r1,%[x])          \n\t"
        "vst   %%v2, 160(%%r1,%[x])          \n\t"
        "vst   %%v3, 176(%%r1,%[x])          \n\t"
        "vst   %%v4, 192(%%r1,%[x])          \n\t"
        "vst   %%v5, 208(%%r1,%[x])          \n\t"
        "vst   %%v6, 224(%%r1,%[x])          \n\t"
        "vst   %%v7, 240(%%r1,%[x])          \n\t"

        /* parked x block -> y[0..255] */
        "vst   %%v16, 0(%%r1,%[y])           \n\t"
        "vst   %%v17, 16(%%r1,%[y])          \n\t"
        "vst   %%v18, 32(%%r1,%[y])          \n\t"
        "vst   %%v19, 48(%%r1,%[y])          \n\t"
        "vst   %%v20, 64(%%r1,%[y])          \n\t"
        "vst   %%v21, 80(%%r1,%[y])          \n\t"
        "vst   %%v22, 96(%%r1,%[y])          \n\t"
        "vst   %%v23, 112(%%r1,%[y])         \n\t"
        "vst   %%v24, 128(%%r1,%[y])         \n\t"
        "vst   %%v25, 144(%%r1,%[y])         \n\t"
        "vst   %%v26, 160(%%r1,%[y])         \n\t"
        "vst   %%v27, 176(%%r1,%[y])         \n\t"
        "vst   %%v28, 192(%%r1,%[y])         \n\t"
        "vst   %%v29, 208(%%r1,%[y])         \n\t"
        "vst   %%v30, 224(%%r1,%[y])         \n\t"
        "vst   %%v31, 240(%%r1,%[y])         \n\t"

        "agfi  %%r1,256                      \n\t"
        "brctg %%r0,0b                           "
        :
        : [n] "r"(n),
          [x] "ZR"((FLOAT (*)[n * 2])x),
          [y] "ZR"((FLOAT (*)[n * 2])y)
        : "memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Swaps two complex vectors stored as interleaved (re, im) pairs.
 *
 * n            number of complex elements; nothing is done when n <= 0
 * x, inc_x     first vector and its stride in complex elements
 * y, inc_y     second vector and its stride in complex elements
 * dummy0, dummy1, dummy3, dummy4, dummy, dummy2 are not used here.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k = 0;
    BLASLONG xoff = 0, yoff = 0;
    BLASLONG xstep, ystep;
    FLOAT saved_re, saved_im;

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        /* Unit strides: the largest multiple of 32 goes to the kernel. */
        BLASLONG bulk = n & -32;
        if (bulk > 0) {
            cswap_kernel_32(bulk, x, y);
            k = bulk;
            xoff = 2 * bulk;
            yoff = 2 * bulk;
        }
        xstep = 2;
        ystep = 2;
    } else {
        /* Offsets are in FLOAT units, hence twice the element stride. */
        xstep = 2 * inc_x;
        ystep = 2 * inc_y;
    }

    /* Scalar exchange for the remainder (or for everything when strided). */
    for (; k < n; k++, xoff += xstep, yoff += ystep) {
        saved_re = x[xoff];
        saved_im = x[xoff+1];
        x[xoff] = y[yoff];
        x[xoff+1] = y[yoff+1];
        y[yoff] = saved_re;
        y[yoff+1] = saved_im;
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,166 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* ABS: absolute value at the precision this kernel is built for
 * (fabs when DOUBLE is defined, fabsf otherwise). */
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
|
||||
|
||||
/*
 * Returns the largest absolute value among the first n elements of x, using
 * the vector maximum instruction (vfmaxdb/wfmaxdb, 64-bit elements).
 *
 * One loop trip covers 256 bytes of x (sixteen vector registers, two 64-bit
 * elements each) and the trip count is n >> 5, so n must be at least 32
 * (brctg decrements before testing).
 *
 * The maximum instructions are issued with function code 8, which per the
 * z/Architecture vector enhancements facility compares by magnitude while
 * the selected operand keeps its sign; the final lpdr therefore takes the
 * absolute value of the winner.
 *
 * Fix relative to the previous revision: the running-maximum update named
 * its third operand "%%16"; it is now "%%v16" like every other vector
 * register reference in this kernel.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* seed the running maximum (v0) with the first two elements */
        "vl      %%v0,0(%[x])                \n\t"
        /* r0 = trip count, r1 = byte offset into x */
        "srlg    %%r0,%[n],5                 \n\t"
        "xgr     %%r1,%%r1                   \n\t"
        "0:                                  \n\t"
        "pfd     1, 1024(%%r1,%[x])          \n\t"

        "vl      %%v16,0(%%r1,%[x])          \n\t"
        "vl      %%v17,16(%%r1,%[x])         \n\t"
        "vl      %%v18,32(%%r1,%[x])         \n\t"
        "vl      %%v19,48(%%r1,%[x])         \n\t"
        "vl      %%v20,64(%%r1,%[x])         \n\t"
        "vl      %%v21,80(%%r1,%[x])         \n\t"
        "vl      %%v22,96(%%r1,%[x])         \n\t"
        "vl      %%v23,112(%%r1,%[x])        \n\t"
        "vl      %%v24,128(%%r1,%[x])        \n\t"
        "vl      %%v25,144(%%r1,%[x])        \n\t"
        "vl      %%v26,160(%%r1,%[x])        \n\t"
        "vl      %%v27,176(%%r1,%[x])        \n\t"
        "vl      %%v28,192(%%r1,%[x])        \n\t"
        "vl      %%v29,208(%%r1,%[x])        \n\t"
        "vl      %%v30,224(%%r1,%[x])        \n\t"
        "vl      %%v31,240(%%r1,%[x])        \n\t"

        /* tree reduction: 16 -> 8 registers */
        "vfmaxdb %%v16,%%v16,%%v24,8         \n\t"
        "vfmaxdb %%v17,%%v17,%%v25,8         \n\t"
        "vfmaxdb %%v18,%%v18,%%v26,8         \n\t"
        "vfmaxdb %%v19,%%v19,%%v27,8         \n\t"
        "vfmaxdb %%v20,%%v20,%%v28,8         \n\t"
        "vfmaxdb %%v21,%%v21,%%v29,8         \n\t"
        "vfmaxdb %%v22,%%v22,%%v30,8         \n\t"
        "vfmaxdb %%v23,%%v23,%%v31,8         \n\t"

        /* 8 -> 4 */
        "vfmaxdb %%v16,%%v16,%%v20,8         \n\t"
        "vfmaxdb %%v17,%%v17,%%v21,8         \n\t"
        "vfmaxdb %%v18,%%v18,%%v22,8         \n\t"
        "vfmaxdb %%v19,%%v19,%%v23,8         \n\t"

        /* 4 -> 2 */
        "vfmaxdb %%v16,%%v16,%%v18,8         \n\t"
        "vfmaxdb %%v17,%%v17,%%v19,8         \n\t"

        /* 2 -> 1 */
        "vfmaxdb %%v16,%%v16,%%v17,8         \n\t"

        /* fold this trip's result into the running maximum */
        "vfmaxdb %%v0,%%v0,%%v16,8           \n\t"

        "agfi    %%r1, 256                   \n\t"
        "brctg   %%r0, 0b                    \n\t"

        /* combine the two lanes of v0, then take the absolute value */
        "vrepg   %%v16,%%v0,1                \n\t"
        "wfmaxdb %%v0,%%v0,%%v16,8           \n\t"
        "lpdr    %[amax],%%f0                    "
        : [amax] "=f"(amax)
        : [n] "r"(n),
          [x] "ZR"((const FLOAT (*)[n])x)
        : "memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amax;
}
|
||||
|
||||
/*
 * Returns the largest absolute value among n elements of x taken with
 * stride inc_x.  Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;     /* index into x */
    BLASLONG seen = 0;    /* elements examined on the strided path */
    BLASLONG lane;
    FLOAT best = 0.0;

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        /* Contiguous: the largest multiple of 32 goes to the vector kernel. */
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = damax_kernel_32(bulk, x);
            pos = bulk;
        } else {
            best = ABS(x[0]);
            pos++;
        }

        for (; pos < n; pos++) {
            if (ABS(x[pos]) > best) {
                best = ABS(x[pos]);
            }
        }
        return (best);
    }

    /* Strided: groups of four elements, then the leftovers one by one. */
    best = ABS(x[0]);

    BLASLONG quads = n & -4;
    for (; seen < quads; seen += 4, pos += inc_x * 4) {
        for (lane = 0; lane < 4; lane++) {
            if (ABS(x[pos + lane * inc_x]) > best) {
                best = ABS(x[pos + lane * inc_x]);
            }
        }
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (ABS(x[pos]) > best) {
            best = ABS(x[pos]);
        }
    }
    return (best);
}
|
||||
|
|
@ -0,0 +1,204 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* ABS: absolute value at the precision this kernel is built for
 * (fabs when DOUBLE is defined, fabsf otherwise). */
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
|
||||
|
||||
/*
 * Returns the largest absolute value among the first n elements of x
 * (64-bit elements), built from load-positive, compare-high and select.
 *
 * One loop trip covers 256 bytes of x as two 128-byte halves; the trip count
 * is n >> 5, so n must be at least 32 (brctg decrements before testing).
 * Each "vfchdb m,a,b" followed by "vsel r,a,b,m" keeps a where a > b and
 * b otherwise.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* v0 = |first two elements|: seed of the running maximum */
        "vl      %%v0,0(%[x])                \n\t"
        "vflpdb  %%v0,%%v0                   \n\t"
        /* r0 = trip count, r1 = byte offset into x */
        "srlg    %%r0,%[n],5                 \n\t"
        "xgr     %%r1,%%r1                   \n\t"
        "0:                                  \n\t"
        "pfd     1, 1024(%%r1,%[x])          \n\t"

        /* ---- first half: bytes 0..127 ---- */
        "vl      %%v16,0(%%r1,%[x])          \n\t"
        "vl      %%v17,16(%%r1,%[x])         \n\t"
        "vl      %%v18,32(%%r1,%[x])         \n\t"
        "vl      %%v19,48(%%r1,%[x])         \n\t"
        "vl      %%v20,64(%%r1,%[x])         \n\t"
        "vl      %%v21,80(%%r1,%[x])         \n\t"
        "vl      %%v22,96(%%r1,%[x])         \n\t"
        "vl      %%v23,112(%%r1,%[x])        \n\t"
        "vflpdb  %%v16, %%v16                \n\t"
        "vflpdb  %%v17, %%v17                \n\t"
        "vflpdb  %%v18, %%v18                \n\t"
        "vflpdb  %%v19, %%v19                \n\t"
        "vflpdb  %%v20, %%v20                \n\t"
        "vflpdb  %%v21, %%v21                \n\t"
        "vflpdb  %%v22, %%v22                \n\t"
        "vflpdb  %%v23, %%v23                \n\t"

        /* 8 -> 4 */
        "vfchdb  %%v24,%%v16,%%v17           \n\t"
        "vfchdb  %%v25,%%v18,%%v19           \n\t"
        "vfchdb  %%v26,%%v20,%%v21           \n\t"
        "vfchdb  %%v27,%%v22,%%v23           \n\t"
        "vsel    %%v24,%%v16,%%v17,%%v24     \n\t"
        "vsel    %%v25,%%v18,%%v19,%%v25     \n\t"
        "vsel    %%v26,%%v20,%%v21,%%v26     \n\t"
        "vsel    %%v27,%%v22,%%v23,%%v27     \n\t"

        /* 4 -> 2 */
        "vfchdb  %%v28,%%v24,%%v25           \n\t"
        "vfchdb  %%v29,%%v26,%%v27           \n\t"
        "vsel    %%v28,%%v24,%%v25,%%v28     \n\t"
        "vsel    %%v29,%%v26,%%v27,%%v29     \n\t"

        /* 2 -> 1 */
        "vfchdb  %%v30,%%v28,%%v29           \n\t"
        "vsel    %%v30,%%v28,%%v29,%%v30     \n\t"

        /* fold into the running maximum */
        "vfchdb  %%v31,%%v30,%%v0            \n\t"
        "vsel    %%v0,%%v30,%%v0,%%v31       \n\t"

        /* ---- second half: bytes 128..255 ---- */
        "vl      %%v16,128(%%r1,%[x])        \n\t"
        "vl      %%v17,144(%%r1,%[x])        \n\t"
        "vl      %%v18,160(%%r1,%[x])        \n\t"
        "vl      %%v19,176(%%r1,%[x])        \n\t"
        "vl      %%v20,192(%%r1,%[x])        \n\t"
        "vl      %%v21,208(%%r1,%[x])        \n\t"
        "vl      %%v22,224(%%r1,%[x])        \n\t"
        "vl      %%v23,240(%%r1,%[x])        \n\t"
        "vflpdb  %%v16, %%v16                \n\t"
        "vflpdb  %%v17, %%v17                \n\t"
        "vflpdb  %%v18, %%v18                \n\t"
        "vflpdb  %%v19, %%v19                \n\t"
        "vflpdb  %%v20, %%v20                \n\t"
        "vflpdb  %%v21, %%v21                \n\t"
        "vflpdb  %%v22, %%v22                \n\t"
        "vflpdb  %%v23, %%v23                \n\t"

        /* 8 -> 4 */
        "vfchdb  %%v24,%%v16,%%v17           \n\t"
        "vfchdb  %%v25,%%v18,%%v19           \n\t"
        "vfchdb  %%v26,%%v20,%%v21           \n\t"
        "vfchdb  %%v27,%%v22,%%v23           \n\t"
        "vsel    %%v24,%%v16,%%v17,%%v24     \n\t"
        "vsel    %%v25,%%v18,%%v19,%%v25     \n\t"
        "vsel    %%v26,%%v20,%%v21,%%v26     \n\t"
        "vsel    %%v27,%%v22,%%v23,%%v27     \n\t"

        /* 4 -> 2 */
        "vfchdb  %%v28,%%v24,%%v25           \n\t"
        "vfchdb  %%v29,%%v26,%%v27           \n\t"
        "vsel    %%v28,%%v24,%%v25,%%v28     \n\t"
        "vsel    %%v29,%%v26,%%v27,%%v29     \n\t"

        /* 2 -> 1 */
        "vfchdb  %%v30,%%v28,%%v29           \n\t"
        "vsel    %%v30,%%v28,%%v29,%%v30     \n\t"

        /* fold into the running maximum */
        "vfchdb  %%v31,%%v30,%%v0            \n\t"
        "vsel    %%v0,%%v30,%%v0,%%v31       \n\t"

        "agfi    %%r1, 256                   \n\t"
        "brctg   %%r0, 0b                    \n\t"

        /* combine the two lanes of v0 and hand the result back through f0 */
        "vrepg   %%v16,%%v0,1                \n\t"
        "wfchdb  %%v17,%%v0,%%v16            \n\t"
        "vsel    %%v0,%%v0,%%v16,%%v17       \n\t"
        "ldr     %[amax],%%f0                    "
        : [amax] "=f"(amax)
        : [n] "r"(n),
          [x] "ZR"((const FLOAT (*)[n])x)
        : "memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amax;
}
|
||||
|
||||
/*
 * Maximum absolute value over n elements of x read with stride inc_x.
 * The result is 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG idx = 0;      /* index into x */
    BLASLONG count = 0;    /* elements examined on the strided path */
    FLOAT result = 0.0;

    if (n <= 0 || inc_x <= 0)
        return (result);

    if (inc_x != 1) {
        /* Strided: four elements per trip, then whatever is left. */
        const BLASLONG stride2 = 2 * inc_x;
        const BLASLONG stride3 = 3 * inc_x;
        BLASLONG unrolled = n & -4;

        result = ABS(x[0]);

        for (; count < unrolled; count += 4, idx += inc_x * 4) {
            if (ABS(x[idx]) > result) {
                result = ABS(x[idx]);
            }
            if (ABS(x[idx + inc_x]) > result) {
                result = ABS(x[idx + inc_x]);
            }
            if (ABS(x[idx + stride2]) > result) {
                result = ABS(x[idx + stride2]);
            }
            if (ABS(x[idx + stride3]) > result) {
                result = ABS(x[idx + stride3]);
            }
        }

        for (; count < n; count++, idx += inc_x) {
            if (ABS(x[idx]) > result) {
                result = ABS(x[idx]);
            }
        }
        return (result);
    }

    /* Contiguous: the largest multiple of 32 goes to the vector kernel;
     * otherwise the scan is seeded from the first element. */
    BLASLONG vec_len = n & -32;
    if (vec_len > 0) {
        result = damax_kernel_32(vec_len, x);
        idx = vec_len;
    } else {
        result = ABS(x[0]);
        idx++;
    }

    for (; idx < n; idx++) {
        if (ABS(x[idx]) > result) {
            result = ABS(x[idx]);
        }
    }
    return (result);
}
|
||||
|
|
@ -0,0 +1,166 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Minimum absolute value over a contiguous run of x, computed with the
 * vector floating-point minimum instruction (vfmindb).
 *
 * n  element count; the loop executes n >> 5 times and each pass reads
 *    256 bytes (sixteen 16-byte vector loads).  brctg decrements before
 *    testing, so the caller must pass a positive multiple of 32.
 * x  start of the data.
 *
 * Returns the scalar left in f0 after the final lpdr (load positive).
 *
 * NOTE(review): the trailing ",8" operand on vfmindb/wfmindb presumably
 * selects the magnitude-comparing form of the instruction -- confirm
 * against the z/Architecture Principles of Operation.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* v0 is the running minimum, seeded from the first vector of x. */
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,5 \n\t"   /* r0 = n / 32 = loop count   */
        "xgr %%r1,%%r1 \n\t"    /* r1 = byte offset into x    */
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* Load 256 bytes into v16..v31. */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"

        /* Pairwise reduction tree: 16 -> 8 -> 4 -> 2 -> 1 vectors. */
        "vfmindb %%v16,%%v16,%%v24,8 \n\t"
        "vfmindb %%v17,%%v17,%%v25,8 \n\t"
        "vfmindb %%v18,%%v18,%%v26,8 \n\t"
        "vfmindb %%v19,%%v19,%%v27,8 \n\t"
        "vfmindb %%v20,%%v20,%%v28,8 \n\t"
        "vfmindb %%v21,%%v21,%%v29,8 \n\t"
        "vfmindb %%v22,%%v22,%%v30,8 \n\t"
        "vfmindb %%v23,%%v23,%%v31,8 \n\t"

        "vfmindb %%v16,%%v16,%%v20,8 \n\t"
        "vfmindb %%v17,%%v17,%%v21,8 \n\t"
        "vfmindb %%v18,%%v18,%%v22,8 \n\t"
        "vfmindb %%v19,%%v19,%%v23,8 \n\t"

        "vfmindb %%v16,%%v16,%%v18,8 \n\t"
        "vfmindb %%v17,%%v17,%%v19,8 \n\t"

        "vfmindb %%v16,%%v16,%%v17,8 \n\t"

        /*
         * Fold this pass's result (v16) into the running minimum.  The
         * third operand was previously spelled "%%16", without the
         * vector-register prefix used everywhere else in this kernel.
         */
        "vfmindb %%v0,%%v0,%%v16,8 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Combine the two lanes of v0 and return a non-negative scalar. */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfmindb %%v0,%%v0,%%v16,8 \n\t"
        "lpdr %0,%%f0 "
        :"=f"(amin)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amin;
}
|
||||
|
||||
/*
 * Smallest absolute value among n elements of x read with stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  With unit stride the leading
 * (n & -32) elements go through damin_kernel_32 and the rest is scanned
 * in C; any other stride is handled entirely by the C loops below.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT minf = 0.0;
    FLOAT a;
    BLASLONG i = 0; /* index into x */
    BLASLONG j = 0; /* elements consumed on the strided path */

    if (n <= 0 || inc_x <= 0) return (minf);

    if (inc_x != 1) {
        /* Strided path: seed with |x[0]|, four elements per pass. */
        BLASLONG n4 = n & -4;

        minf = ABS(x[0]);

        for (; j < n4; j += 4) {
            a = ABS(x[i]);
            if (a < minf) { minf = a; }
            a = ABS(x[i + inc_x]);
            if (a < minf) { minf = a; }
            a = ABS(x[i + 2 * inc_x]);
            if (a < minf) { minf = a; }
            a = ABS(x[i + 3 * inc_x]);
            if (a < minf) { minf = a; }

            i += inc_x * 4;
        }

        /* Scalar tail for the last n % 4 elements. */
        for (; j < n; j++) {
            a = ABS(x[i]);
            if (a < minf) { minf = a; }
            i += inc_x;
        }

        return (minf);
    }

    /* Contiguous path: vector kernel for the multiple-of-32 prefix. */
    BLASLONG n32 = n & -32;

    if (n32 > 0) {
        minf = damin_kernel_32(n32, x);
        i = n32;
    } else {
        /* Fewer than 32 elements: seed from the first one. */
        minf = ABS(x[0]);
        i = 1;
    }

    for (; i < n; i++) {
        a = ABS(x[i]);
        if (a < minf) { minf = a; }
    }

    return (minf);
}
|
||||
|
|
@ -0,0 +1,204 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Minimum absolute value over a contiguous run of x, built from
 * compare-high (vfchdb) and select (vsel) instructions.
 *
 * n  element count; r0 is loaded with n >> 5 and each loop pass advances
 *    the byte offset in r1 by 256, i.e. 32 eight-byte elements.  The driver
 *    in this file only calls the kernel with a positive multiple of 32.
 * x  start of the data.
 *
 * Returns the value copied out of f0 by the final ldr.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT amin;
|
||||
|
||||
__asm__ volatile (
|
||||
/* v0 = running minimum, seeded with the absolute values of the first vector. */
"vl %%v0,0(%2) \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
/* r0 = loop count (n / 32), r1 = byte offset into x. */
"srlg %%r0,%1,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* First half of the pass: load bytes 0..127 into v16..v23. */
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
/* Replace each loaded value by its absolute value. */
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
/*
 * Pairwise minimum: the vfchdb mask marks lanes where the odd register is
 * greater, and vsel then takes the even register there, the odd one elsewhere.
 */
"vfchdb %%v24,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v25,%%v19,%%v18 \n\t"
|
||||
"vfchdb %%v26,%%v21,%%v20 \n\t"
|
||||
"vfchdb %%v27,%%v23,%%v22 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
|
||||
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
|
||||
|
||||
/* Reduce four candidate vectors to two, then to one (v30). */
"vfchdb %%v28,%%v25,%%v24 \n\t"
|
||||
"vfchdb %%v29,%%v27,%%v26 \n\t"
|
||||
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
|
||||
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
|
||||
|
||||
"vfchdb %%v30,%%v29,%%v28 \n\t"
|
||||
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
|
||||
|
||||
/* Fold v30 into the running minimum in v0. */
"vfchdb %%v31,%%v0,%%v30 \n\t"
|
||||
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
|
||||
|
||||
/* Second half of the pass: same sequence on bytes 128..255. */
"vl %%v16,128(%%r1,%2) \n\t"
|
||||
"vl %%v17,144(%%r1,%2) \n\t"
|
||||
"vl %%v18,160(%%r1,%2) \n\t"
|
||||
"vl %%v19,176(%%r1,%2) \n\t"
|
||||
"vl %%v20,192(%%r1,%2) \n\t"
|
||||
"vl %%v21,208(%%r1,%2) \n\t"
|
||||
"vl %%v22,224(%%r1,%2) \n\t"
|
||||
"vl %%v23,240(%%r1,%2) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchdb %%v24,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v25,%%v19,%%v18 \n\t"
|
||||
"vfchdb %%v26,%%v21,%%v20 \n\t"
|
||||
"vfchdb %%v27,%%v23,%%v22 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
|
||||
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v28,%%v25,%%v24 \n\t"
|
||||
"vfchdb %%v29,%%v27,%%v26 \n\t"
|
||||
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
|
||||
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
|
||||
|
||||
"vfchdb %%v30,%%v29,%%v28 \n\t"
|
||||
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
|
||||
|
||||
"vfchdb %%v31,%%v0,%%v30 \n\t"
|
||||
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
|
||||
|
||||
/* Advance 256 bytes; brctg decrements r0 and loops while it is non-zero. */
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* Horizontal step: replicate lane 1 of v0 and keep the smaller of the two lanes. */
"vrepg %%v16,%%v0,1 \n\t"
|
||||
"wfchdb %%v17,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
/* Operands: %0 = result, %1 = n, %2 = address of x. */
:"=f"(amin)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return amin;
|
||||
}
|
||||
|
||||
/*
 * Smallest absolute value among n elements of x read with stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  Unit stride uses
 * damin_kernel_32 for the leading (n & -32) elements and a scalar loop
 * for what is left; other strides are scanned in C only.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT minf = 0.0;
    FLOAT mag;
    BLASLONG i = 0; /* index into x */
    BLASLONG j = 0; /* elements consumed on the strided path */

    if (n <= 0 || inc_x <= 0) return (minf);

    if (inc_x == 1) {
        /* Contiguous data: vector kernel first, scalar remainder after. */
        BLASLONG head = n & -32;

        if (head > 0) {
            minf = damin_kernel_32(head, x);
            i = head;
        } else {
            /* Too short for the kernel: seed from the first element. */
            minf = ABS(x[0]);
            i = 1;
        }

        for (; i < n; i++) {
            mag = ABS(x[i]);
            if (mag < minf) { minf = mag; }
        }

        return (minf);
    }

    /* Strided data: seed with |x[0]|, then four elements per pass. */
    BLASLONG quads = n & -4;

    minf = ABS(x[0]);

    for (; j < quads; j += 4) {
        mag = ABS(x[i]);
        if (mag < minf) { minf = mag; }
        mag = ABS(x[i + inc_x]);
        if (mag < minf) { minf = mag; }
        mag = ABS(x[i + 2 * inc_x]);
        if (mag < minf) { minf = mag; }
        mag = ABS(x[i + 3 * inc_x]);
        if (mag < minf) { minf = mag; }

        i += inc_x * 4;
    }

    /* Scalar tail for the last n % 4 elements. */
    for (; j < n; j++) {
        mag = ABS(x[i]);
        if (mag < minf) { minf = mag; }
        i += inc_x;
    }

    return (minf);
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
|
@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT asum;
|
||||
|
||||
|
||||
__asm__ (
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v2 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"srlg %%r0,%1,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
|
||||
FLOAT asum ;
|
||||
__asm__ (
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v2 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_temp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v26 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v30 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v31 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"la %[ptr_temp],256(%[ptr_temp]) \n\t"
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v26 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v30 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v31 \n\t"
|
||||
|
||||
"clgrjl %[ptr_temp],%%r0,1b \n\t"
|
||||
"vfadb %%v24,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v25,%%v2,%%v3 \n\t"
|
||||
"vfadb %%v0,%%v25,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %[asum],%%f0 \n\t"
|
||||
: [asum] "=f"(asum),[ptr_temp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x)
|
||||
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return asum;
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
||||
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v2 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v3 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(asum)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
|
||||
return asum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG j = 0;
|
||||
|
|
|
|||
|
|
@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define PREFETCH_INS 1
|
||||
#if defined(Z13_A)
|
||||
#include <vecintrin.h>
|
||||
|
||||
/*
 * y += alpha * x using the compiler's vector extension types.
 *
 * x and y are reinterpreted as arrays of two-lane double vectors; the
 * outer loop walks n / 2 vectors in groups of sixteen, so one pass
 * covers 32 elements.
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
    __vector double scale = {alpha,alpha};
    __vector double *dst = (__vector double *)y;
    __vector double *src = (__vector double *)x;
    BLASLONG nvec = n / 2;
    BLASLONG base;
    BLASLONG k;

    for (base = 0; base < nvec; base += 16) {
        /* Sixteen vector updates per pass, in ascending order. */
        for (k = 0; k < 16; k++) {
            dst[base + k] += scale * src[base + k];
        }
    }
}
|
||||
#else
|
||||
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vlrepg %%v0,%3 \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
"vl %%v18,32(%%r1,%1) \n\t"
|
||||
"vl %%v19,48(%%r1,%1) \n\t"
|
||||
"vl %%v20,0(%%r1,%2) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,32(%%r1,%2) \n\t"
|
||||
"vl %%v23,48(%%r1,%2) \n\t"
|
||||
|
||||
__asm__ volatile(
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 0(%[x_tmp]) \n\t"
|
||||
"pfd 2, 0(%[y_tmp]) \n\t"
|
||||
#endif
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||
"srlg %%r0,%[n],5 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 256(%[x_tmp]) \n\t"
|
||||
"pfd 2, 256(%[y_tmp]) \n\t"
|
||||
#endif
|
||||
"vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
|
||||
"vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
|
||||
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
|
||||
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
|
||||
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
|
||||
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
|
||||
"vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
|
||||
"vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
|
||||
"vlm %%v16,%%v23, 128(%[y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
|
||||
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
|
||||
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
|
||||
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
|
||||
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
|
||||
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
|
||||
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
|
||||
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
|
||||
"la %[x_tmp],256(%[x_tmp]) \n\t"
|
||||
"vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
|
||||
"la %[y_tmp],256(%[y_tmp]) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
: [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y)
|
||||
: [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha)
|
||||
:"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
|
||||
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
|
||||
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"
|
||||
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vl %%v25,80(%%r1,%1) \n\t"
|
||||
"vl %%v26,96(%%r1,%1) \n\t"
|
||||
"vl %%v27,112(%%r1,%1) \n\t"
|
||||
"vl %%v28,64(%%r1,%2) \n\t"
|
||||
"vl %%v29,80(%%r1,%2) \n\t"
|
||||
"vl %%v30,96(%%r1,%2) \n\t"
|
||||
"vl %%v31,112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
|
||||
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
|
||||
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
|
||||
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
|
||||
|
||||
"vst %%v16,0(%%r1,%2) \n\t"
|
||||
"vst %%v17,16(%%r1,%2) \n\t"
|
||||
"vst %%v18,32(%%r1,%2) \n\t"
|
||||
"vst %%v19,48(%%r1,%2) \n\t"
|
||||
"vst %%v20,64(%%r1,%2) \n\t"
|
||||
"vst %%v21,80(%%r1,%2) \n\t"
|
||||
"vst %%v22,96(%%r1,%2) \n\t"
|
||||
"vst %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16,128(%%r1,%1) \n\t"
|
||||
"vl %%v17,144(%%r1,%1) \n\t"
|
||||
"vl %%v18,160(%%r1,%1) \n\t"
|
||||
"vl %%v19,176(%%r1,%1) \n\t"
|
||||
"vl %%v20,128(%%r1,%2) \n\t"
|
||||
"vl %%v21,144(%%r1,%2) \n\t"
|
||||
"vl %%v22,160(%%r1,%2) \n\t"
|
||||
"vl %%v23,176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
|
||||
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
|
||||
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"
|
||||
|
||||
"vl %%v24,192(%%r1,%1) \n\t"
|
||||
"vl %%v25,208(%%r1,%1) \n\t"
|
||||
"vl %%v26,224(%%r1,%1) \n\t"
|
||||
"vl %%v27,240(%%r1,%1) \n\t"
|
||||
"vl %%v28,192(%%r1,%2) \n\t"
|
||||
"vl %%v29,208(%%r1,%2) \n\t"
|
||||
"vl %%v30,224(%%r1,%2) \n\t"
|
||||
"vl %%v31,240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
|
||||
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
|
||||
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
|
||||
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
|
||||
|
||||
"vst %%v16,128(%%r1,%2) \n\t"
|
||||
"vst %%v17,144(%%r1,%2) \n\t"
|
||||
"vst %%v18,160(%%r1,%2) \n\t"
|
||||
"vst %%v19,176(%%r1,%2) \n\t"
|
||||
"vst %%v20,192(%%r1,%2) \n\t"
|
||||
"vst %%v21,208(%%r1,%2) \n\t"
|
||||
"vst %%v22,224(%%r1,%2) \n\t"
|
||||
"vst %%v23,240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
|
|
@ -131,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
BLASLONG n1 = n & -32;
|
||||
|
||||
if ( n1 )
|
||||
daxpy_kernel_32(n1, x, y , da );
|
||||
daxpy_kernel_32(n1, x, y , &da);
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -23,95 +23,28 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(Z13mvc)
|
||||
|
||||
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t"
|
||||
"la %[ptr_x],256(%[ptr_x]) \n\t"
|
||||
"la %[ptr_y],256(%[ptr_y]) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n),
|
||||
[ptr_x] "+&a"(x), [ptr_y] "+&a"(y)
|
||||
: [mem_x] "m" (*(const double (*)[n])x)
|
||||
: "cc"
|
||||
);
|
||||
return;
|
||||
|
||||
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"lgr %%r1,%1 \n\t"
|
||||
"lgr %%r2,%2 \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1) \n\t"
|
||||
"pfd 2, 1024(%%r2) \n\t"
|
||||
"mvc 0(256,%%r2),0(%%r1) \n\t"
|
||||
"agfi %%r1,256 \n\t"
|
||||
"agfi %%r2,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
|
||||
:"memory","cc","r0","r1","r2"
|
||||
);
|
||||
}
|
||||
#else
|
||||
|
||||
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
|
||||
: [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v24","v25","v26","v27"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||
BLASLONG i = 0;
|
||||
|
|
@ -136,21 +69,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
|||
|
||||
} else {
|
||||
|
||||
BLASLONG n1 = n & -4;
|
||||
|
||||
while (i < n1) {
|
||||
|
||||
y[iy] = x[ix];
|
||||
y[iy + inc_y] = x[ix + inc_x];
|
||||
y[iy + 2 * inc_y] = x[ix + 2 * inc_x];
|
||||
y[iy + 3 * inc_y] = x[ix + 3 * inc_x];
|
||||
|
||||
ix += inc_x * 4;
|
||||
iy += inc_y * 4;
|
||||
i += 4;
|
||||
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
|
||||
y[iy] = x[ix];
|
||||
|
|
@ -165,5 +83,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
|||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,116 +25,59 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(Z13)
|
||||
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
FLOAT dot;
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
|
||||
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
|
||||
|
||||
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
|
||||
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
|
||||
|
||||
"vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
__asm__ volatile (
|
||||
"vzero %%v0 \n\t"
|
||||
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
|
||||
|
||||
|
||||
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
|
||||
"vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
|
||||
|
||||
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b \n\t"
|
||||
"vfadb %%v24,%%v25,%%v24 \n\t"
|
||||
"vfadb %%v24,%%v26,%%v24 \n\t"
|
||||
"vfadb %%v24,%%v27,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v24,1 \n\t"
|
||||
"vfadb %%v1,%%v24,%%v1 \n\t"
|
||||
"ldr %[dot], %%f1 \n\t"
|
||||
: [dot] "=f"(dot) ,[n_tmp] "+&r"(n)
|
||||
: [mem_x] "m"( *(const double (*)[n])x),
|
||||
[mem_y] "m"( *(const double (*)[n])y),
|
||||
[ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y)
|
||||
:"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
);
|
||||
return dot;
|
||||
"vl %%v24,0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
|
||||
"vl %%v26,32(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
|
||||
"vl %%v27,48(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
|
||||
"vl %%v28,64(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,80(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
|
||||
"vl %%v30,96(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
|
||||
"vl %%v31,112(%%r1,%3) \n\t"
|
||||
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(dot)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Portable C dot-product kernel.
 *
 * Returns the sum of x[k] * y[k] for k in [0, n), consuming 16 elements per
 * loop iteration.  Each iteration always reads a full group of 16 elements,
 * so n is expected to be a multiple of 16.
 */
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
    FLOAT acc = 0.0;
    BLASLONG k;

    for (k = 0; k < n; k += 16) {
        const FLOAT *xp = x + k;
        const FLOAT *yp = y + k;

        /* First eight products of the group, summed left to right. */
        acc += yp[0] * xp[0]
             + yp[1] * xp[1]
             + yp[2] * xp[2]
             + yp[3] * xp[3]
             + yp[4] * xp[4]
             + yp[5] * xp[5]
             + yp[6] * xp[6]
             + yp[7] * xp[7];

        /* Remaining eight products of the group. */
        acc += yp[8] * xp[8]
             + yp[9] * xp[9]
             + yp[10] * xp[10]
             + yp[11] * xp[11]
             + yp[12] * xp[12]
             + yp[13] * xp[13]
             + yp[14] * xp[14]
             + yp[15] * xp[15];
    }

    return acc;
}
|
||||
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
@ -148,13 +91,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
|
||||
if ( n1 ){
|
||||
dot = ddot_kernel_16(n1, x, y );
|
||||
i = n1;
|
||||
}
|
||||
|
||||
|
||||
if ( n1 )
|
||||
dot = ddot_kernel_16(n1, x, y);
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
|
|
|||
|
|
@ -25,186 +25,392 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
#define HAVE_KERNEL_4x4_VEC 1
|
||||
#define HAVE_KERNEL_4x2_VEC 1
|
||||
#define HAVE_KERNEL_4x1_VEC 1
|
||||
|
||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x4
|
||||
|
||||
#elif HAVE_KERNEL_4x4_VEC
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT x0,x1,x2,x3;
|
||||
x0 = xo[0] * *alpha;
|
||||
x1 = xo[1] * *alpha;
|
||||
x2 = xo[2] * *alpha;
|
||||
x3 = xo[3] * *alpha;
|
||||
__vector double v_x0 = {x0,x0};
|
||||
__vector double v_x1 = {x1,x1};
|
||||
__vector double v_x2 = {x2,x2};
|
||||
__vector double v_x3 = {x3,x3};
|
||||
__vector double* v_y =(__vector double*)y;
|
||||
__vector double* va0 = (__vector double*)ap[0];
|
||||
__vector double* va1 = (__vector double*)ap[1];
|
||||
__vector double* va2 = (__vector double*)ap[2];
|
||||
__vector double* va3 = (__vector double*)ap[3];
|
||||
__asm__ volatile (
|
||||
"vlrepg %%v0,0(%5) \n\t"
|
||||
"vlrepg %%v1,8(%5) \n\t"
|
||||
"vlrepg %%v2,16(%5) \n\t"
|
||||
"vlrepg %%v3,24(%5) \n\t"
|
||||
"vlrepg %%v4,%7 \n\t"
|
||||
"vfmdb %%v0,%%v0,%%v4 \n\t"
|
||||
"vfmdb %%v1,%%v1,%%v4 \n\t"
|
||||
"vfmdb %%v2,%%v2,%%v4 \n\t"
|
||||
"vfmdb %%v3,%%v3,%%v4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
|
||||
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ;
|
||||
}
|
||||
}
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
#else
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
"pfd 1,1024(%%r1,%4) \n\t"
|
||||
"pfd 2,1024(%%r1,%6) \n\t"
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a0,*a1,*a2,*a3;
|
||||
FLOAT x[4] __attribute__ ((aligned (16)));
|
||||
a0 = ap[0];
|
||||
a1 = ap[1];
|
||||
a2 = ap[2];
|
||||
a3 = ap[3];
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
"vl %%v19,0(%%r1,%4) \n\t"
|
||||
"vl %%v20,16(%%r1,%1) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,16(%%r1,%3) \n\t"
|
||||
"vl %%v23,16(%%r1,%4) \n\t"
|
||||
"vl %%v24,32(%%r1,%1) \n\t"
|
||||
"vl %%v25,32(%%r1,%2) \n\t"
|
||||
"vl %%v26,32(%%r1,%3) \n\t"
|
||||
"vl %%v27,32(%%r1,%4) \n\t"
|
||||
"vl %%v28,48(%%r1,%1) \n\t"
|
||||
"vl %%v29,48(%%r1,%2) \n\t"
|
||||
"vl %%v30,48(%%r1,%3) \n\t"
|
||||
"vl %%v31,48(%%r1,%4) \n\t"
|
||||
|
||||
for ( i=0; i<4; i++)
|
||||
x[i] = xo[i] * *alpha;
|
||||
"vl %%v4,0(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,0(%%r1,%6) \n\t"
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
|
||||
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
|
||||
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
|
||||
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
|
||||
}
|
||||
}
|
||||
"vl %%v4,16(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,16(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,32(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,32(%%r1,%6) \n\t"
|
||||
|
||||
#endif
|
||||
"vl %%v4,48(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,48(%%r1,%6) \n\t"
|
||||
|
||||
#ifdef HAVE_KERNEL_4x2
|
||||
"vl %%v16,64(%%r1,%1) \n\t"
|
||||
"vl %%v17,64(%%r1,%2) \n\t"
|
||||
"vl %%v18,64(%%r1,%3) \n\t"
|
||||
"vl %%v19,64(%%r1,%4) \n\t"
|
||||
"vl %%v20,80(%%r1,%1) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,80(%%r1,%3) \n\t"
|
||||
"vl %%v23,80(%%r1,%4) \n\t"
|
||||
"vl %%v24,96(%%r1,%1) \n\t"
|
||||
"vl %%v25,96(%%r1,%2) \n\t"
|
||||
"vl %%v26,96(%%r1,%3) \n\t"
|
||||
"vl %%v27,96(%%r1,%4) \n\t"
|
||||
"vl %%v28,112(%%r1,%1) \n\t"
|
||||
"vl %%v29,112(%%r1,%2) \n\t"
|
||||
"vl %%v30,112(%%r1,%3) \n\t"
|
||||
"vl %%v31,112(%%r1,%4) \n\t"
|
||||
|
||||
#elif HAVE_KERNEL_4x2_VEC
|
||||
"vl %%v4,64(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,64(%%r1,%6) \n\t"
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT x0,x1;
|
||||
x0 = xo[0] * *alpha;
|
||||
x1 = xo[1] * *alpha;
|
||||
__vector double v_x0 = {x0,x0};
|
||||
__vector double v_x1 = {x1,x1};
|
||||
__vector double* v_y =(__vector double*)y;
|
||||
__vector double* va0 = (__vector double*)ap[0];
|
||||
__vector double* va1 = (__vector double*)ap[1];
|
||||
"vl %%v4,80(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,80(%%r1,%6) \n\t"
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
|
||||
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ;
|
||||
}
|
||||
}
|
||||
#else
|
||||
"vl %%v4,96(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,96(%%r1,%6) \n\t"
|
||||
|
||||
/*
 * Portable C kernel: y[k] += alpha * (ap[0][k] * xo[0] + ap[1][k] * xo[1])
 * for k in [0, n).
 *
 * The two x values are scaled by *alpha once, before the loop.  The loop
 * updates four consecutive y entries per iteration, so it touches a full
 * group of four even when n is not a multiple of 4.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    const FLOAT *col0 = ap[0];
    const FLOAT *col1 = ap[1];
    const FLOAT s0 = xo[0] * *alpha;
    const FLOAT s1 = xo[1] * *alpha;
    BLASLONG k = 0;

    while (k < n) {
        y[k]     += col0[k]     * s0 + col1[k]     * s1;
        y[k + 1] += col0[k + 1] * s0 + col1[k + 1] * s1;
        y[k + 2] += col0[k + 2] * s0 + col1[k + 2] * s1;
        y[k + 3] += col0[k + 3] * s0 + col1[k + 3] * s1;
        k += 4;
    }
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x1
|
||||
|
||||
#elif HAVE_KERNEL_4x1_VEC
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT x0;
|
||||
x0 = xo[0] * *alpha;
|
||||
__vector double v_x0 = {x0,x0};
|
||||
__vector double* v_y =(__vector double*)y;
|
||||
__vector double* va0 = (__vector double*)ap;
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
v_y[i] += v_x0 * va0[i] ;
|
||||
v_y[i+1] += v_x0 * va0[i+1] ;
|
||||
}
|
||||
"vl %%v4,112(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,112(%%r1,%6) \n\t"
|
||||
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
"vl %%v19,0(%%r1,%4) \n\t"
|
||||
"vl %%v20,16(%%r1,%1) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,16(%%r1,%3) \n\t"
|
||||
"vl %%v23,16(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v4,0(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,0(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,16(%%r1,%6) \n\t"
|
||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,16(%%r1,%6) \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a0;
|
||||
FLOAT x[4] __attribute__ ((aligned (16)));
|
||||
a0 = ap;
|
||||
__asm__ volatile (
|
||||
"vlrepg %%v0,0(%3) \n\t"
|
||||
"vlrepg %%v1,8(%3) \n\t"
|
||||
"vlrepg %%v2,%5 \n\t"
|
||||
"vfmdb %%v0,%%v0,%%v2 \n\t"
|
||||
"vfmdb %%v1,%%v1,%%v2 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
for ( i=0; i<1; i++)
|
||||
x[i] = xo[i] * *alpha;
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
y[i] += a0[i]*x[0];
|
||||
y[i+1] += a0[i+1]*x[0];
|
||||
y[i+2] += a0[i+2]*x[0];
|
||||
y[i+3] += a0[i+3]*x[0];
|
||||
}
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 2,1024(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,16(%%r1,%1) \n\t"
|
||||
"vl %%v19,16(%%r1,%2) \n\t"
|
||||
"vl %%v20,32(%%r1,%1) \n\t"
|
||||
"vl %%v21,32(%%r1,%2) \n\t"
|
||||
"vl %%v22,48(%%r1,%1) \n\t"
|
||||
"vl %%v23,48(%%r1,%2) \n\t"
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vl %%v25,64(%%r1,%2) \n\t"
|
||||
"vl %%v26,80(%%r1,%1) \n\t"
|
||||
"vl %%v27,80(%%r1,%2) \n\t"
|
||||
"vl %%v28,96(%%r1,%1) \n\t"
|
||||
"vl %%v29,96(%%r1,%2) \n\t"
|
||||
"vl %%v30,112(%%r1,%1) \n\t"
|
||||
"vl %%v31,112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v2,0(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,0(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,16(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,16(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,32(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v20,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v21,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,32(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,48(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v22,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v23,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,48(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,64(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v24,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v25,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,64(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,80(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v26,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v27,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,80(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,96(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v28,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v29,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,96(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,112(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v30,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v31,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,112(%%r1,%4) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,16(%%r1,%1) \n\t"
|
||||
"vl %%v19,16(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v2,0(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,0(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,16(%%r1,%4) \n\t"
|
||||
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
|
||||
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,16(%%r1,%4) \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"vlrepg %%v0,0(%2) \n\t"
|
||||
"vlrepg %%v1,%4 \n\t"
|
||||
"vfmdb %%v0,%%v0,%%v1 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
#endif
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 2,1024(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
"vl %%v18,32(%%r1,%1) \n\t"
|
||||
"vl %%v19,48(%%r1,%1) \n\t"
|
||||
"vl %%v20,64(%%r1,%1) \n\t"
|
||||
"vl %%v21,80(%%r1,%1) \n\t"
|
||||
"vl %%v22,96(%%r1,%1) \n\t"
|
||||
"vl %%v23,112(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v1,0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,0(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,16(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,32(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v18,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,32(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,48(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v19,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,48(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,64(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v20,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,64(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,80(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v21,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,80(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,96(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v22,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,96(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,112(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v23,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,112(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v1,0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,0(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,16(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Accumulates a contiguous buffer into a strided vector:
 *   dest[k * inc_dest] += src[k]   for k in [0, n).
 *
 * n        - number of elements to add
 * src      - contiguous source values
 * dest     - destination vector, accessed with stride inc_dest
 * inc_dest - distance, in elements, between consecutive dest entries
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG i;

    /* Index dest instead of advancing it, so no pointer past the last
       element touched is ever formed. */
    for (i = 0; i < n; i++)
    {
        dest[i * inc_dest] += src[i];
    }
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i;
|
||||
BLASLONG j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
|
|
@ -282,8 +488,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
if ( n2 & 1 )
|
||||
{
|
||||
dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
|
||||
a_ptr += lda;
|
||||
x_ptr += 1;
|
||||
/* a_ptr += lda;
|
||||
x_ptr += 1; */
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,178 +25,460 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define HAVE_KERNEL_4x4_VEC 1
|
||||
#define HAVE_KERNEL_4x2_VEC 1
|
||||
#define HAVE_KERNEL_4x1_VEC 1
|
||||
|
||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
#define NBMAX 2048
|
||||
|
||||
#ifdef HAVE_KERNEL_4x4
|
||||
|
||||
#elif HAVE_KERNEL_4x4_VEC
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
__vector double* va0 = (__vector double*)ap[0];
|
||||
__vector double* va1 = (__vector double*)ap[1];
|
||||
__vector double* va2 = (__vector double*)ap[2];
|
||||
__vector double* va3 = (__vector double*)ap[3];
|
||||
__vector double* v_x =(__vector double*)x;
|
||||
__vector double temp0 = {0,0};
|
||||
__vector double temp1 = {0,0};
|
||||
__vector double temp2 = {0,0};
|
||||
__vector double temp3 = {0,0};
|
||||
__asm__ volatile (
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v2 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ;
|
||||
temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ;
|
||||
temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ;
|
||||
temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ;
|
||||
}
|
||||
|
||||
y[0] = temp0[0] + temp0[1];
|
||||
y[1] = temp1[0] + temp1[1];
|
||||
y[2] = temp2[0] + temp2[1];
|
||||
y[3] = temp3[0] + temp3[1];;
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
"pfd 1,1024(%%r1,%4) \n\t"
|
||||
"pfd 1,1024(%%r1,%5) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%5) \n\t"
|
||||
"vl %%v17,16(%%r1,%5) \n\t"
|
||||
"vl %%v18,32(%%r1,%5) \n\t"
|
||||
"vl %%v19,48(%%r1,%5) \n\t"
|
||||
"vl %%v20,64(%%r1,%5) \n\t"
|
||||
"vl %%v21,80(%%r1,%5) \n\t"
|
||||
"vl %%v22,96(%%r1,%5) \n\t"
|
||||
"vl %%v23,112(%%r1,%5) \n\t"
|
||||
|
||||
"vl %%v24,0(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,0(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
|
||||
"vl %%v26,0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
|
||||
"vl %%v27,0(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"
|
||||
|
||||
"vl %%v28,16(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,16(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
|
||||
"vl %%v30,16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
|
||||
"vl %%v31,16(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"
|
||||
|
||||
"vl %%v24,32(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v18,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,32(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v18,%%v25,%%v1 \n\t"
|
||||
"vl %%v26,32(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v18,%%v26,%%v2 \n\t"
|
||||
"vl %%v27,32(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v18,%%v27,%%v3 \n\t"
|
||||
|
||||
"vl %%v28,48(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v19,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,48(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v19,%%v29,%%v1 \n\t"
|
||||
"vl %%v30,48(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v19,%%v30,%%v2 \n\t"
|
||||
"vl %%v31,48(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v19,%%v31,%%v3 \n\t"
|
||||
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,64(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"
|
||||
"vl %%v26,64(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v20,%%v26,%%v2 \n\t"
|
||||
"vl %%v27,64(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v20,%%v27,%%v3 \n\t"
|
||||
|
||||
"vl %%v28,80(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v21,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,80(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v21,%%v29,%%v1 \n\t"
|
||||
"vl %%v30,80(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v21,%%v30,%%v2 \n\t"
|
||||
"vl %%v31,80(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v21,%%v31,%%v3 \n\t"
|
||||
|
||||
"vl %%v24,96(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v22,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,96(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v22,%%v25,%%v1 \n\t"
|
||||
"vl %%v26,96(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v22,%%v26,%%v2 \n\t"
|
||||
"vl %%v27,96(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v22,%%v27,%%v3 \n\t"
|
||||
|
||||
"vl %%v28,112(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v23,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,112(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v23,%%v29,%%v1 \n\t"
|
||||
"vl %%v30,112(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v23,%%v30,%%v2 \n\t"
|
||||
"vl %%v31,112(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v23,%%v31,%%v3 \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%5) \n\t"
|
||||
"vl %%v17,16(%%r1,%5) \n\t"
|
||||
|
||||
"vl %%v24,0(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,0(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
|
||||
"vl %%v26,0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
|
||||
"vl %%v27,0(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"
|
||||
|
||||
"vl %%v28,16(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,16(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
|
||||
"vl %%v30,16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
|
||||
"vl %%v31,16(%%r1,%4) \n\t"
|
||||
"vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vrepg %%v4,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f4 \n\t"
|
||||
"std %%f0,0(%6) \n\t"
|
||||
"vrepg %%v4,%%v1,1 \n\t"
|
||||
"adbr %%f1,%%f4 \n\t"
|
||||
"std %%f1,8(%6) \n\t"
|
||||
"vrepg %%v4,%%v2,1 \n\t"
|
||||
"adbr %%f2,%%f4 \n\t"
|
||||
"std %%f2,16(%6) \n\t"
|
||||
"vrepg %%v4,%%v3,1 \n\t"
|
||||
"adbr %%f3,%%f4 \n\t"
|
||||
"std %%f3,24(%6) "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
#else
|
||||
/*
 * Portable C kernel: computes four dot products of x with the arrays
 * ap[0]..ap[3] over k in [0, n) and stores them in y[0]..y[3]
 * (y is overwritten, not accumulated into).
 *
 * Elements are consumed four at a time, so a full group of four is read
 * even when n is not a multiple of 4.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    const FLOAT *col0 = ap[0];
    const FLOAT *col1 = ap[1];
    const FLOAT *col2 = ap[2];
    const FLOAT *col3 = ap[3];
    FLOAT acc0 = 0.0;
    FLOAT acc1 = 0.0;
    FLOAT acc2 = 0.0;
    FLOAT acc3 = 0.0;
    BLASLONG k = 0;

    while (k < n) {
        acc0 += col0[k] * x[k] + col0[k + 1] * x[k + 1] + col0[k + 2] * x[k + 2] + col0[k + 3] * x[k + 3];
        acc1 += col1[k] * x[k] + col1[k + 1] * x[k + 1] + col1[k + 2] * x[k + 2] + col1[k + 3] * x[k + 3];
        acc2 += col2[k] * x[k] + col2[k + 1] * x[k + 1] + col2[k + 2] * x[k + 2] + col2[k + 3] * x[k + 3];
        acc3 += col3[k] * x[k] + col3[k + 1] * x[k + 1] + col3[k + 2] * x[k + 2] + col3[k + 3] * x[k + 3];
        k += 4;
    }

    y[0] = acc0;
    y[1] = acc1;
    y[2] = acc2;
    y[3] = acc3;
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x2
|
||||
|
||||
#elif HAVE_KERNEL_4x2_VEC
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
__vector double* va0 = (__vector double*)ap[0];
|
||||
__vector double* va1 = (__vector double*)ap[1];
|
||||
__vector double* v_x =(__vector double*)x;
|
||||
__vector double temp0 = {0,0};
|
||||
__vector double temp1 = {0,0};
|
||||
__asm__ volatile (
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ;
|
||||
temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ;
|
||||
}
|
||||
|
||||
y[0] = temp0[0] + temp0[1];
|
||||
y[1] = temp1[0] + temp1[1];
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
"vl %%v18,32(%%r1,%3) \n\t"
|
||||
"vl %%v19,48(%%r1,%3) \n\t"
|
||||
"vl %%v20,64(%%r1,%3) \n\t"
|
||||
"vl %%v21,80(%%r1,%3) \n\t"
|
||||
"vl %%v22,96(%%r1,%3) \n\t"
|
||||
"vl %%v23,112(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v24,0(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,0(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
|
||||
|
||||
"vl %%v26,16(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
|
||||
"vl %%v27,16(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"
|
||||
|
||||
"vl %%v28,32(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v18,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,32(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v18,%%v29,%%v1 \n\t"
|
||||
|
||||
"vl %%v30,48(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v19,%%v30,%%v0 \n\t"
|
||||
"vl %%v31,48(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v19,%%v31,%%v1 \n\t"
|
||||
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,64(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"
|
||||
|
||||
"vl %%v26,80(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v21,%%v26,%%v0 \n\t"
|
||||
"vl %%v27,80(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v21,%%v27,%%v1 \n\t"
|
||||
|
||||
"vl %%v28,96(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v22,%%v28,%%v0 \n\t"
|
||||
"vl %%v29,96(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v22,%%v29,%%v1 \n\t"
|
||||
|
||||
"vl %%v30,112(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v23,%%v30,%%v0 \n\t"
|
||||
"vl %%v31,112(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v23,%%v31,%%v1 \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v24,0(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vl %%v25,0(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
|
||||
|
||||
"vl %%v26,16(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
|
||||
"vl %%v27,16(%%r1,%2) \n\t"
|
||||
"vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f2 \n\t"
|
||||
"std %%f0,0(%4) \n\t"
|
||||
"vrepg %%v2,%%v1,1 \n\t"
|
||||
"adbr %%f1,%%f2 \n\t"
|
||||
"std %%f1,8(%4) "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
#else
|
||||
/*
 * Portable C kernel: computes two dot products of x with the arrays
 * ap[0] and ap[1] over k in [0, n) and stores them in y[0] and y[1]
 * (y is overwritten, not accumulated into).
 *
 * Elements are consumed four at a time, so a full group of four is read
 * even when n is not a multiple of 4.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    const FLOAT *col0 = ap[0];
    const FLOAT *col1 = ap[1];
    FLOAT acc0 = 0.0;
    FLOAT acc1 = 0.0;
    BLASLONG k = 0;

    while (k < n) {
        acc0 += col0[k] * x[k] + col0[k + 1] * x[k + 1] + col0[k + 2] * x[k + 2] + col0[k + 3] * x[k + 3];
        acc1 += col1[k] * x[k] + col1[k + 1] * x[k + 1] + col1[k + 2] * x[k + 2] + col1[k + 3] * x[k + 3];
        k += 4;
    }

    y[0] = acc0;
    y[1] = acc1;
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x1
|
||||
|
||||
#elif HAVE_KERNEL_4x1_VEC
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
__vector double* va0 = (__vector double*)a0;
|
||||
__vector double* v_x =(__vector double*)x;
|
||||
__vector double temp0 = {0,0};
|
||||
__asm__ volatile (
|
||||
"vzero %%v0 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
for ( i=0; i< n/2; i+=2 )
|
||||
{
|
||||
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ;
|
||||
}
|
||||
|
||||
y[0] = temp0[0] + temp0[1];
|
||||
}
|
||||
#else
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
|
||||
FLOAT temp0 = 0.0;
|
||||
"lghi %%r0,-16 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
|
||||
}
|
||||
y[0] = temp0;
|
||||
"srlg %%r0,%%r0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24,0(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
|
||||
"vl %%v25,16(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
|
||||
|
||||
"vl %%v26,32(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
|
||||
|
||||
"vl %%v27,48(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
|
||||
|
||||
"vl %%v28,64(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
|
||||
|
||||
"vl %%v29,80(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
|
||||
|
||||
"vl %%v30,96(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
|
||||
|
||||
"vl %%v31,112(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
"lghi %%r0,12 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24,0(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
|
||||
"vl %%v25,16(%%r1,%1) \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
|
||||
|
||||
"agfi %%r1,32 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"std %%f0,0(%3) "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Gather n elements from the strided vector src into the contiguous
 * buffer dest.
 *
 *   n        number of elements to copy
 *   src      source vector, read every inc_src elements
 *   dest     destination buffer, written at indices 0 .. n-1
 *   inc_src  stride of src, in elements
 *
 * The loop is kept exactly once: a second declaration of `i` followed by a
 * second copy loop would not compile, and would walk src past its end.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    BLASLONG i;

    for (i = 0; i < n; i++)
    {
        dest[i] = *src;
        src += inc_src;
    }
}
|
||||
|
||||
/*
 * Unit-stride vector kernel: dest[k] += src[k] * da.
 *
 * Asm operands: %0 = n, %1 = da (memory), %2 = src, %3 = dest.
 * Registers:    r0 = loop counter, r1 = byte offset, v0 = da in both lanes.
 *
 * The first loop runs (n & -16) >> 4 times and handles 16 elements
 * (128 bytes) per iteration; the second loop runs (n & 12) >> 2 times and
 * handles 4 elements (32 bytes) per iteration.  Elements beyond n & -4 are
 * therefore NOT processed by this kernel.
 *
 * NOTE(review): the "db" instruction forms operate on 8-byte elements, so
 * this assumes FLOAT is double - confirm against the build configuration.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
    __asm__ volatile (
        /* broadcast da into v0, clear the byte offset */
        "vlrepg %%v0,%1 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -16; skip the wide loop when it is zero */
        "lghi %%r0,-16 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,4 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 2,1024(%%r1,%3) \n\t"

        /* load 16 src elements */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        /* dest = src * da + dest, one 16-byte vector at a time */
        "vl %%v24, 0(%%r1,%3) \n\t"
        "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vst %%v24, 0(%%r1,%3) \n\t"
        "vl %%v25, 16(%%r1,%3) \n\t"
        "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
        "vst %%v25, 16(%%r1,%3) \n\t"
        "vl %%v26, 32(%%r1,%3) \n\t"
        "vfmadb %%v26,%%v18,%%v0,%%v26 \n\t"
        "vst %%v26, 32(%%r1,%3) \n\t"
        "vl %%v27, 48(%%r1,%3) \n\t"
        "vfmadb %%v27,%%v19,%%v0,%%v27 \n\t"
        "vst %%v27, 48(%%r1,%3) \n\t"
        "vl %%v28, 64(%%r1,%3) \n\t"
        "vfmadb %%v28,%%v20,%%v0,%%v28 \n\t"
        "vst %%v28, 64(%%r1,%3) \n\t"
        "vl %%v29, 80(%%r1,%3) \n\t"
        "vfmadb %%v29,%%v21,%%v0,%%v29 \n\t"
        "vst %%v29, 80(%%r1,%3) \n\t"
        "vl %%v30, 96(%%r1,%3) \n\t"
        "vfmadb %%v30,%%v22,%%v0,%%v30 \n\t"
        "vst %%v30, 96(%%r1,%3) \n\t"
        "vl %%v31, 112(%%r1,%3) \n\t"
        "vfmadb %%v31,%%v23,%%v0,%%v31 \n\t"
        "vst %%v31, 112(%%r1,%3) \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* r0 = n & 12; skip the 4-element loop when it is zero */
        "1: \n\t"
        "lghi %%r0,12 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"

        "vl %%v24, 0(%%r1,%3) \n\t"
        "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
        "vst %%v24, 0(%%r1,%3) \n\t"
        "vl %%v25, 16(%%r1,%3) \n\t"
        "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
        "vst %%v25, 16(%%r1,%3) \n\t"

        "agfi %%r1,32 \n\t"
        "brctg %%r0,2b \n\t"

        "3: \n\t"
        "nop "
        :
        :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
/*
 * Scaled accumulate: dest[k * inc_dest] += src[k] * da for k = 0 .. n-1.
 *
 *   n         number of elements; nothing is done when n <= 0
 *   da        scale factor applied to every src element
 *   src       contiguous source buffer
 *   dest      destination vector, written every inc_dest elements
 *   inc_dest  stride of dest, in elements
 *
 * For unit stride the vector kernel is used.  That kernel only covers the
 * first n & -4 elements, so any remaining 1-3 elements are finished here
 * in scalar code; when n is a multiple of 4 the tail loop does not run.
 */
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG i;

    if (n <= 0)
        return;

    if (inc_dest == 1)
    {
        add_y_kernel_4(n, da, src, dest);

        /* scalar tail for the elements the kernel leaves untouched */
        for (i = n & -4; i < n; i++)
        {
            dest[i] += src[i] * da;
        }
    }
    else
    {
        for (i = 0; i < n; i++)
        {
            *dest += src[i] * da;
            dest += inc_dest;
        }
    }
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
|
|
@ -212,7 +494,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
FLOAT ybuffer[4],*xbuffer;
|
||||
FLOAT ybuffer[2] __attribute__ ((aligned(16)));
|
||||
FLOAT *xbuffer;
|
||||
FLOAT *ytemp;
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
|
|
@ -234,7 +517,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
|
||||
while ( NB == NBMAX )
|
||||
{
|
||||
|
||||
m1 -= NB;
|
||||
if ( m1 < 0)
|
||||
{
|
||||
|
|
@ -319,9 +601,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
{
|
||||
|
||||
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
|
||||
a_ptr += lda;
|
||||
// a_ptr += lda;
|
||||
*y_ptr += ybuffer[0] * alpha;
|
||||
y_ptr += inc_y;
|
||||
// y_ptr += inc_y;
|
||||
|
||||
}
|
||||
a += NB;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,159 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unit-stride vector kernel returning the maximum of x[0 .. n-1].
 *
 * Asm operands: %0 = max (result, FP register), %1 = n, %2 = x.
 * Registers:    r0 = loop counter (n >> 5), r1 = byte offset into x,
 *               v0 = running maximum (seeded from x[0], x[1]).
 *
 * Each iteration loads 32 elements (256 bytes) into v16-v31 and folds them
 * pairwise with vfmaxdb down to v16, which is then folded into v0.  After
 * the loop the two lanes of v0 are reduced to a single value.
 *
 * n is shifted right by 5 to form the trip count, so only whole groups of
 * 32 elements are consumed; the caller in this file passes n & -32 and only
 * when that value is positive.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT max;

    __asm__ volatile (
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"

        /* 16 -> 8 vectors */
        "vfmaxdb %%v16,%%v16,%%v24,0 \n\t"
        "vfmaxdb %%v17,%%v17,%%v25,0 \n\t"
        "vfmaxdb %%v18,%%v18,%%v26,0 \n\t"
        "vfmaxdb %%v19,%%v19,%%v27,0 \n\t"
        "vfmaxdb %%v20,%%v20,%%v28,0 \n\t"
        "vfmaxdb %%v21,%%v21,%%v29,0 \n\t"
        "vfmaxdb %%v22,%%v22,%%v30,0 \n\t"
        "vfmaxdb %%v23,%%v23,%%v31,0 \n\t"

        /* 8 -> 4 vectors */
        "vfmaxdb %%v16,%%v16,%%v20,0 \n\t"
        "vfmaxdb %%v17,%%v17,%%v21,0 \n\t"
        "vfmaxdb %%v18,%%v18,%%v22,0 \n\t"
        "vfmaxdb %%v19,%%v19,%%v23,0 \n\t"

        /* 4 -> 2 vectors */
        "vfmaxdb %%v16,%%v16,%%v18,0 \n\t"
        "vfmaxdb %%v17,%%v17,%%v19,0 \n\t"

        /* 2 -> 1 vector */
        "vfmaxdb %%v16,%%v16,%%v17,0 \n\t"

        /* fold the block maximum into the running maximum;
           the third operand must be the vector register v16 */
        "vfmaxdb %%v0,%%v0,%%v16,0 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* reduce the two lanes of v0 and hand the scalar to the output */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfmaxdb %%v0,%%v0,%%v16,0 \n\t"
        "ldr %0,%%f0 "
        :"=f"(max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return max;
}
|
||||
|
||||
/*
 * Return the largest element of the n-element vector x, read with stride
 * inc_x (in elements).  Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -32 elements go through dmax_kernel_32 and
 * the remainder is scanned in scalar code.  Any other stride is scanned in
 * scalar code, four elements per iteration plus a remainder loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT best = 0.0;
    BLASLONG idx;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x != 1) {
        const BLASLONG unrolled = n & -4;

        best = x[0];
        idx = 0;

        /* four strided elements per pass */
        for (seen = 0; seen < unrolled; seen += 4, idx += inc_x * 4) {
            if (x[idx] > best)
                best = x[idx];
            if (x[idx + inc_x] > best)
                best = x[idx + inc_x];
            if (x[idx + 2 * inc_x] > best)
                best = x[idx + 2 * inc_x];
            if (x[idx + 3 * inc_x] > best)
                best = x[idx + 3 * inc_x];
        }

        /* leftover 0-3 elements */
        for (; seen < n; seen++, idx += inc_x) {
            if (x[idx] > best)
                best = x[idx];
        }

        return best;
    }

    /* contiguous input */
    {
        const BLASLONG blocked = n & -32;

        if (blocked > 0) {
            best = dmax_kernel_32(blocked, x);
            idx = blocked;
        } else {
            best = x[0];
            idx = 1;
        }
    }

    for (; idx < n; idx++) {
        if (x[idx] > best)
            best = x[idx];
    }

    return best;
}
|
||||
|
|
@ -0,0 +1,180 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unit-stride vector kernel returning the maximum of x[0 .. n-1], built
 * from compare (vfchdb) and select (vsel) pairs.
 *
 * Asm operands: %0 = result (FP register), %1 = n, %2 = x.
 * Registers:    r0 = loop counter (n >> 5), r1 = byte offset into x,
 *               v0 = running maximum (seeded from x[0], x[1]).
 *
 * Each iteration handles 32 elements (256 bytes) as two halves of eight
 * vectors.  In every compare/select pair the mask marks lanes where the
 * first operand is greater, and vsel keeps the first operand in those lanes
 * and the second operand elsewhere.
 *
 * n is shifted right by 5 to form the trip count, so only whole groups of
 * 32 elements are consumed; the caller in this file passes n & -32 and only
 * when that value is positive.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT result;

    __asm__ volatile (
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first half: bytes 0 .. 127 */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        "vfchdb %%v24,%%v16,%%v17 \n\t"
        "vfchdb %%v25,%%v18,%%v19 \n\t"
        "vfchdb %%v26,%%v20,%%v21 \n\t"
        "vfchdb %%v27,%%v22,%%v23 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v24,%%v25 \n\t"
        "vfchdb %%v29,%%v26,%%v27 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v28,%%v29 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        /* fold into the running maximum */
        "vfchdb %%v31,%%v30,%%v0 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* second half: bytes 128 .. 255 */
        "vl %%v16,128(%%r1,%2) \n\t"
        "vl %%v17,144(%%r1,%2) \n\t"
        "vl %%v18,160(%%r1,%2) \n\t"
        "vl %%v19,176(%%r1,%2) \n\t"
        "vl %%v20,192(%%r1,%2) \n\t"
        "vl %%v21,208(%%r1,%2) \n\t"
        "vl %%v22,224(%%r1,%2) \n\t"
        "vl %%v23,240(%%r1,%2) \n\t"

        "vfchdb %%v24,%%v16,%%v17 \n\t"
        "vfchdb %%v25,%%v18,%%v19 \n\t"
        "vfchdb %%v26,%%v20,%%v21 \n\t"
        "vfchdb %%v27,%%v22,%%v23 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v24,%%v25 \n\t"
        "vfchdb %%v29,%%v26,%%v27 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v28,%%v29 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchdb %%v31,%%v30,%%v0 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* reduce the two lanes of v0 and hand the scalar to the output */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfchdb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v17 \n\t"
        "ldr %0,%%f0 "
        :"=f"(result)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return result;
}
|
||||
|
||||
/*
 * Maximum of the n elements of x taken every inc_x elements.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * With inc_x == 1 the first n & -32 elements are handed to dmax_kernel_32
 * and the rest are compared one by one; with any other stride the whole
 * vector is compared in scalar code (unrolled by four).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT cur_max = 0.0;
    BLASLONG offset = 0;
    BLASLONG count = 0;

    if (n <= 0 || inc_x <= 0)
        return cur_max;

    if (inc_x == 1) {
        const BLASLONG vec_len = n & -32;

        if (vec_len > 0) {
            /* vectorised prefix */
            cur_max = dmax_kernel_32(vec_len, x);
            offset = vec_len;
        } else {
            /* short vector: seed from the first element */
            cur_max = x[0];
            offset = 1;
        }

        for (; offset < n; offset++) {
            if (x[offset] > cur_max)
                cur_max = x[offset];
        }
        return cur_max;
    }

    /* strided input */
    cur_max = x[0];

    {
        const BLASLONG quad_len = n & -4;

        for (; count < quad_len; count += 4) {
            if (x[offset] > cur_max)
                cur_max = x[offset];
            if (x[offset + inc_x] > cur_max)
                cur_max = x[offset + inc_x];
            if (x[offset + 2 * inc_x] > cur_max)
                cur_max = x[offset + 2 * inc_x];
            if (x[offset + 3 * inc_x] > cur_max)
                cur_max = x[offset + 3 * inc_x];

            offset += inc_x * 4;
        }
    }

    for (; count < n; count++) {
        if (x[offset] > cur_max)
            cur_max = x[offset];
        offset += inc_x;
    }

    return cur_max;
}
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unit-stride vector kernel returning the minimum of x[0 .. n-1].
 *
 * Asm operands: %0 = min (result, FP register), %1 = n, %2 = x.
 * Registers:    r0 = loop counter (n >> 5), r1 = byte offset into x,
 *               v0 = running minimum (seeded from x[0], x[1]).
 *
 * Each iteration loads 32 elements (256 bytes) into v16-v31 and folds them
 * pairwise with vfmindb down to v16, which is then folded into v0.  After
 * the loop the two lanes of v0 are reduced to a single value.
 *
 * n is shifted right by 5 to form the trip count, so only whole groups of
 * 32 elements are consumed; the caller in this file passes n & -32 and only
 * when that value is positive.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT min;

    __asm__ volatile (
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"
        "vl %%v24,128(%%r1,%2) \n\t"
        "vl %%v25,144(%%r1,%2) \n\t"
        "vl %%v26,160(%%r1,%2) \n\t"
        "vl %%v27,176(%%r1,%2) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"

        /* 16 -> 8 vectors */
        "vfmindb %%v16,%%v16,%%v24,0 \n\t"
        "vfmindb %%v17,%%v17,%%v25,0 \n\t"
        "vfmindb %%v18,%%v18,%%v26,0 \n\t"
        "vfmindb %%v19,%%v19,%%v27,0 \n\t"
        "vfmindb %%v20,%%v20,%%v28,0 \n\t"
        "vfmindb %%v21,%%v21,%%v29,0 \n\t"
        "vfmindb %%v22,%%v22,%%v30,0 \n\t"
        "vfmindb %%v23,%%v23,%%v31,0 \n\t"

        /* 8 -> 4 vectors */
        "vfmindb %%v16,%%v16,%%v20,0 \n\t"
        "vfmindb %%v17,%%v17,%%v21,0 \n\t"
        "vfmindb %%v18,%%v18,%%v22,0 \n\t"
        "vfmindb %%v19,%%v19,%%v23,0 \n\t"

        /* 4 -> 2 vectors */
        "vfmindb %%v16,%%v16,%%v18,0 \n\t"
        "vfmindb %%v17,%%v17,%%v19,0 \n\t"

        /* 2 -> 1 vector */
        "vfmindb %%v16,%%v16,%%v17,0 \n\t"

        /* fold the block minimum into the running minimum;
           the third operand must be the vector register v16 */
        "vfmindb %%v0,%%v0,%%v16,0 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* reduce the two lanes of v0 and hand the scalar to the output */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfmindb %%v0,%%v0,%%v16,0 \n\t"
        "ldr %0,%%f0 "
        :"=f"(min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return min;
}
|
||||
|
||||
/*
 * Return the smallest element of the n-element vector x, read with stride
 * inc_x (in elements).  Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -32 elements go through dmin_kernel_32 and
 * the remainder is scanned in scalar code.  Any other stride is scanned in
 * scalar code, four elements per iteration plus a remainder loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT best = 0.0;
    BLASLONG idx;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x != 1) {
        const BLASLONG unrolled = n & -4;

        best = x[0];
        idx = 0;

        /* four strided elements per pass */
        for (seen = 0; seen < unrolled; seen += 4, idx += inc_x * 4) {
            if (x[idx] < best)
                best = x[idx];
            if (x[idx + inc_x] < best)
                best = x[idx + inc_x];
            if (x[idx + 2 * inc_x] < best)
                best = x[idx + 2 * inc_x];
            if (x[idx + 3 * inc_x] < best)
                best = x[idx + 3 * inc_x];
        }

        /* leftover 0-3 elements */
        for (; seen < n; seen++, idx += inc_x) {
            if (x[idx] < best)
                best = x[idx];
        }

        return best;
    }

    /* contiguous input */
    {
        const BLASLONG blocked = n & -32;

        if (blocked > 0) {
            best = dmin_kernel_32(blocked, x);
            idx = blocked;
        } else {
            best = x[0];
            idx = 1;
        }
    }

    for (; idx < n; idx++) {
        if (x[idx] < best)
            best = x[idx];
    }

    return best;
}
|
||||
|
|
@ -0,0 +1,180 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unit-stride vector kernel returning the minimum of x[0 .. n-1], built
 * from compare (vfchdb) and select (vsel) pairs.
 *
 * Asm operands: %0 = result (FP register), %1 = n, %2 = x.
 * Registers:    r0 = loop counter (n >> 5), r1 = byte offset into x,
 *               v0 = running minimum (seeded from x[0], x[1]).
 *
 * Each iteration handles 32 elements (256 bytes) as two halves of eight
 * vectors.  The compare operands are swapped relative to the select
 * operands: the mask marks lanes where the second select operand is
 * greater, so vsel keeps the smaller value in every lane.
 *
 * n is shifted right by 5 to form the trip count, so only whole groups of
 * 32 elements are consumed; the caller in this file passes n & -32 and only
 * when that value is positive.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT result;

    __asm__ volatile (
        "vl %%v0,0(%2) \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first half: bytes 0 .. 127 */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        "vfchdb %%v24,%%v17,%%v16 \n\t"
        "vfchdb %%v25,%%v19,%%v18 \n\t"
        "vfchdb %%v26,%%v21,%%v20 \n\t"
        "vfchdb %%v27,%%v23,%%v22 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v25,%%v24 \n\t"
        "vfchdb %%v29,%%v27,%%v26 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v29,%%v28 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        /* fold into the running minimum */
        "vfchdb %%v31,%%v0,%%v30 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        /* second half: bytes 128 .. 255 */
        "vl %%v16,128(%%r1,%2) \n\t"
        "vl %%v17,144(%%r1,%2) \n\t"
        "vl %%v18,160(%%r1,%2) \n\t"
        "vl %%v19,176(%%r1,%2) \n\t"
        "vl %%v20,192(%%r1,%2) \n\t"
        "vl %%v21,208(%%r1,%2) \n\t"
        "vl %%v22,224(%%r1,%2) \n\t"
        "vl %%v23,240(%%r1,%2) \n\t"

        "vfchdb %%v24,%%v17,%%v16 \n\t"
        "vfchdb %%v25,%%v19,%%v18 \n\t"
        "vfchdb %%v26,%%v21,%%v20 \n\t"
        "vfchdb %%v27,%%v23,%%v22 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"

        "vfchdb %%v28,%%v25,%%v24 \n\t"
        "vfchdb %%v29,%%v27,%%v26 \n\t"
        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"

        "vfchdb %%v30,%%v29,%%v28 \n\t"
        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"

        "vfchdb %%v31,%%v0,%%v30 \n\t"
        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* reduce the two lanes of v0 and hand the scalar to the output */
        "vrepg %%v16,%%v0,1 \n\t"
        "wfchdb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v17 \n\t"
        "ldr %0,%%f0 "
        :"=f"(result)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return result;
}
|
||||
|
||||
/*
 * Minimum of the n elements of x taken every inc_x elements.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * With inc_x == 1 the first n & -32 elements are handed to dmin_kernel_32
 * and the rest are compared one by one; with any other stride the whole
 * vector is compared in scalar code (unrolled by four).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT cur_min = 0.0;
    BLASLONG offset = 0;
    BLASLONG count = 0;

    if (n <= 0 || inc_x <= 0)
        return cur_min;

    if (inc_x == 1) {
        const BLASLONG vec_len = n & -32;

        if (vec_len > 0) {
            /* vectorised prefix */
            cur_min = dmin_kernel_32(vec_len, x);
            offset = vec_len;
        } else {
            /* short vector: seed from the first element */
            cur_min = x[0];
            offset = 1;
        }

        for (; offset < n; offset++) {
            if (x[offset] < cur_min)
                cur_min = x[offset];
        }
        return cur_min;
    }

    /* strided input */
    cur_min = x[0];

    {
        const BLASLONG quad_len = n & -4;

        for (; count < quad_len; count += 4) {
            if (x[offset] < cur_min)
                cur_min = x[offset];
            if (x[offset + inc_x] < cur_min)
                cur_min = x[offset + inc_x];
            if (x[offset + 2 * inc_x] < cur_min)
                cur_min = x[offset + 2 * inc_x];
            if (x[offset + 3 * inc_x] < cur_min)
                cur_min = x[offset + 3 * inc_x];

            offset += inc_x * 4;
        }
    }

    for (; count < n; count++) {
        if (x[offset] < cur_min)
            cur_min = x[offset];
        offset += inc_x;
    }

    return cur_min;
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
|
||||
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||
{
|
||||
__asm__ (
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"lgdr %%r1,%[cos] \n\t"
|
||||
"vlvgp %%v0,%%r1,%%r1 \n\t"
|
||||
"lgdr %%r1,%[sin] \n\t"
|
||||
"vlvgp %%v1,%%r1,%%r1 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[n])x),
|
||||
[mem_y] "+m" (*(double (*)[n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
|
||||
: "cc", "r1" ,"v0","v1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
__asm__ (
|
||||
"vlrepg %%v0,%3 \n\t"
|
||||
"vlrepg %%v1,%4 \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
|
|
@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
||||
drot_kernel_32(n1, x, y, c, s);
|
||||
FLOAT cosa,sina;
|
||||
cosa=c;
|
||||
sina=s;
|
||||
drot_kernel_32(n1, x, y, &cosa, &sina);
|
||||
i=n1;
|
||||
}
|
||||
|
||||
|
|
@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef Z13_A
|
||||
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
||||
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"vlrepg %%v0,%1 \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%2) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
||||
"vl %%v25, 16(%%r1,%2) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v0 \n\t"
|
||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
||||
"vl %%v26, 32(%%r1,%2) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
||||
"vl %%v27, 48(%%r1,%2) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v0 \n\t"
|
||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
||||
"vl %%v24, 64(%%r1,%2) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vst %%v24, 64(%%r1,%2) \n\t"
|
||||
"vl %%v25, 80(%%r1,%2) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v0 \n\t"
|
||||
"vst %%v25, 80(%%r1,%2) \n\t"
|
||||
"vl %%v26, 96(%%r1,%2) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vst %%v26, 96(%%r1,%2) \n\t"
|
||||
"vl %%v27, 112(%%r1,%2) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v0 \n\t"
|
||||
"vst %%v27, 112(%%r1,%2) \n\t"
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||
"srlg %[n],%[n],4 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr], 128(%[x_ptr]) \n\t"
|
||||
"aghik %[n], %[n], -1 \n\t"
|
||||
"jle 2f \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vfmdb %%v24, %%v16, %%v0 \n\t"
|
||||
"vfmdb %%v25, %%v17, %%v0 \n\t"
|
||||
"vfmdb %%v26, %%v18, %%v0 \n\t"
|
||||
"vfmdb %%v27, %%v19, %%v1 \n\t"
|
||||
"vlm %%v16,%%v19, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28, %%v20, %%v0 \n\t"
|
||||
"vfmdb %%v29, %%v21, %%v1 \n\t"
|
||||
"vfmdb %%v30, %%v22, %%v0 \n\t"
|
||||
"vfmdb %%v31, %%v23, %%v1 \n\t"
|
||||
"vlm %%v20,%%v23, 64(%[x_ptr]) \n\t"
|
||||
"lay %[x_ptr], -128(%[x_ptr]) \n\t"
|
||||
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr],256(%[x_ptr]) \n\t"
|
||||
"brctg %[n],1b \n\t"
|
||||
"2: \n\t"
|
||||
"vfmdb %%v24, %%v16, %%v0 \n\t"
|
||||
"vfmdb %%v25, %%v17, %%v1 \n\t"
|
||||
"vfmdb %%v26, %%v18, %%v0 \n\t"
|
||||
"vfmdb %%v27, %%v19, %%v1 \n\t"
|
||||
"lay %[x_ptr] , -128(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28, %%v20, %%v0 \n\t"
|
||||
"vfmdb %%v29, %%v21, %%v1 \n\t"
|
||||
"vfmdb %%v30, %%v22, %%v0 \n\t"
|
||||
"vfmdb %%v31, %%v23, %%v1 \n\t"
|
||||
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
|
||||
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n)
|
||||
: [alpha] "f"(da)
|
||||
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
#else
|
||||
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
||||
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
|
||||
/* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%[x_ptr]) \n\t"
|
||||
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t"
|
||||
"vfmdb %%v17,%%v17,%%v1 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v1 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v1 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v1 \n\t"
|
||||
"vstm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||
"vlm %%v24,%%v31,128(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v1 \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v1 \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v1 \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v0 \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v1 \n\t"
|
||||
"vstm %%v24,%%v31,128(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr], 256(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n),[alpha] "f"(da)
|
||||
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
}
|
||||
#endif
|
||||
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
|
||||
{
|
||||
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 32(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 48(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 64(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 80(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 96(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 112(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 128(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 144(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 160(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 176(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 192(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 208(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 224(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 240(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr],256(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n)
|
||||
:"cc" , "r0", "v24" ,"v25"
|
||||
);
|
||||
"vst %%v24,0(%%r1,%1) \n\t"
|
||||
"vst %%v25,16(%%r1,%1) \n\t"
|
||||
"vst %%v26,32(%%r1,%1) \n\t"
|
||||
"vst %%v27,48(%%r1,%1) \n\t"
|
||||
"vst %%v24,64(%%r1,%1) \n\t"
|
||||
"vst %%v25,80(%%r1,%1) \n\t"
|
||||
"vst %%v26,96(%%r1,%1) \n\t"
|
||||
"vst %%v27,112(%%r1,%1) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0,j=0;
|
||||
|
|
@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
if ( da == 0.0 )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
||||
dscal_kernel_32_zero(n1 , x);
|
||||
dscal_kernel_16_zero(n1, x);
|
||||
j=n1;
|
||||
}
|
||||
|
||||
|
|
@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
else
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dscal_kernel_32(n1 , da , x);
|
||||
dscal_kernel_16(n1, da, x);
|
||||
j=n1;
|
||||
}
|
||||
while(j < n)
|
||||
|
|
@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
}
|
||||
return 0;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,169 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018,The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms,with or without
|
||||
modification,are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice,this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice,this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Inline-assembly core of dsdot for contiguous vectors.
 *
 * Each loop iteration consumes 16 elements of x and of y: the loop count is
 * n >> 4 and the byte offset advances by 64 per iteration, i.e. 4 bytes per
 * element.  Every 32-bit element is widened to double (vflls) before the
 * multiply-add (vfmadb), and the accumulated sum is returned as a double.
 *
 *   n     element count.  The loop is counted with brctg, so n is expected
 *         to be a non-zero multiple of 16; the caller in this file passes
 *         n & -16 and skips the call when that is zero.
 *   x, y  input vectors; the assembly only loads from them.
 *
 * NOTE(review): the exact text substituted for the "ZR" operands %2/%3
 * inside the d(x,b) address forms depends on the s390 back end - confirm
 * against the GCC machine-constraint documentation before editing.
 */
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
double dot;
|
||||
|
||||
__asm__ volatile (
|
||||
/* v0 = running sum (two double lanes), r0 = n / 16 iterations,
   r1 = byte offset into x and y */
"vzero %%v0 \n\t"
|
||||
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch hints 1024 bytes ahead of the current position in x and y */
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
|
||||
/* load 16 consecutive 4-byte elements of x, two per register, into
   word elements 0 and 2 of v16..v23 */
"vlef %%v16,0(%%r1,%2),0 \n\t"
|
||||
"vlef %%v16,4(%%r1,%2),2 \n\t"
|
||||
"vlef %%v17,8(%%r1,%2),0 \n\t"
|
||||
"vlef %%v17,12(%%r1,%2),2 \n\t"
|
||||
"vlef %%v18,16(%%r1,%2),0 \n\t"
|
||||
"vlef %%v18,20(%%r1,%2),2 \n\t"
|
||||
"vlef %%v19,24(%%r1,%2),0 \n\t"
|
||||
"vlef %%v19,28(%%r1,%2),2 \n\t"
|
||||
"vlef %%v20,32(%%r1,%2),0 \n\t"
|
||||
"vlef %%v20,36(%%r1,%2),2 \n\t"
|
||||
"vlef %%v21,40(%%r1,%2),0 \n\t"
|
||||
"vlef %%v21,44(%%r1,%2),2 \n\t"
|
||||
"vlef %%v22,48(%%r1,%2),0 \n\t"
|
||||
"vlef %%v22,52(%%r1,%2),2 \n\t"
|
||||
"vlef %%v23,56(%%r1,%2),0 \n\t"
|
||||
"vlef %%v23,60(%%r1,%2),2 \n\t"
|
||||
|
||||
/* widen the loaded x values from single to double precision */
"vflls %%v16,%%v16 \n\t"
|
||||
"vflls %%v17,%%v17 \n\t"
|
||||
"vflls %%v18,%%v18 \n\t"
|
||||
"vflls %%v19,%%v19 \n\t"
|
||||
"vflls %%v20,%%v20 \n\t"
|
||||
"vflls %%v21,%%v21 \n\t"
|
||||
"vflls %%v22,%%v22 \n\t"
|
||||
"vflls %%v23,%%v23 \n\t"
|
||||
|
||||
/* for each pair of y elements: load, widen, then v0 += x_pair * y_pair */
"vlef %%v24,0(%%r1,%3),0 \n\t"
|
||||
"vlef %%v24,4(%%r1,%3),2 \n\t"
|
||||
"vflls %%v24,%%v24 \n\t"
|
||||
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
|
||||
"vlef %%v25,8(%%r1,%3),0 \n\t"
|
||||
"vlef %%v25,12(%%r1,%3),2 \n\t"
|
||||
"vflls %%v25,%%v25 \n\t"
|
||||
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
|
||||
"vlef %%v26,16(%%r1,%3),0 \n\t"
|
||||
"vlef %%v26,20(%%r1,%3),2 \n\t"
|
||||
"vflls %%v26,%%v26 \n\t"
|
||||
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
|
||||
"vlef %%v27,24(%%r1,%3),0 \n\t"
|
||||
"vlef %%v27,28(%%r1,%3),2 \n\t"
|
||||
"vflls %%v27,%%v27 \n\t"
|
||||
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
|
||||
"vlef %%v28,32(%%r1,%3),0 \n\t"
|
||||
"vlef %%v28,36(%%r1,%3),2 \n\t"
|
||||
"vflls %%v28,%%v28 \n\t"
|
||||
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
|
||||
"vlef %%v29,40(%%r1,%3),0 \n\t"
|
||||
"vlef %%v29,44(%%r1,%3),2 \n\t"
|
||||
"vflls %%v29,%%v29 \n\t"
|
||||
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
|
||||
"vlef %%v30,48(%%r1,%3),0 \n\t"
|
||||
"vlef %%v30,52(%%r1,%3),2 \n\t"
|
||||
"vflls %%v30,%%v30 \n\t"
|
||||
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
|
||||
"vlef %%v31,56(%%r1,%3),0 \n\t"
|
||||
"vlef %%v31,60(%%r1,%3),2 \n\t"
|
||||
"vflls %%v31,%%v31 \n\t"
|
||||
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
|
||||
|
||||
/* advance 64 bytes (16 elements) and repeat while the count in r0,
   after decrement, is non-zero */
"agfi %%r1,64 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
/* horizontal reduction: copy lane 1 of v0 into v1, add it to lane 0
   (f0/f1 are the scalar views used by adbr), then move the sum to the
   output operand */
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
/* %0 = dot (output, FP register); %1 = n; %2 = x; %3 = y */
:"=f"(dot)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
|
||||
/* r0/r1 and the listed vector registers are used as scratch above;
   v2 and v3 are declared clobbered although this sequence never names them */
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return dot;
|
||||
}
|
||||
|
||||
/*
 * dsdot driver: dot product of two vectors whose elements have type FLOAT,
 * accumulated and returned in double precision.
 *
 *   n      number of elements; n <= 0 yields 0.0
 *   x, y   input vectors
 *   inc_x  stride (in elements) between consecutive elements of x
 *   inc_y  stride (in elements) between consecutive elements of y
 *
 * When both strides are 1, the largest multiple of 16 elements is handled
 * by dsdot_kernel_16 and the remaining 0..15 elements by a scalar loop.
 * Any other stride combination uses a scalar loop unrolled by two.
 *
 * Every scalar product widens both operands to double *before* the
 * multiplication.  dsdot_kernel_16 does the same (it widens each loaded
 * element with vflls ahead of vfmadb); multiplying in FLOAT precision first
 * would round each product before it reaches the double accumulator and
 * make the scalar paths less accurate than, and inconsistent with, the
 * vector path.
 */
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    double dot = 0.0;

    if (n <= 0)
        return dot;

    if ((inc_x == 1) && (inc_y == 1)) {
        /* Contiguous case: vector kernel on the 16-aligned prefix. */
        BLASLONG n1 = n & -16;

        if (n1)
            dot = dsdot_kernel_16(n1, x, y);

        /* Scalar tail, products formed in double precision. */
        for (i = n1; i < n; i++)
            dot += (double) y[i] * (double) x[i];

        return dot;
    }

    /* Strided case: two elements per iteration, then at most one leftover.
     * The two products are summed first and then added to the accumulator,
     * matching the association of the original expression. */
    BLASLONG n1 = n & -2;

    while (i < n1) {
        dot += (double) y[iy] * (double) x[ix]
             + (double) y[iy + inc_y] * (double) x[ix + inc_x];
        ix += inc_x * 2;
        iy += inc_y * 2;
        i += 2;
    }

    while (i < n) {
        dot += (double) y[iy] * (double) x[ix];
        ix += inc_x;
        iy += inc_y;
        i++;
    }

    return dot;
}
|
||||
|
||||
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
|
||||
#if defined(Z13_SWAP_A)
|
||||
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
__asm__ volatile(
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[n])x),
|
||||
[mem_y] "+m" (*(double (*)[n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Swap the contents of x and y using z/Architecture vector loads/stores.
 *
 * Each loop iteration moves 256 bytes (32 doubles) of each vector: the x
 * chunk is parked in v16..v31, the y chunk is copied into x through v0..v7
 * in two 128-byte halves, and finally v16..v31 are stored into y.
 *
 * The loop count is n >> 5 and the loop is driven by brctg, so n is expected
 * to be a positive multiple of 32.
 *
 * n - number of elements to swap
 * x - first vector  (read and written)
 * y - second vector (read and written)
 */
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile(
        "pfd 2, 0(%[ptr_x]) \n\t"
        "pfd 2, 0(%[ptr_y]) \n\t"
        /* n_tmp = number of 32-element chunks, r1 = running byte offset */
        "srlg %[n_tmp],%[n_tmp],5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        ".align 16 \n\t"
        "1: \n\t"
        "pfd 2, 256(%%r1,%[ptr_x]) \n\t"
        "pfd 2, 256(%%r1,%[ptr_y]) \n\t"

        /* park the whole 256-byte x chunk in v16..v31 */
        "vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
        "vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
        "vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
        "vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
        "vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
        "vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
        "vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
        "vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
        "vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
        "vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
        "vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
        "vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
        "vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
        "vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
        "vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
        "vl %%v31, 240(%%r1,%[ptr_x]) \n\t"

        /* y[0..127 bytes] -> x */
        "vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
        "vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
        "vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
        "vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
        "vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
        "vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
        "vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
        "vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
        "vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
        "vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
        "vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
        "vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
        "vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
        "vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
        "vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
        "vst %%v7, 112(%%r1,%[ptr_x]) \n\t"

        /* y[128..255 bytes] -> x */
        "vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
        "vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
        "vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
        "vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
        "vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
        "vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
        "vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
        "vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
        "vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
        "vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
        "vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
        "vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
        "vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
        "vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
        "vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
        "vst %%v7, 240(%%r1,%[ptr_x]) \n\t"

        /* parked x chunk -> y */
        "vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
        "vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
        "vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
        "vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
        "vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
        "vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
        "vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
        "vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
        "vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
        "vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
        "vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
        "vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
        "vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
        "vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
        "vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
        "vst %%v31, 240(%%r1,%[ptr_y]) \n\t"

        /* advance to the next 256-byte chunk and loop while chunks remain */
        "la %%r1,256(%%r1) \n\t"
        "brctg %[n_tmp],1b"
        : [mem_x] "+m" (*(double (*)[n])x),
          [mem_y] "+m" (*(double (*)[n])y),
          [n_tmp] "+&r"(n)
        : [ptr_x] "a"(x), [ptr_y] "a"(y)
        : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
          "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
    return;
}
|
||||
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
|||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,311 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vectorised search for the complex element with the largest |re| + |im|.
 *
 * x holds interleaved (re, im) single-precision pairs.  Each loop iteration
 * consumes 256 bytes (32 complex elements) in two 128-byte halves; the loop
 * count is n >> 5 and the loop is driven by brctg, so n is expected to be a
 * positive multiple of 32 (the caller passes n & -32 and only when non-zero).
 *
 * n    - number of complex elements to scan
 * x    - input vector, 2 * n floats
 * amax - (out) receives the selected |re| + |im| value
 *
 * Returns the 0-based index of the selected element.
 */
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* seed: v0 = |re| + |im| of the first four complex elements */
        "vlef %%v0,0(%3),0 \n\t"
        "vlef %%v1,4(%3),0 \n\t"
        "vlef %%v0,8(%3),1 \n\t"
        "vlef %%v1,12(%3),1 \n\t"
        "vlef %%v0,16(%3),2 \n\t"
        "vlef %%v1,20(%3),2 \n\t"
        "vlef %%v0,24(%3),3 \n\t"
        "vlef %%v1,28(%3),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v1,%%v1 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        /* 64-bit index lanes matching the seed: v1 = {0,2}, v2 = {1,3};
           v3 = index step (16) added after every half, v4 = running base */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,16 \n\t"
        "vzero %%v4 \n\t"
        /* v9: vperm byte pattern picking bytes 0-3, 8-11, 16-19, 24-27,
           i.e. the even 32-bit words (real parts) of a register pair */
        "vleib %%v9,0,0 \n\t"
        "vleib %%v9,1,1 \n\t"
        "vleib %%v9,2,2 \n\t"
        "vleib %%v9,3,3 \n\t"
        "vleib %%v9,8,4 \n\t"
        "vleib %%v9,9,5 \n\t"
        "vleib %%v9,10,6 \n\t"
        "vleib %%v9,11,7 \n\t"
        "vleib %%v9,16,8 \n\t"
        "vleib %%v9,17,9 \n\t"
        "vleib %%v9,18,10 \n\t"
        "vleib %%v9,19,11 \n\t"
        "vleib %%v9,24,12 \n\t"
        "vleib %%v9,25,13 \n\t"
        "vleib %%v9,26,14 \n\t"
        "vleib %%v9,27,15 \n\t"
        /* v24..v27: 32-bit lane indices 0..15 within one half */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        /* r0 = n / 32 iterations, r1 = byte offset into x */
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* first half: split 16 complex elements into real / imaginary */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v28,16(%%r1,%3) \n\t"
        "vpkg %%v17,%%v16,%%v28 \n\t"
        "vperm %%v16,%%v16,%%v28,%%v9 \n\t"

        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v29,48(%%r1,%3) \n\t"
        "vpkg %%v19,%%v18,%%v29 \n\t"
        "vperm %%v18,%%v18,%%v29,%%v9 \n\t"

        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v30,80(%%r1,%3) \n\t"
        "vpkg %%v21,%%v20,%%v30 \n\t"
        "vperm %%v20,%%v20,%%v30,%%v9 \n\t"

        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v31,112(%%r1,%3) \n\t"
        "vpkg %%v23,%%v22,%%v31 \n\t"
        "vperm %%v22,%%v22,%%v31,%%v9 \n\t"

        /* |re| + |im| for the 16 elements -> v16..v19 */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"

        /* compare-and-select: keep the larger value and its lane index */
        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"

        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* widen the 32-bit indices to 64 bit and add the running base */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* fold into the running result (v0 values, v1/v2 indices) */
        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* second half: same sequence for bytes 128..255 of the chunk */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v28,144(%%r1,%3) \n\t"
        "vpkg %%v17,%%v16,%%v28 \n\t"
        "vperm %%v16,%%v16,%%v28,%%v9 \n\t"

        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v29,176(%%r1,%3) \n\t"
        "vpkg %%v19,%%v18,%%v29 \n\t"
        "vperm %%v18,%%v18,%%v29,%%v9 \n\t"

        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v30,208(%%r1,%3) \n\t"
        "vpkg %%v21,%%v20,%%v30 \n\t"
        "vperm %%v20,%%v20,%%v30,%%v9 \n\t"

        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v31,240(%%r1,%3) \n\t"
        "vpkg %%v23,%%v22,%%v31 \n\t"
        "vperm %%v22,%%v22,%%v31,%%v9 \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"

        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"

        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* next 256-byte chunk */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* horizontal reduction of the four running lanes; on equal values
           the index comparison (vchlg) takes part in the choice */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* last two candidates: equal values -> smaller index (vmnlg),
           otherwise select by value; store value to *amax, index to %0 */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamax),"=m"(*amax)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamax;
}
|
||||
|
||||
/*
 * Index of the complex element of x with the largest |re| + |im| (CABS1).
 *
 * n     - number of complex elements
 * x     - vector of interleaved (re, im) pairs
 * inc_x - stride between consecutive elements, counted in complex elements
 *
 * Returns a 1-based index, or 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading n & -32 elements are handled by
 * icamax_kernel_32; everything else is scanned by the scalar loop, in which
 * only a strictly larger value replaces the current maximum.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG max = 0;
    FLOAT maxf = 0;
    BLASLONG stride;
    BLASLONG n1;

    if (n <= 0 || inc_x <= 0) return 0;

    /* x is indexed in floats, so one complex step is two floats per inc_x */
    stride = 2 * inc_x;

    /* the vector kernel only supports contiguous data in blocks of 32 */
    n1 = (inc_x == 1) ? (n & -32) : 0;

    if (n1 > 0) {
        max = icamax_kernel_32(n1, x, &maxf);
        i = n1;
        ix = n1 * 2;
    } else {
        /* no vectorised prefix: seed the scan with element 0 */
        maxf = CABS1(x, 0);
        i = 1;
        ix = stride;
    }

    for (; i < n; i++, ix += stride) {
        FLOAT v = CABS1(x, ix);

        if (v > maxf) {
            max = i;
            maxf = v;
        }
    }

    /* internal indices are 0-based, the result is 1-based */
    return max + 1;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,311 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vectorised search for the complex element with the smallest |re| + |im|.
 *
 * x holds interleaved (re, im) single-precision pairs.  Each loop iteration
 * consumes 256 bytes (32 complex elements) in two 128-byte halves; the loop
 * count is n >> 5 and the loop is driven by brctg, so n is expected to be a
 * positive multiple of 32 (the caller passes n & -32 and only when non-zero).
 *
 * n    - number of complex elements to scan
 * x    - input vector, 2 * n floats
 * amin - (out) receives the selected |re| + |im| value
 *
 * Returns the 0-based index of the selected element.
 */
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* seed: v0 = |re| + |im| of the first four complex elements */
        "vlef %%v0,0(%3),0 \n\t"
        "vlef %%v1,4(%3),0 \n\t"
        "vlef %%v0,8(%3),1 \n\t"
        "vlef %%v1,12(%3),1 \n\t"
        "vlef %%v0,16(%3),2 \n\t"
        "vlef %%v1,20(%3),2 \n\t"
        "vlef %%v0,24(%3),3 \n\t"
        "vlef %%v1,28(%3),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v1,%%v1 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        /* 64-bit index lanes matching the seed: v1 = {0,2}, v2 = {1,3};
           v3 = index step (16) added after every half, v4 = running base */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,16 \n\t"
        "vzero %%v4 \n\t"
        /* v9: vperm byte pattern picking bytes 0-3, 8-11, 16-19, 24-27,
           i.e. the even 32-bit words (real parts) of a register pair */
        "vleib %%v9,0,0 \n\t"
        "vleib %%v9,1,1 \n\t"
        "vleib %%v9,2,2 \n\t"
        "vleib %%v9,3,3 \n\t"
        "vleib %%v9,8,4 \n\t"
        "vleib %%v9,9,5 \n\t"
        "vleib %%v9,10,6 \n\t"
        "vleib %%v9,11,7 \n\t"
        "vleib %%v9,16,8 \n\t"
        "vleib %%v9,17,9 \n\t"
        "vleib %%v9,18,10 \n\t"
        "vleib %%v9,19,11 \n\t"
        "vleib %%v9,24,12 \n\t"
        "vleib %%v9,25,13 \n\t"
        "vleib %%v9,26,14 \n\t"
        "vleib %%v9,27,15 \n\t"
        /* v24..v27: 32-bit lane indices 0..15 within one half */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        /* r0 = n / 32 iterations, r1 = byte offset into x */
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* first half: split 16 complex elements into real / imaginary */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v28,16(%%r1,%3) \n\t"
        "vpkg %%v17,%%v16,%%v28 \n\t"
        "vperm %%v16,%%v16,%%v28,%%v9 \n\t"

        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v29,48(%%r1,%3) \n\t"
        "vpkg %%v19,%%v18,%%v29 \n\t"
        "vperm %%v18,%%v18,%%v29,%%v9 \n\t"

        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v30,80(%%r1,%3) \n\t"
        "vpkg %%v21,%%v20,%%v30 \n\t"
        "vperm %%v20,%%v20,%%v30,%%v9 \n\t"

        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v31,112(%%r1,%3) \n\t"
        "vpkg %%v23,%%v22,%%v31 \n\t"
        "vperm %%v22,%%v22,%%v31,%%v9 \n\t"

        /* |re| + |im| for the 16 elements -> v16..v19 */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"

        /* compare-and-select: keep the smaller value and its lane index
           (compare operands are swapped relative to the max kernel) */
        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"

        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* widen the 32-bit indices to 64 bit and add the running base */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* fold into the running result (v0 values, v1/v2 indices) */
        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* second half: same sequence for bytes 128..255 of the chunk */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v28,144(%%r1,%3) \n\t"
        "vpkg %%v17,%%v16,%%v28 \n\t"
        "vperm %%v16,%%v16,%%v28,%%v9 \n\t"

        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v29,176(%%r1,%3) \n\t"
        "vpkg %%v19,%%v18,%%v29 \n\t"
        "vperm %%v18,%%v18,%%v29,%%v9 \n\t"

        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v30,208(%%r1,%3) \n\t"
        "vpkg %%v21,%%v20,%%v30 \n\t"
        "vperm %%v20,%%v20,%%v30,%%v9 \n\t"

        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v31,240(%%r1,%3) \n\t"
        "vpkg %%v23,%%v22,%%v31 \n\t"
        "vperm %%v22,%%v22,%%v31,%%v9 \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"

        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"

        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* next 256-byte chunk */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* horizontal reduction of the four running lanes; on equal values
           the index comparison (vchlg) takes part in the choice */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* last two candidates: equal values -> smaller index (vmnlg),
           otherwise select by value; store value to *amin, index to %0 */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamin),"=m"(*amin)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamin;
}
|
||||
|
||||
/*
 * Index of the complex element of x with the smallest |re| + |im| (CABS1).
 *
 * n     - number of complex elements
 * x     - vector of interleaved (re, im) pairs
 * inc_x - stride between consecutive elements, counted in complex elements
 *
 * Returns a 1-based index, or 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading n & -32 elements are handled by
 * icamin_kernel_32; everything else is scanned by the scalar loop, in which
 * only a strictly smaller value replaces the current minimum.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG min = 0;
    FLOAT minf = 0;
    BLASLONG stride;
    BLASLONG n1;

    if (n <= 0 || inc_x <= 0) return 0;

    /* x is indexed in floats, so one complex step is two floats per inc_x */
    stride = 2 * inc_x;

    /* the vector kernel only supports contiguous data in blocks of 32 */
    n1 = (inc_x == 1) ? (n & -32) : 0;

    if (n1 > 0) {
        min = icamin_kernel_32(n1, x, &minf);
        i = n1;
        ix = n1 * 2;
    } else {
        /* no vectorised prefix: seed the scan with element 0 */
        minf = CABS1(x, 0);
        i = 1;
        ix = stride;
    }

    for (; i < n; i++, ix += stride) {
        FLOAT v = CABS1(x, ix);

        if (v < minf) {
            min = i;
            minf = v;
        }
    }

    /* internal indices are 0-based, the result is 1-based */
    return min + 1;
}
|
||||
|
||||
|
||||
|
|
@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param maxf (out) maximum absolute value (output only)
|
||||
* @return index
|
||||
*/
|
||||
static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vleig %%v20,0,0 \n\t"
|
||||
"vleig %%v20,1,1 \n\t"
|
||||
"vleig %%v21,2,0 \n\t"
|
||||
"vleig %%v21,3,1 \n\t"
|
||||
"vleig %%v22,4,0 \n\t"
|
||||
"vleig %%v22,5,1 \n\t"
|
||||
"vleig %%v23,6,0 \n\t"
|
||||
"vleig %%v23,7,1 \n\t"
|
||||
"vrepig %%v4,8 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"vzero %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vfchdb %%v16,%%v25,%%v24 \n\t "
|
||||
"vfchdb %%v17,%%v27,%%v26 \n\t "
|
||||
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
|
||||
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
|
||||
"vfchdb %%v16,%%v29,%%v28 \n\t "
|
||||
"vfchdb %%v17,%%v31,%%v30 \n\t"
|
||||
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
|
||||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||
{
|
||||
BLASLONG iamax;
|
||||
|
||||
"vfchdb %%v28, %%v3,%%v0 \n\t"
|
||||
"vfchdb %%v29,%%v27, %%v25 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchdb %%v16,%%v25 , %%v0 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vfchdb %%v17, %%v29,%%v18 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vfchdb %%v16,%%v25,%%v24 \n\t "
|
||||
"vfchdb %%v17,%%v27,%%v26 \n\t "
|
||||
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
|
||||
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
|
||||
"vfchdb %%v16,%%v29,%%v28 \n\t "
|
||||
"vfchdb %%v17,%%v31,%%v30 \n\t"
|
||||
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
|
||||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
__asm__ volatile (
|
||||
"vl %%v0,0(%3) \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
"vrepig %%v2,16 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"vleig %%v28,8,0 \n\t"
|
||||
"vleig %%v28,9,1 \n\t"
|
||||
"vleig %%v29,10,0 \n\t"
|
||||
"vleig %%v29,11,1 \n\t"
|
||||
"vleig %%v30,12,0 \n\t"
|
||||
"vleig %%v30,13,1 \n\t"
|
||||
"vleig %%v31,14,0 \n\t"
|
||||
"vleig %%v31,15,1 \n\t"
|
||||
"srlg %%r0,%2,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
"vfchdb %%v28, %%v3,%%v0 \n\t"
|
||||
"vfchdb %%v29,%%v27, %%v25 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchdb %%v16,%%v25 , %%v0 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vfchdb %%v17, %%v29,%%v18 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
"vl %%v18,32(%%r1,%3) \n\t"
|
||||
"vl %%v19,48(%%r1,%3) \n\t"
|
||||
"vl %%v20,64(%%r1,%3) \n\t"
|
||||
"vl %%v21,80(%%r1,%3) \n\t"
|
||||
"vl %%v22,96(%%r1,%3) \n\t"
|
||||
"vl %%v23,112(%%r1,%3) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchedb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v5,%%v18,%%v19 \n\t"
|
||||
"vfchedb %%v6,%%v20,%%v21 \n\t"
|
||||
"vfchedb %%v7,%%v22,%%v23 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vrepg %%v26,%%v18,1 \n\t"
|
||||
"vrepg %%v5,%%v19,1 \n\t"
|
||||
"wfcdb %%v26,%%v18 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v18,%[maxf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v19 \n\t"
|
||||
"j 3f \n\t"
|
||||
"vfchedb %%v20,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v21,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v26,%%v18 \n\t"
|
||||
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
|
||||
"std %%f0,%[maxf] \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return index;
|
||||
"vfchedb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchedb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"vl %%v16,128(%%r1,%3) \n\t"
|
||||
"vl %%v17,144(%%r1,%3) \n\t"
|
||||
"vl %%v18,160(%%r1,%3) \n\t"
|
||||
"vl %%v19,176(%%r1,%3) \n\t"
|
||||
"vl %%v20,192(%%r1,%3) \n\t"
|
||||
"vl %%v21,208(%%r1,%3) \n\t"
|
||||
"vl %%v22,224(%%r1,%3) \n\t"
|
||||
"vl %%v23,240(%%r1,%3) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchedb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v5,%%v18,%%v19 \n\t"
|
||||
"vfchedb %%v6,%%v20,%%v21 \n\t"
|
||||
"vfchedb %%v7,%%v22,%%v23 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vfchedb %%v20,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v21,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"vfchedb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchedb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
"wfchdb %%v4,%%v2,%%v0 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(iamax),"=m"(*amax)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return iamax;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG j = 0;
|
||||
BLASLONG ix = 0;
|
||||
FLOAT maxf = 0.0;
|
||||
BLASLONG max = 0;
|
||||
|
||||
|
|
@ -191,10 +200,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
|||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
max = diamax_kernel_32_TUNED(n1, x, &maxf);
|
||||
max = idamax_kernel_32(n1, x, &maxf);
|
||||
|
||||
i = n1;
|
||||
}
|
||||
else
|
||||
{
|
||||
maxf = ABS(x[0]);
|
||||
i++;
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) > maxf) {
|
||||
|
|
@ -207,6 +221,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
|||
|
||||
} else {
|
||||
|
||||
max = 0;
|
||||
maxf = ABS(x[0]);
|
||||
|
||||
BLASLONG n1 = n & -4;
|
||||
while (j < n1) {
|
||||
|
||||
|
|
|
|||
|
|
@ -23,194 +23,192 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Find the index of the element with the minimum absolute value
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (out) receives the minimum absolute value (output only)
|
||||
* @return minimum index
|
||||
*/
|
||||
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vleig %%v20,0,0 \n\t"
|
||||
"vleig %%v20,1,1 \n\t"
|
||||
"vleig %%v21,2,0 \n\t"
|
||||
"vleig %%v21,3,1 \n\t"
|
||||
"vleig %%v22,4,0 \n\t"
|
||||
"vleig %%v22,5,1 \n\t"
|
||||
"vleig %%v23,6,0 \n\t"
|
||||
"vleig %%v23,7,1 \n\t"
|
||||
"vrepig %%v4,8 \n\t"
|
||||
"vlrepg %%v18,0(%[ptr_x]) \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
|
||||
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||
{
|
||||
BLASLONG iamin;
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
__asm__ volatile (
|
||||
"vl %%v0,0(%3) \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
"vrepig %%v2,16 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"vleig %%v28,8,0 \n\t"
|
||||
"vleig %%v28,9,1 \n\t"
|
||||
"vleig %%v29,10,0 \n\t"
|
||||
"vleig %%v29,11,1 \n\t"
|
||||
"vleig %%v30,12,0 \n\t"
|
||||
"vleig %%v30,13,1 \n\t"
|
||||
"vleig %%v31,14,0 \n\t"
|
||||
"vleig %%v31,15,1 \n\t"
|
||||
"srlg %%r0,%2,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
"vfchdb %%v16,%%v24,%%v25 \n\t "
|
||||
"vfchdb %%v17,%%v26 ,%%v27 \n\t "
|
||||
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
|
||||
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
|
||||
"vfchdb %%v16,%%v28, %%v29 \n\t "
|
||||
"vfchdb %%v17,%%v30,%%v31 \n\t"
|
||||
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
|
||||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
"vl %%v18,32(%%r1,%3) \n\t"
|
||||
"vl %%v19,48(%%r1,%3) \n\t"
|
||||
"vl %%v20,64(%%r1,%3) \n\t"
|
||||
"vl %%v21,80(%%r1,%3) \n\t"
|
||||
"vl %%v22,96(%%r1,%3) \n\t"
|
||||
"vl %%v23,112(%%r1,%3) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchedb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v5,%%v19,%%v18 \n\t"
|
||||
"vfchedb %%v6,%%v21,%%v20 \n\t"
|
||||
"vfchedb %%v7,%%v23,%%v22 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vfchedb %%v20,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v21,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"vfchdb %%v28,%%v0 , %%v3 \n\t"
|
||||
"vfchdb %%v29, %%v25,%%v27 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vfchedb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchedb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"vfchdb %%v16, %%v0,%%v25 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vl %%v16,128(%%r1,%3) \n\t"
|
||||
"vl %%v17,144(%%r1,%3) \n\t"
|
||||
"vl %%v18,160(%%r1,%3) \n\t"
|
||||
"vl %%v19,176(%%r1,%3) \n\t"
|
||||
"vl %%v20,192(%%r1,%3) \n\t"
|
||||
"vl %%v21,208(%%r1,%3) \n\t"
|
||||
"vl %%v22,224(%%r1,%3) \n\t"
|
||||
"vl %%v23,240(%%r1,%3) \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfchdb %%v17,%%v18, %%v29 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
"vfchedb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v5,%%v19,%%v18 \n\t"
|
||||
"vfchedb %%v6,%%v21,%%v20 \n\t"
|
||||
"vfchedb %%v7,%%v23,%%v22 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vfchedb %%v20,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v21,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vfchedb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchdb %%v16,%%v24,%%v25 \n\t"
|
||||
"vfchdb %%v17,%%v26 ,%%v27 \n\t"
|
||||
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
|
||||
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
|
||||
"vfchdb %%v16,%%v28 ,%%v29 \n\t"
|
||||
"vfchdb %%v17,%%v30,%%v31 \n\t"
|
||||
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
|
||||
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
|
||||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
"vfchedb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
"vfchdb %%v28,%%v0 , %%v3 \n\t"
|
||||
"vfchdb %%v29, %%v25,%%v27 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
|
||||
"vfchdb %%v16, %%v0,%%v25 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
|
||||
"vfchdb %%v17,%%v18, %%v29 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
|
||||
|
||||
"vrepg %%v26,%%v18,1 \n\t"
|
||||
"vrepg %%v5,%%v19,1 \n\t"
|
||||
"wfcdb %%v26,%%v18 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v18,%[minf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v19 \n\t"
|
||||
"j 3f \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v18 ,%%v26 \n\t "
|
||||
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
|
||||
"std %%f0,%[minf] \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
|
||||
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
|
||||
);
|
||||
|
||||
return index;
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
"wfchdb %%v4,%%v0,%%v2 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(iamin),"=m"(*amin)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return iamin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG j = 0;
|
||||
BLASLONG ix = 0;
|
||||
BLASLONG min = 0;
|
||||
FLOAT minf = 0.0;
|
||||
|
||||
BLASLONG min = 0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return (min);
|
||||
minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant
|
||||
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = diamin_kernel_32(n1, x, &minf);
|
||||
min = idamin_kernel_32(n1, x, &minf);
|
||||
|
||||
i = n1;
|
||||
}
|
||||
else
|
||||
{
|
||||
minf = ABS(x[0]);
|
||||
i++;
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) < minf) {
|
||||
|
|
@ -223,6 +221,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
|||
|
||||
} else {
|
||||
|
||||
min = 0;
|
||||
minf = ABS(x[0]);
|
||||
|
||||
BLASLONG n1 = n & -4;
|
||||
while (j < n1) {
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,240 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * idmax_kernel_32 - vectorised search for the largest element of x (signed
 * comparison; no absolute value is taken) written in s390x vector assembly.
 *
 * n    element count. The loop counter is n >> 5 and every iteration consumes
 *      256 bytes of x, so n is expected to be a positive multiple of 32; the
 *      caller in this file passes n & -32 after checking that it is > 0.
 * x    input vector, read with unit stride.
 * max  (out) receives the largest value found.
 *
 * Returns the index taken from the index register that travels with the
 * winning value. The caller adds 1 to it, so it is presumably 0-based.
 *
 * Register roles, as set up by the code below:
 *   v0       running per-lane maximum (seeded with the first 16 bytes of x)
 *   v1       indices that belong to the values in v0 (seeded with 0, 1)
 *   v2       constant 16 in both lanes: elements consumed per half iteration
 *   v3       base index of the half block being processed (starts at 0)
 *   v24-v31  constant lane indices 0..15 within a half block
 *   v16-v23  the 128 bytes of the current half block, reduced pairwise
 *   v4-v7    compare masks, then the indices selected alongside the values
 *   r0       remaining loop iterations, r1 byte offset into x
 */
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||
{
|
||||
BLASLONG imax;
|
||||
|
||||
__asm__ volatile (
|
||||
/* seed the running maximum (v0) and its indices (v1 = {0, 1}) */
"vl %%v0,0(%3) \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
/* v2 = index step per half block, v3 = base index of the current half */
"vrepig %%v2,16 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
/* v24..v31 = constant in-block indices 0..15, two per register */
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"vleig %%v28,8,0 \n\t"
|
||||
"vleig %%v28,9,1 \n\t"
|
||||
"vleig %%v29,10,0 \n\t"
|
||||
"vleig %%v29,11,1 \n\t"
|
||||
"vleig %%v30,12,0 \n\t"
|
||||
"vleig %%v30,13,1 \n\t"
|
||||
"vleig %%v31,14,0 \n\t"
|
||||
"vleig %%v31,15,1 \n\t"
|
||||
/* r0 = n >> 5 loop iterations, r1 = byte offset into x (starts at 0) */
"srlg %%r0,%2,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
/* first half of the iteration: bytes 0..127 at the current offset */
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
"vl %%v18,32(%%r1,%3) \n\t"
|
||||
"vl %%v19,48(%%r1,%3) \n\t"
|
||||
"vl %%v20,64(%%r1,%3) \n\t"
|
||||
"vl %%v21,80(%%r1,%3) \n\t"
|
||||
"vl %%v22,96(%%r1,%3) \n\t"
|
||||
"vl %%v23,112(%%r1,%3) \n\t"
|
||||
|
||||
/* pairwise reduction 8 -> 4 registers; the mask (first >= second) picks
   both the value and the matching in-block index */
"vfchedb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v5,%%v18,%%v19 \n\t"
|
||||
"vfchedb %%v6,%%v20,%%v21 \n\t"
|
||||
"vfchedb %%v7,%%v22,%%v23 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
/* 4 -> 2 registers */
"vfchedb %%v20,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v21,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
/* 2 -> 1 register, then add the base index v3 to the in-block index */
"vfchedb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
/* merge into the running maximum (the running value is kept when it is
   >= the block winner) and advance the base index by 16 */
"vfchedb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
/* second half of the iteration: bytes 128..255, same reduction */
"vl %%v16,128(%%r1,%3) \n\t"
|
||||
"vl %%v17,144(%%r1,%3) \n\t"
|
||||
"vl %%v18,160(%%r1,%3) \n\t"
|
||||
"vl %%v19,176(%%r1,%3) \n\t"
|
||||
"vl %%v20,192(%%r1,%3) \n\t"
|
||||
"vl %%v21,208(%%r1,%3) \n\t"
|
||||
"vl %%v22,224(%%r1,%3) \n\t"
|
||||
"vl %%v23,240(%%r1,%3) \n\t"
|
||||
|
||||
"vfchedb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v5,%%v18,%%v19 \n\t"
|
||||
"vfchedb %%v6,%%v20,%%v21 \n\t"
|
||||
"vfchedb %%v7,%%v22,%%v23 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vfchedb %%v20,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v21,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"vfchedb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchedb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
/* advance by 256 bytes and loop while the count in r0 is not exhausted */
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* final step: compare lane 1 of v0/v1 (copied to v2/v3) with lane 0 */
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
/* lanes compare equal: store the value and return the smaller index */
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
/* lanes differ: take lane 1 when it is greater, otherwise keep lane 0 */
"wfchdb %%v4,%%v2,%%v0 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(imax),"=m"(*max)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return imax;
|
||||
}
|
||||
|
||||
/*
 * Returns the 1-based position of the largest element of x, or 0 when
 * n <= 0 or inc_x <= 0.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between consecutive elements, in units of FLOAT
 *
 * For unit stride the leading n & -32 elements are handed to
 * idmax_kernel_32 and the remainder is scanned in plain C. Any other
 * positive stride is scanned with a 4-way unrolled scalar loop. The scalar
 * comparisons are strict (>), so an earlier element is kept on ties.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;       /* 0-based index of the current maximum */
    FLOAT best_val = 0.0;    /* value of the current maximum */

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        const BLASLONG vec_len = n & -32;
        BLASLONG pos;

        if (vec_len > 0) {
            /* the kernel fills in both the index and the value */
            best = idmax_kernel_32(vec_len, x, &best_val);
            pos = vec_len;
        } else {
            best_val = x[0];
            pos = 1;
        }

        /* scalar scan of whatever the kernel did not cover */
        for (; pos < n; pos++) {
            if (x[pos] > best_val) {
                best = pos;
                best_val = x[pos];
            }
        }

        return best + 1;
    }

    /* strided access: k is the logical index, off the offset into x */
    {
        const BLASLONG unrolled = n & -4;
        BLASLONG off = 0;
        BLASLONG k = 0;

        best_val = x[0];

        for (; k < unrolled; k += 4, off += inc_x * 4) {
            if (x[off] > best_val) {
                best = k;
                best_val = x[off];
            }
            if (x[off + inc_x] > best_val) {
                best = k + 1;
                best_val = x[off + inc_x];
            }
            if (x[off + 2 * inc_x] > best_val) {
                best = k + 2;
                best_val = x[off + 2 * inc_x];
            }
            if (x[off + 3 * inc_x] > best_val) {
                best = k + 3;
                best_val = x[off + 3 * inc_x];
            }
        }

        /* up to three leftover elements */
        for (; k < n; k++, off += inc_x) {
            if (x[off] > best_val) {
                best = k;
                best_val = x[off];
            }
        }

        return best + 1;
    }
}
|
||||
|
|
@ -0,0 +1,240 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * idmin_kernel_32 - vectorised search for the smallest element of x (signed
 * comparison; no absolute value is taken) written in s390x vector assembly.
 *
 * n    element count. The loop counter is n >> 5 and every iteration consumes
 *      256 bytes of x, so n is expected to be a positive multiple of 32; the
 *      caller in this file passes n & -32 after checking that it is > 0.
 * x    input vector, read with unit stride.
 * min  (out) receives the smallest value found.
 *
 * Returns the index taken from the index register that travels with the
 * winning value. The caller adds 1 to it, so it is presumably 0-based.
 *
 * Register roles, as set up by the code below:
 *   v0       running per-lane minimum (seeded with the first 16 bytes of x)
 *   v1       indices that belong to the values in v0 (seeded with 0, 1)
 *   v2       constant 16 in both lanes: elements consumed per half iteration
 *   v3       base index of the half block being processed (starts at 0)
 *   v24-v31  constant lane indices 0..15 within a half block
 *   v16-v23  the 128 bytes of the current half block, reduced pairwise
 *   v4-v7    compare masks, then the indices selected alongside the values
 *   r0       remaining loop iterations, r1 byte offset into x
 */
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||
{
|
||||
BLASLONG imin;
|
||||
|
||||
__asm__ volatile (
|
||||
/* seed the running minimum (v0) and its indices (v1 = {0, 1}) */
"vl %%v0,0(%3) \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
/* v2 = index step per half block, v3 = base index of the current half */
"vrepig %%v2,16 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
/* v24..v31 = constant in-block indices 0..15, two per register */
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"vleig %%v28,8,0 \n\t"
|
||||
"vleig %%v28,9,1 \n\t"
|
||||
"vleig %%v29,10,0 \n\t"
|
||||
"vleig %%v29,11,1 \n\t"
|
||||
"vleig %%v30,12,0 \n\t"
|
||||
"vleig %%v30,13,1 \n\t"
|
||||
"vleig %%v31,14,0 \n\t"
|
||||
"vleig %%v31,15,1 \n\t"
|
||||
/* r0 = n >> 5 loop iterations, r1 = byte offset into x (starts at 0) */
"srlg %%r0,%2,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
/* first half of the iteration: bytes 0..127 at the current offset */
"vl %%v16,0(%%r1,%3) \n\t"
|
||||
"vl %%v17,16(%%r1,%3) \n\t"
|
||||
"vl %%v18,32(%%r1,%3) \n\t"
|
||||
"vl %%v19,48(%%r1,%3) \n\t"
|
||||
"vl %%v20,64(%%r1,%3) \n\t"
|
||||
"vl %%v21,80(%%r1,%3) \n\t"
|
||||
"vl %%v22,96(%%r1,%3) \n\t"
|
||||
"vl %%v23,112(%%r1,%3) \n\t"
|
||||
|
||||
/* pairwise reduction 8 -> 4 registers; the mask (second >= first) picks
   both the smaller value and the matching in-block index */
"vfchedb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v5,%%v19,%%v18 \n\t"
|
||||
"vfchedb %%v6,%%v21,%%v20 \n\t"
|
||||
"vfchedb %%v7,%%v23,%%v22 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
/* 4 -> 2 registers */
"vfchedb %%v20,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v21,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
/* 2 -> 1 register, then add the base index v3 to the in-block index */
"vfchedb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
/* merge into the running minimum (the running value is kept when the
   block winner is >= it) and advance the base index by 16 */
"vfchedb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
/* second half of the iteration: bytes 128..255, same reduction */
"vl %%v16,128(%%r1,%3) \n\t"
|
||||
"vl %%v17,144(%%r1,%3) \n\t"
|
||||
"vl %%v18,160(%%r1,%3) \n\t"
|
||||
"vl %%v19,176(%%r1,%3) \n\t"
|
||||
"vl %%v20,192(%%r1,%3) \n\t"
|
||||
"vl %%v21,208(%%r1,%3) \n\t"
|
||||
"vl %%v22,224(%%r1,%3) \n\t"
|
||||
"vl %%v23,240(%%r1,%3) \n\t"
|
||||
|
||||
"vfchedb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v5,%%v19,%%v18 \n\t"
|
||||
"vfchedb %%v6,%%v21,%%v20 \n\t"
|
||||
"vfchedb %%v7,%%v23,%%v22 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
|
||||
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
|
||||
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
|
||||
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
|
||||
|
||||
"vfchedb %%v20,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v21,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
|
||||
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
|
||||
|
||||
"vfchedb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchedb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
/* advance by 256 bytes and loop while the count in r0 is not exhausted */
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* final step: compare lane 1 of v0/v1 (copied to v2/v3) with lane 0 */
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
/* lanes compare equal: store the value and return the smaller index */
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
/* lanes differ: take lane 1 when lane 0 is greater, otherwise keep lane 0 */
"wfchdb %%v4,%%v0,%%v2 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(imin),"=m"(*min)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return imin;
|
||||
}
|
||||
|
||||
/*
 * Returns the 1-based position of the smallest element of x, or 0 when
 * n <= 0 or inc_x <= 0.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between consecutive elements, in units of FLOAT
 *
 * For unit stride the leading n & -32 elements are handed to
 * idmin_kernel_32 and the remainder is scanned in plain C. Any other
 * positive stride is scanned with a 4-way unrolled scalar loop. The scalar
 * comparisons are strict (<), so an earlier element is kept on ties.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG lowest = 0;      /* 0-based index of the current minimum */
    FLOAT lowest_val = 0.0;   /* value of the current minimum */

    if (n <= 0 || inc_x <= 0)
        return lowest;

    if (inc_x == 1) {
        const BLASLONG vec_len = n & -32;
        BLASLONG pos;

        if (vec_len > 0) {
            /* the kernel fills in both the index and the value */
            lowest = idmin_kernel_32(vec_len, x, &lowest_val);
            pos = vec_len;
        } else {
            lowest_val = x[0];
            pos = 1;
        }

        /* scalar scan of whatever the kernel did not cover */
        for (; pos < n; pos++) {
            if (x[pos] < lowest_val) {
                lowest = pos;
                lowest_val = x[pos];
            }
        }

        return lowest + 1;
    }

    /* strided access: k is the logical index, off the offset into x */
    {
        const BLASLONG unrolled = n & -4;
        BLASLONG off = 0;
        BLASLONG k = 0;

        lowest_val = x[0];

        for (; k < unrolled; k += 4, off += inc_x * 4) {
            if (x[off] < lowest_val) {
                lowest = k;
                lowest_val = x[off];
            }
            if (x[off + inc_x] < lowest_val) {
                lowest = k + 1;
                lowest_val = x[off + inc_x];
            }
            if (x[off + 2 * inc_x] < lowest_val) {
                lowest = k + 2;
                lowest_val = x[off + 2 * inc_x];
            }
            if (x[off + 3 * inc_x] < lowest_val) {
                lowest = k + 3;
                lowest_val = x[off + 3 * inc_x];
            }
        }

        /* up to three leftover elements */
        for (; k < n; k++, off += inc_x) {
            if (x[off] < lowest_val) {
                lowest = k;
                lowest_val = x[off];
            }
        }

        return lowest + 1;
    }
}
|
||||
|
|
@ -0,0 +1,309 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
/*
 * Vector kernel: locate the element of x[0..n-1] with the largest absolute
 * value.
 *
 * n     element count; the loop runs n >> 6 times and consumes 256 bytes per
 *       iteration, so n is expected to be a positive multiple of 64 (the
 *       caller in this file passes n & -64 and only when that is > 0).
 * x     input vector, read with unit stride.
 * amax  receives the largest absolute value found.
 *
 * Returns the index taken from the lane counters, which start at 0; CNAME
 * adds 1 to the result.  On equal values the lower index is kept.
 */
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* v0 = |x[0..3]|: running per-lane maxima */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        /* v1 = {0,2}, v2 = {1,3}: 64-bit indices of the running maxima held
           in word lanes 0/2 and 1/3 of v0 */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = index step (32) per group, v4 = index base of current group */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 = word constants 0..31: position of every lane inside a
           32-element group */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 loop count, r1 = byte offset into x */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* first group of 32 elements: load and take absolute values */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        /* pairwise reduction 8 -> 4 vectors; the same mask selects the
           value and its position constant */
        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vfchesb %%v7,%%v20,%%v21 \n\t"
        "vfchesb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors */
        "vfchesb %%v20,%%v16,%%v17 \n\t"
        "vfchesb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /* 2 -> 1 vector, then widen the word positions to 64 bits
           (v5: lanes 0/2, v6: lanes 1/3) and add the group base */
        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* merge into the running maxima; the running value wins on ties */
        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* second group of 32 elements: same sequence at offsets 128..240 */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vfchesb %%v7,%%v20,%%v21 \n\t"
        "vfchesb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchesb %%v20,%%v16,%%v17 \n\t"
        "vfchesb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* fold lanes 1/3 into lanes 0/2: keep the even lane when it is
           strictly larger, or equal with the smaller index (v3 is reused
           as scratch from here on) */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* final step: compare the two remaining candidates */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* equal values: store it and return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* different values: take the larger one together with its index */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamax),"=m"(*amax)
        /* NOTE(review): operand 3 is used directly as a base register in the
           address expressions above - confirm that is what this constraint
           guarantees. */
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        /* v3 is written by the asm (index step, later scratch), so it has to
           be declared as clobbered along with the other vector registers */
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamax;
}
|
||||
|
||||
/*
 * Returns the 1-based position of the element of x with the largest
 * absolute value, visiting n elements spaced inc_x apart.
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;
    FLOAT best_abs = 0.0;

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        /* Contiguous data: the vector kernel handles the leading multiple
           of 64 elements, the scalar loop finishes the remainder. */
        const BLASLONG vec_n = n & -64;
        BLASLONG pos;

        if (vec_n > 0) {
            best = isamax_kernel_64(vec_n, x, &best_abs);
            pos = vec_n;
        } else {
            best_abs = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            const FLOAT cand = ABS(x[pos]);

            /* strict comparison: an earlier element wins on ties */
            if (cand > best_abs) {
                best = pos;
                best_abs = cand;
            }
        }

        return (best + 1);
    }

    /* Strided data: scalar scan, four elements per main-loop iteration. */
    {
        const BLASLONG blocked_n = n & -4;
        BLASLONG idx = 0;   /* logical element number */
        BLASLONG off = 0;   /* offset of that element in x */
        BLASLONG k;

        best_abs = ABS(x[0]);

        for (; idx < blocked_n; idx += 4, off += inc_x * 4) {
            for (k = 0; k < 4; k++) {
                const FLOAT cand = ABS(x[off + k * inc_x]);

                if (cand > best_abs) {
                    best = idx + k;
                    best_abs = cand;
                }
            }
        }

        for (; idx < n; idx++, off += inc_x) {
            const FLOAT cand = ABS(x[off]);

            if (cand > best_abs) {
                best = idx;
                best_abs = cand;
            }
        }

        return (best + 1);
    }
}
|
||||
|
|
@ -0,0 +1,309 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* Absolute-value routine used by the scalar loops: fabs when DOUBLE is
   defined, fabsf otherwise. */
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
|
||||
|
||||
/*
 * Vector kernel: locate the element of x[0..n-1] with the smallest absolute
 * value.
 *
 * n     element count; the loop runs n >> 6 times and consumes 256 bytes per
 *       iteration, so n is expected to be a positive multiple of 64 (the
 *       caller in this file passes n & -64 and only when that is > 0).
 * x     input vector, read with unit stride.
 * amin  receives the smallest absolute value found.
 *
 * Returns the index taken from the lane counters, which start at 0; CNAME
 * adds 1 to the result.  On equal values the lower index is kept.
 */
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* v0 = |x[0..3]|: running per-lane minima */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        /* v1 = {0,2}, v2 = {1,3}: 64-bit indices of the running minima held
           in word lanes 0/2 and 1/3 of v0 */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = index step (32) per group, v4 = index base of current group */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 = word constants 0..31: position of every lane inside a
           32-element group */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 loop count, r1 = byte offset into x */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* first group of 32 elements: load and take absolute values */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        /* pairwise reduction 8 -> 4 vectors (operands swapped relative to a
           max search so the smaller value is selected); the same mask
           selects the value and its position constant */
        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vfchesb %%v7,%%v21,%%v20 \n\t"
        "vfchesb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors */
        "vfchesb %%v20,%%v17,%%v16 \n\t"
        "vfchesb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /* 2 -> 1 vector, then widen the word positions to 64 bits
           (v5: lanes 0/2, v6: lanes 1/3) and add the group base */
        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* merge into the running minima; the running value wins on ties */
        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* second group of 32 elements: same sequence at offsets 128..240 */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vfchesb %%v7,%%v21,%%v20 \n\t"
        "vfchesb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchesb %%v20,%%v17,%%v16 \n\t"
        "vfchesb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* fold lanes 1/3 into lanes 0/2: keep the even lane when it is
           strictly smaller, or equal with the smaller index (v3 is reused
           as scratch from here on) */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* final step: compare the two remaining candidates */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* equal values: store it and return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* different values: take the smaller one together with its index */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamin),"=m"(*amin)
        /* NOTE(review): operand 3 is used directly as a base register in the
           address expressions above - confirm that is what this constraint
           guarantees. */
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        /* v3 is written by the asm (index step, later scratch), so it has to
           be declared as clobbered along with the other vector registers */
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamin;
}
|
||||
|
||||
/*
 * Returns the 1-based position of the element of x with the smallest
 * absolute value, visiting n elements spaced inc_x apart.
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;
    FLOAT best_abs = 0.0;

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        /* Contiguous data: the vector kernel handles the leading multiple
           of 64 elements, the scalar loop finishes the remainder. */
        const BLASLONG vec_n = n & -64;
        BLASLONG pos;

        if (vec_n > 0) {
            best = isamin_kernel_64(vec_n, x, &best_abs);
            pos = vec_n;
        } else {
            best_abs = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            const FLOAT cand = ABS(x[pos]);

            /* strict comparison: an earlier element wins on ties */
            if (cand < best_abs) {
                best = pos;
                best_abs = cand;
            }
        }

        return (best + 1);
    }

    /* Strided data: scalar scan, four elements per main-loop iteration. */
    {
        const BLASLONG blocked_n = n & -4;
        BLASLONG idx = 0;   /* logical element number */
        BLASLONG off = 0;   /* offset of that element in x */
        BLASLONG k;

        best_abs = ABS(x[0]);

        for (; idx < blocked_n; idx += 4, off += inc_x * 4) {
            for (k = 0; k < 4; k++) {
                const FLOAT cand = ABS(x[off + k * inc_x]);

                if (cand < best_abs) {
                    best = idx + k;
                    best_abs = cand;
                }
            }
        }

        for (; idx < n; idx++, off += inc_x) {
            const FLOAT cand = ABS(x[off]);

            if (cand < best_abs) {
                best = idx;
                best_abs = cand;
            }
        }

        return (best + 1);
    }
}
|
||||
|
|
@ -0,0 +1,285 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel: locate the largest element of x[0..n-1] (signed values,
 * no absolute value is taken).
 *
 * n    element count; the loop runs n >> 6 times and consumes 256 bytes per
 *      iteration, so n is expected to be a positive multiple of 64 (the
 *      caller in this file passes n & -64 and only when that is > 0).
 * x    input vector, read with unit stride.
 * max  receives the largest value found.
 *
 * Returns the index taken from the lane counters, which start at 0; CNAME
 * adds 1 to the result.  On equal values the lower index is kept.
 */
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
{
    BLASLONG imax;

    __asm__ volatile (
        /* v0 = x[0..3]: running per-lane maxima */
        "vl %%v0,0(%3) \n\t"
        /* v1 = {0,2}, v2 = {1,3}: 64-bit indices of the running maxima held
           in word lanes 0/2 and 1/3 of v0 */
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        /* v3 = index step (32) per group, v4 = index base of current group */
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* v24..v31 = word constants 0..31: position of every lane inside a
           32-element group */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 loop count, r1 = byte offset into x */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* first group of 32 elements */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"

        /* pairwise reduction 8 -> 4 vectors; the same mask selects the
           value and its position constant */
        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vfchesb %%v7,%%v20,%%v21 \n\t"
        "vfchesb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors */
        "vfchesb %%v20,%%v16,%%v17 \n\t"
        "vfchesb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /* 2 -> 1 vector, then widen the word positions to 64 bits
           (v5: lanes 0/2, v6: lanes 1/3) and add the group base */
        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* merge into the running maxima; the running value wins on ties */
        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* second group of 32 elements: same sequence at offsets 128..240 */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"

        "vfchesb %%v5,%%v16,%%v17 \n\t"
        "vfchesb %%v6,%%v18,%%v19 \n\t"
        "vfchesb %%v7,%%v20,%%v21 \n\t"
        "vfchesb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchesb %%v20,%%v16,%%v17 \n\t"
        "vfchesb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchesb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchesb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* fold lanes 1/3 into lanes 0/2: keep the even lane when it is
           strictly larger, or equal with the smaller index (v3 is reused
           as scratch from here on) */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* final step: compare the two remaining candidates */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* equal values: store it and return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* different values: take the larger one together with its index */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imax),"=m"(*max)
        /* NOTE(review): operand 3 is used directly as a base register in the
           address expressions above - confirm that is what this constraint
           guarantees. */
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        /* v3 is written by the asm (index step, later scratch), so it has to
           be declared as clobbered along with the other vector registers */
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imax;
}
|
||||
|
||||
/*
 * Returns the 1-based position of the largest element of x (signed
 * comparison), visiting n elements spaced inc_x apart.
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;
    FLOAT best_val = 0.0;

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        /* Contiguous data: the vector kernel handles the leading multiple
           of 64 elements, the scalar loop finishes the remainder. */
        const BLASLONG vec_n = n & -64;
        BLASLONG pos;

        if (vec_n > 0) {
            best = ismax_kernel_64(vec_n, x, &best_val);
            pos = vec_n;
        } else {
            best_val = x[0];
            pos = 1;
        }

        for (; pos < n; pos++) {
            /* strict comparison: an earlier element wins on ties */
            if (x[pos] > best_val) {
                best = pos;
                best_val = x[pos];
            }
        }

        return (best + 1);
    }

    /* Strided data: scalar scan, four elements per main-loop iteration. */
    {
        const BLASLONG blocked_n = n & -4;
        BLASLONG idx = 0;   /* logical element number */
        BLASLONG off = 0;   /* offset of that element in x */
        BLASLONG k;

        best_val = x[0];

        for (; idx < blocked_n; idx += 4, off += inc_x * 4) {
            for (k = 0; k < 4; k++) {
                const FLOAT cand = x[off + k * inc_x];

                if (cand > best_val) {
                    best = idx + k;
                    best_val = cand;
                }
            }
        }

        for (; idx < n; idx++, off += inc_x) {
            if (x[off] > best_val) {
                best = idx;
                best_val = x[off];
            }
        }

        return (best + 1);
    }
}
|
||||
|
|
@ -0,0 +1,285 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * ismin_kernel_64 - vectorised search for the smallest single-precision
 * element of a contiguous vector (s390x vector facility, inline asm).
 *
 * n    element count; the loop runs (n >> 6) times and each pass consumes
 *      64 floats (256 bytes).  The loop counter is decremented before it is
 *      tested, so the caller must pass a positive multiple of 64.
 * x    pointer to the first element (unit stride).
 * min  (out) receives the smallest value found.
 *
 * Returns the zero-based index of the selected element.
 *
 * Register roles inside the asm:
 *   v0        running per-lane minima (4 floats), seeded with x[0..3]
 *   v1 / v2   64-bit running indices for float lanes {0,2} / {1,3}
 *   v3        index step (32) added after every 32-element half pass
 *   v4        running base index of the half pass being processed
 *   v24..v31  constant fullword positions 0..31 inside a half pass
 *   r0 / r1   remaining loop count / byte offset into x
 *
 * v3 is written by the asm (vrepig, veslg, vrepg) and therefore has to be
 * declared in the clobber list together with the other scratch registers;
 * otherwise the compiler is free to keep a live value in v3/f3 across the
 * statement.
 */
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
{
    BLASLONG imin;

    __asm__ volatile (
        /* seed the running minima and their lane indices */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        /* position constants 0..31, four fullwords per register */
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = number of 64-element passes, r1 = byte offset 0 */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* first half pass: elements 0..31 of this 64-element chunk */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"

        /* pairwise reduction 8 -> 4 vectors; the mask keeps the first
           operand of each vsel where (later >= earlier), and the same
           mask picks the matching position constants */
        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vfchesb %%v7,%%v21,%%v20 \n\t"
        "vfchesb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors */
        "vfchesb %%v20,%%v17,%%v16 \n\t"
        "vfchesb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /* 2 -> 1 vector, then widen the 32-bit positions to 64 bits
           (v5: lanes 0/2, v6: lanes 1/3) and add the base index */
        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* merge with the running minima; the compare mask is widened
           to 64 bits per lane pair to update the index registers */
        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* second half pass: elements 32..63 of this chunk */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"

        "vfchesb %%v5,%%v17,%%v16 \n\t"
        "vfchesb %%v6,%%v19,%%v18 \n\t"
        "vfchesb %%v7,%%v21,%%v20 \n\t"
        "vfchesb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchesb %%v20,%%v17,%%v16 \n\t"
        "vfchesb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchesb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchesb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v0,%%v16,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v1,%%v5,%%v7 \n\t"
        "vsel %%v2,%%v2,%%v6,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* advance 256 bytes and loop while r0 is non-zero */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* cross-lane reduction, step 1: fold lanes {1,3} onto {0,2}.
           v3 is reused as scratch here.  On equal values the index
           comparison (vchlg) decides which candidate is kept. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* step 2: compare the two remaining candidates (lane 0 vs 2) */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* equal values: store the value, return the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* different values: take lane 2 when lane 0 is greater */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "ste %%f0,%1 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imin),"=m"(*min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imin;
}
|
||||
|
||||
/*
 * Driver for the single-precision "index of minimum" routine.
 *
 * n      number of elements to examine
 * x      input vector
 * inc_x  stride between consecutive elements, in elements
 *
 * Returns the 1-based position of the selected minimum, or 0 when
 * n <= 0 or inc_x <= 0.  The scalar loops use a strict '<', so among
 * equal values they keep the earliest position they visited.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;   /* zero-based position of the best element so far */
    FLOAT best = 0.0;   /* value of the best element so far */
    BLASLONG k = 0;     /* logical element counter */
    BLASLONG off = 0;   /* offset into x for the strided path */

    if (!(n > 0 && inc_x > 0)) return pos;

    if (inc_x != 1) {
        /* Strided path: plain C, processed four elements at a time. */
        BLASLONG unrolled = n & -4;

        pos = 0;
        best = x[0];

        for (; k < unrolled; k += 4, off += inc_x * 4) {
            BLASLONG u;
            for (u = 0; u < 4; u++) {
                FLOAT v = x[off + u * inc_x];
                if (v < best) {
                    pos = k + u;
                    best = v;
                }
            }
        }

        /* Leftover (n % 4) elements. */
        for (; k < n; k++, off += inc_x) {
            if (x[off] < best) {
                pos = k;
                best = x[off];
            }
        }

        return pos + 1;
    }

    /* Contiguous path: hand the leading multiple of 64 elements to the
       vector kernel, then finish the remainder with a scalar scan. */
    BLASLONG vec_n = n & -64;

    if (vec_n > 0) {
        pos = ismin_kernel_64(vec_n, x, &best);
        k = vec_n;
    } else {
        best = x[0];
        k = 1;
    }

    for (; k < n; k++) {
        if (x[k] < best) {
            pos = k;
            best = x[k];
        }
    }

    return pos + 1;
}
|
||||
|
|
@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define ABS fabs
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||
{
|
||||
BLASLONG iamax;
|
||||
|
||||
__asm__ volatile (
|
||||
"vleg %%v0,0(%3),0 \n\t"
|
||||
"vleg %%v1,8(%3),0 \n\t"
|
||||
"vleg %%v0,16(%3),1 \n\t"
|
||||
"vleg %%v1,24(%3),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v1,%%v1 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
"vrepig %%v2,8 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"srlg %%r0,%2,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 16 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param maxf (out) maximum absolute value (output only)
|
||||
* @return index
|
||||
*/
|
||||
static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"vleig %%v16,0,0 \n\t"
|
||||
"vleig %%v16,1,1 \n\t"
|
||||
"vleig %%v17,2,0 \n\t"
|
||||
"vleig %%v17,3,1 \n\t"
|
||||
"vleig %%v18,4,0 \n\t"
|
||||
"vleig %%v18,5,1 \n\t"
|
||||
"vleig %%v19,6,0 \n\t"
|
||||
"vleig %%v19,7,1 \n\t"
|
||||
"vleig %%v20,8,0 \n\t"
|
||||
"vleig %%v20,9,1 \n\t"
|
||||
"vleig %%v21,10,0 \n\t"
|
||||
"vleig %%v21,11,1 \n\t"
|
||||
"vleig %%v22,12,0 \n\t"
|
||||
"vleig %%v22,13,1 \n\t"
|
||||
"vleig %%v23,14,0 \n\t"
|
||||
"vleig %%v23,15,1 \n\t"
|
||||
|
||||
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v6 \n\t"
|
||||
"vzero %%v7 \n\t"
|
||||
"vrepig %%v4,16 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vleg %%v16,0(%%r1,%3),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%3),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%3),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%3),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%3),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%3),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%3),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%3),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%3),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%3),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%3),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%3),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%3),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%3),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%3),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%3),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v1,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v2,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v3,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vleg %%v24 , 128(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 136(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 144(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 152(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 160(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 168(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 176(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 184(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 192(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 200(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 208(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 216(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 224(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 , 232(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 , 240(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 , 248(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v1,%%v0 \n\t"
|
||||
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
|
||||
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v3,%%v2 \n\t "
|
||||
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
|
||||
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v26,%%v24 \n\t"
|
||||
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
|
||||
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v30,%%v28 \n\t"
|
||||
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
|
||||
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v24, %%v1,%%v31 \n\t"
|
||||
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
|
||||
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
|
||||
|
||||
"vfchdb %%v30, %%v27,%%v3 \n\t"
|
||||
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
|
||||
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
|
||||
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vfchdb %%v0, %%v31,%%v28 \n\t"
|
||||
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
|
||||
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
|
||||
|
||||
"vag %%v25,%%v25,%%v5 \n\t"
|
||||
|
||||
//cmp with previous
|
||||
"vfchdb %%v30, %%v27,%%v6 \n\t"
|
||||
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
|
||||
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vfchedb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v5,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
|
||||
//xtract index
|
||||
"vrepg %%v26,%%v6,1 \n\t"
|
||||
"vrepg %%v5,%%v7,1 \n\t"
|
||||
"wfcdb %%v26,%%v6 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v6,%[maxf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v7 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"j 3 \n\t"
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v26,%%v6 \n\t"
|
||||
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"std %%f0,%[maxf] \n\t"
|
||||
"3: \n\t"
|
||||
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"vfchedb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
);
|
||||
return index;
|
||||
"vfchedb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"vleg %%v16,128(%%r1,%3),0 \n\t"
|
||||
"vleg %%v17,136(%%r1,%3),0 \n\t"
|
||||
"vleg %%v16,144(%%r1,%3),1 \n\t"
|
||||
"vleg %%v17,152(%%r1,%3),1 \n\t"
|
||||
"vleg %%v18,160(%%r1,%3),0 \n\t"
|
||||
"vleg %%v19,168(%%r1,%3),0 \n\t"
|
||||
"vleg %%v18,176(%%r1,%3),1 \n\t"
|
||||
"vleg %%v19,184(%%r1,%3),1 \n\t"
|
||||
"vleg %%v20,192(%%r1,%3),0 \n\t"
|
||||
"vleg %%v21,200(%%r1,%3),0 \n\t"
|
||||
"vleg %%v20,208(%%r1,%3),1 \n\t"
|
||||
"vleg %%v21,216(%%r1,%3),1 \n\t"
|
||||
"vleg %%v22,224(%%r1,%3),0 \n\t"
|
||||
"vleg %%v23,232(%%r1,%3),0 \n\t"
|
||||
"vleg %%v22,240(%%r1,%3),1 \n\t"
|
||||
"vleg %%v23,248(%%r1,%3),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vfchedb %%v4,%%v16,%%v17 \n\t"
|
||||
"vfchedb %%v5,%%v18,%%v19 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
|
||||
"vfchedb %%v18,%%v16,%%v17 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchedb %%v5,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
"wfchdb %%v4,%%v2,%%v0 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(iamax),"=m"(*amax)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
return iamax;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
|
|
@ -223,10 +198,16 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
max = ziamax_kernel_16_TUNED(n1, x, &maxf);
|
||||
max = izamax_kernel_16(n1, x, &maxf);
|
||||
ix = n1 * 2;
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
maxf = CABS1(x,0);
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
@ -242,9 +223,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
|
||||
} else {
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
max = 0;
|
||||
maxf = CABS1(x,0);
|
||||
inc_x2 = 2 * inc_x;
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
|
|
@ -260,7 +241,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
}
|
||||
return (max + 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -24,253 +24,223 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define ABS fabs
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
* Warning: requirements n>0 and n % 16 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (out) minimum absolute value (output only)
|
||||
* @return minimum index
|
||||
*/
|
||||
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
BLASLONG index ;
|
||||
__asm__(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"vleig %%v16,0,0 \n\t"
|
||||
"vleig %%v16,1,1 \n\t"
|
||||
"vleig %%v17,2,0 \n\t"
|
||||
"vleig %%v17,3,1 \n\t"
|
||||
"vleig %%v18,4,0 \n\t"
|
||||
"vleig %%v18,5,1 \n\t"
|
||||
"vleig %%v19,6,0 \n\t"
|
||||
"vleig %%v19,7,1 \n\t"
|
||||
"vleig %%v20,8,0 \n\t"
|
||||
"vleig %%v20,9,1 \n\t"
|
||||
"vleig %%v21,10,0 \n\t"
|
||||
"vleig %%v21,11,1 \n\t"
|
||||
"vleig %%v22,12,0 \n\t"
|
||||
"vleig %%v22,13,1 \n\t"
|
||||
"vleig %%v23,14,0 \n\t"
|
||||
"vleig %%v23,15,1 \n\t"
|
||||
"ld %%f6,0(%[ptr_x]) \n\t"
|
||||
"lpdbr %%f6,%%f6 \n\t"
|
||||
"ld %%f7,8(%[ptr_x]) \n\t"
|
||||
"lpdbr %%f7,%%f7 \n\t"
|
||||
"adbr %%f6,%%f7 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vrepg %%v6,%%v6,0 \n\t"
|
||||
"vzero %%v7 \n\t"
|
||||
"vrepig %%v4,16 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||
{
|
||||
BLASLONG iamin;
|
||||
|
||||
__asm__ volatile (
|
||||
"vleg %%v0,0(%3),0 \n\t"
|
||||
"vleg %%v1,8(%3),0 \n\t"
|
||||
"vleg %%v0,16(%3),1 \n\t"
|
||||
"vleg %%v1,24(%3),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v1,%%v1 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
||||
"vleig %%v1,0,0 \n\t"
|
||||
"vleig %%v1,1,1 \n\t"
|
||||
"vrepig %%v2,8 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"vleig %%v24,0,0 \n\t"
|
||||
"vleig %%v24,1,1 \n\t"
|
||||
"vleig %%v25,2,0 \n\t"
|
||||
"vleig %%v25,3,1 \n\t"
|
||||
"vleig %%v26,4,0 \n\t"
|
||||
"vleig %%v26,5,1 \n\t"
|
||||
"vleig %%v27,6,0 \n\t"
|
||||
"vleig %%v27,7,1 \n\t"
|
||||
"srlg %%r0,%2,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
||||
|
||||
"vleg %%v16,0(%%r1,%3),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%3),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%3),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%3),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%3),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%3),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%3),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%3),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%3),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%3),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%3),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%3),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%3),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%3),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%3),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%3),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v1,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v2,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v3,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vleg %%v24 ,128(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 ,136(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 ,144(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 ,152(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 ,160(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 ,168(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 ,176(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 ,184(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 ,192(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 ,200(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 ,208(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 ,216(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 ,224(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,232(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,240(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,248(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vfchdb %%v25,%%v0 ,%%v1 \n\t"
|
||||
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
|
||||
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v2,%%v3 \n\t"
|
||||
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
|
||||
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v24,%%v26 \n\t"
|
||||
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
|
||||
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v28,%%v30 \n\t"
|
||||
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
|
||||
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v24,%%v31, %%v1 \n\t"
|
||||
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
|
||||
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
|
||||
|
||||
"vfchdb %%v30,%%v3, %%v27 \n\t"
|
||||
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
|
||||
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
|
||||
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vfchdb %%v0,%%v28, %%v31 \n\t"
|
||||
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
|
||||
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
|
||||
|
||||
"vag %%v25,%%v25,%%v5 \n\t"
|
||||
|
||||
//cmp with previous
|
||||
"vfchdb %%v30,%%v6 , %%v27 \n\t"
|
||||
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
|
||||
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vfchedb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v5,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
|
||||
//xtract index
|
||||
"vrepg %%v26,%%v6,1 \n\t"
|
||||
"vrepg %%v5,%%v7,1 \n\t"
|
||||
"wfcdb %%v26,%%v6 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v6,%[minf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v7 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"j 3f \n\t"
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
|
||||
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"std %%f0,%[minf] \n\t"
|
||||
"3: \n\t"
|
||||
"vfchedb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"vfchedb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
);
|
||||
"vleg %%v16,128(%%r1,%3),0 \n\t"
|
||||
"vleg %%v17,136(%%r1,%3),0 \n\t"
|
||||
"vleg %%v16,144(%%r1,%3),1 \n\t"
|
||||
"vleg %%v17,152(%%r1,%3),1 \n\t"
|
||||
"vleg %%v18,160(%%r1,%3),0 \n\t"
|
||||
"vleg %%v19,168(%%r1,%3),0 \n\t"
|
||||
"vleg %%v18,176(%%r1,%3),1 \n\t"
|
||||
"vleg %%v19,184(%%r1,%3),1 \n\t"
|
||||
"vleg %%v20,192(%%r1,%3),0 \n\t"
|
||||
"vleg %%v21,200(%%r1,%3),0 \n\t"
|
||||
"vleg %%v20,208(%%r1,%3),1 \n\t"
|
||||
"vleg %%v21,216(%%r1,%3),1 \n\t"
|
||||
"vleg %%v22,224(%%r1,%3),0 \n\t"
|
||||
"vleg %%v23,232(%%r1,%3),0 \n\t"
|
||||
"vleg %%v22,240(%%r1,%3),1 \n\t"
|
||||
"vleg %%v23,248(%%r1,%3),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vfchedb %%v4,%%v17,%%v16 \n\t"
|
||||
"vfchedb %%v5,%%v19,%%v18 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
|
||||
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
|
||||
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
|
||||
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
|
||||
|
||||
return index;
|
||||
"vfchedb %%v18,%%v17,%%v16 \n\t"
|
||||
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
|
||||
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
|
||||
"vag %%v4,%%v4,%%v3 \n\t"
|
||||
|
||||
"vfchedb %%v5,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
|
||||
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
|
||||
"vag %%v3,%%v3,%%v2 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"vrepg %%v3,%%v1,1 \n\t"
|
||||
"wfcdb %%v2,%%v0 \n\t"
|
||||
"jne 1f \n\t"
|
||||
"vsteg %%v0,%1,0 \n\t"
|
||||
"vmnlg %%v0,%%v1,%%v3 \n\t"
|
||||
"vlgvg %0,%%v0,0 \n\t"
|
||||
"j 2f \n\t"
|
||||
"1: \n\t"
|
||||
"wfchdb %%v4,%%v0,%%v2 \n\t"
|
||||
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
|
||||
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"2: \n\t"
|
||||
"nop "
|
||||
:"=r"(iamin),"=m"(*amin)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
return iamin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT minf;
|
||||
BLASLONG min=0;
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0;
|
||||
FLOAT minf = 0;
|
||||
BLASLONG min = 0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(min);
|
||||
|
||||
|
||||
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = ziamin_kernel_16_TUNED(n1, x, &minf);
|
||||
min = izamin_kernel_16(n1, x, &minf);
|
||||
ix = n1 * 2;
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
else {
|
||||
//assign minf
|
||||
minf = CABS1(x,0);
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
}
|
||||
else
|
||||
{
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
minf = CABS1(x,0);
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += 2;
|
||||
i++;
|
||||
}
|
||||
return (min + 1);
|
||||
|
||||
} else {
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
min = 0;
|
||||
minf = CABS1(x,0);
|
||||
inc_x2 = 2 * inc_x;
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
minf = CABS1(x,0);
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return (min + 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,169 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* Absolute-value routine matching the precision selected by DOUBLE. */
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
|
||||
|
||||
/*
 * Vector kernel for the unit-stride case: returns the largest absolute
 * value among the first n single-precision elements of x.
 *
 * n must be a positive multiple of 64.  The loop count is n >> 6 and BRCTG
 * decrements before testing, so a count of zero would not stop the loop.
 * The caller only invokes the kernel with n & -64 when that value is > 0.
 *
 * Each iteration loads 16 vector registers (256 bytes = 64 floats), reduces
 * them pairwise with VFMAXSB and folds the result into the running maximum
 * held in v0.  The function code 8 on the max instructions is expected to
 * select the "maximum of magnitudes" comparison, which is why the sign is
 * only stripped once at the very end (LPER) - confirm against the
 * z/Architecture Principles of Operation.
 */
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* seed the running maximum with the first four elements */
        "vl      %%v0,0(%2)                \n\t"
        "srlg    %%r0,%1,6                 \n\t"   /* r0 = n / 64 iterations */
        "xgr     %%r1,%%r1                 \n\t"   /* r1 = byte offset into x */
        "0:                                \n\t"
        "pfd     1, 1024(%%r1,%2)          \n\t"

        "vl      %%v16,0(%%r1,%2)          \n\t"
        "vl      %%v17,16(%%r1,%2)         \n\t"
        "vl      %%v18,32(%%r1,%2)         \n\t"
        "vl      %%v19,48(%%r1,%2)         \n\t"
        "vl      %%v20,64(%%r1,%2)         \n\t"
        "vl      %%v21,80(%%r1,%2)         \n\t"
        "vl      %%v22,96(%%r1,%2)         \n\t"
        "vl      %%v23,112(%%r1,%2)        \n\t"
        "vl      %%v24,128(%%r1,%2)        \n\t"
        "vl      %%v25,144(%%r1,%2)        \n\t"
        "vl      %%v26,160(%%r1,%2)        \n\t"
        "vl      %%v27,176(%%r1,%2)        \n\t"
        "vl      %%v28,192(%%r1,%2)        \n\t"
        "vl      %%v29,208(%%r1,%2)        \n\t"
        "vl      %%v30,224(%%r1,%2)        \n\t"
        "vl      %%v31,240(%%r1,%2)        \n\t"

        /* 16 -> 8 */
        "vfmaxsb %%v16,%%v16,%%v24,8       \n\t"
        "vfmaxsb %%v17,%%v17,%%v25,8       \n\t"
        "vfmaxsb %%v18,%%v18,%%v26,8       \n\t"
        "vfmaxsb %%v19,%%v19,%%v27,8       \n\t"
        "vfmaxsb %%v20,%%v20,%%v28,8       \n\t"
        "vfmaxsb %%v21,%%v21,%%v29,8       \n\t"
        "vfmaxsb %%v22,%%v22,%%v30,8       \n\t"
        "vfmaxsb %%v23,%%v23,%%v31,8       \n\t"

        /* 8 -> 4 */
        "vfmaxsb %%v16,%%v16,%%v20,8       \n\t"
        "vfmaxsb %%v17,%%v17,%%v21,8       \n\t"
        "vfmaxsb %%v18,%%v18,%%v22,8       \n\t"
        "vfmaxsb %%v19,%%v19,%%v23,8       \n\t"

        /* 4 -> 2 */
        "vfmaxsb %%v16,%%v16,%%v18,8       \n\t"
        "vfmaxsb %%v17,%%v17,%%v19,8       \n\t"

        /* 2 -> 1 */
        "vfmaxsb %%v16,%%v16,%%v17,8       \n\t"

        /* fold this block into the running maximum (operand was "%%16") */
        "vfmaxsb %%v0,%%v0,%%v16,8         \n\t"

        "agfi    %%r1, 256                 \n\t"
        "brctg   %%r0, 0b                  \n\t"

        /* horizontal reduction of the four lanes of v0 */
        "veslg   %%v16,%%v0,32             \n\t"   /* move odd lanes onto even lanes */
        "vfmaxsb %%v0,%%v0,%%v16,8         \n\t"

        "vrepf   %%v16,%%v0,2              \n\t"   /* broadcast lane 2 */
        "wfmaxsb %%v0,%%v0,%%v16,8         \n\t"
        "lper    %0,%%f0                   "       /* result = |winner| */
        :"=f"(amax)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amax;
}
|
||||
|
||||
/*
 * Largest absolute value among the n elements of x read with stride inc_x.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading multiple of 64 elements goes through
 * samax_kernel_64 and the rest is scanned in scalar code.  Any other
 * stride is handled entirely in scalar code, four elements per pass.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;       /* index into x */
    BLASLONG seen = 0;      /* elements visited on the strided path */
    BLASLONG lane;
    BLASLONG blocked;
    FLOAT best = 0.0;
    FLOAT mag;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x != 1) {
        best = ABS(x[0]);

        /* groups of four strided elements */
        blocked = n & -4;
        for (; seen < blocked; seen += 4, pos += inc_x * 4) {
            for (lane = 0; lane < 4; lane++) {
                mag = ABS(x[pos + lane * inc_x]);
                if (mag > best) {
                    best = mag;
                }
            }
        }

        /* up to three leftovers */
        for (; seen < n; seen++, pos += inc_x) {
            mag = ABS(x[pos]);
            if (mag > best) {
                best = mag;
            }
        }
        return best;
    }

    blocked = n & -64;
    if (blocked > 0) {
        best = samax_kernel_64(blocked, x);
        pos = blocked;
    } else {
        /* fewer than 64 elements: seed the scan with the first one */
        best = ABS(x[0]);
        pos = 1;
    }

    for (; pos < n; pos++) {
        mag = ABS(x[pos]);
        if (mag > best) {
            best = mag;
        }
    }
    return best;
}
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* Absolute-value routine matching the precision selected by DOUBLE. */
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
|
||||
|
||||
/*
 * Vector kernel for the unit-stride case: returns the smallest absolute
 * value among the first n single-precision elements of x.
 *
 * n must be a positive multiple of 64.  The loop count is n >> 6 and BRCTG
 * decrements before testing, so a count of zero would not stop the loop.
 * The caller only invokes the kernel with n & -64 when that value is > 0.
 *
 * Each iteration loads 16 vector registers (256 bytes = 64 floats), reduces
 * them pairwise with VFMINSB and folds the result into the running minimum
 * held in v0.  The function code 8 on the min instructions is expected to
 * select the "minimum of magnitudes" comparison, which is why the sign is
 * only stripped once at the very end (LPER) - confirm against the
 * z/Architecture Principles of Operation.
 */
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* seed the running minimum with the first four elements */
        "vl      %%v0,0(%2)                \n\t"
        "srlg    %%r0,%1,6                 \n\t"   /* r0 = n / 64 iterations */
        "xgr     %%r1,%%r1                 \n\t"   /* r1 = byte offset into x */
        "0:                                \n\t"
        "pfd     1, 1024(%%r1,%2)          \n\t"

        "vl      %%v16,0(%%r1,%2)          \n\t"
        "vl      %%v17,16(%%r1,%2)         \n\t"
        "vl      %%v18,32(%%r1,%2)         \n\t"
        "vl      %%v19,48(%%r1,%2)         \n\t"
        "vl      %%v20,64(%%r1,%2)         \n\t"
        "vl      %%v21,80(%%r1,%2)         \n\t"
        "vl      %%v22,96(%%r1,%2)         \n\t"
        "vl      %%v23,112(%%r1,%2)        \n\t"
        "vl      %%v24,128(%%r1,%2)        \n\t"
        "vl      %%v25,144(%%r1,%2)        \n\t"
        "vl      %%v26,160(%%r1,%2)        \n\t"
        "vl      %%v27,176(%%r1,%2)        \n\t"
        "vl      %%v28,192(%%r1,%2)        \n\t"
        "vl      %%v29,208(%%r1,%2)        \n\t"
        "vl      %%v30,224(%%r1,%2)        \n\t"
        "vl      %%v31,240(%%r1,%2)        \n\t"

        /* 16 -> 8 */
        "vfminsb %%v16,%%v16,%%v24,8       \n\t"
        "vfminsb %%v17,%%v17,%%v25,8       \n\t"
        "vfminsb %%v18,%%v18,%%v26,8       \n\t"
        "vfminsb %%v19,%%v19,%%v27,8       \n\t"
        "vfminsb %%v20,%%v20,%%v28,8       \n\t"
        "vfminsb %%v21,%%v21,%%v29,8       \n\t"
        "vfminsb %%v22,%%v22,%%v30,8       \n\t"
        "vfminsb %%v23,%%v23,%%v31,8       \n\t"

        /* 8 -> 4 */
        "vfminsb %%v16,%%v16,%%v20,8       \n\t"
        "vfminsb %%v17,%%v17,%%v21,8       \n\t"
        "vfminsb %%v18,%%v18,%%v22,8       \n\t"
        "vfminsb %%v19,%%v19,%%v23,8       \n\t"

        /* 4 -> 2 */
        "vfminsb %%v16,%%v16,%%v18,8       \n\t"
        "vfminsb %%v17,%%v17,%%v19,8       \n\t"

        /* 2 -> 1 */
        "vfminsb %%v16,%%v16,%%v17,8       \n\t"

        /* fold this block into the running minimum (operand was "%%16") */
        "vfminsb %%v0,%%v0,%%v16,8         \n\t"

        "agfi    %%r1, 256                 \n\t"
        "brctg   %%r0, 0b                  \n\t"

        /* horizontal reduction of the four lanes of v0 */
        "veslg   %%v16,%%v0,32             \n\t"   /* move odd lanes onto even lanes */
        "vfminsb %%v0,%%v0,%%v16,8         \n\t"

        "vrepf   %%v16,%%v0,2              \n\t"   /* broadcast lane 2 */
        "wfminsb %%v0,%%v0,%%v16,8         \n\t"
        "lper    %0,%%f0                   "       /* result = |winner| */
        :"=f"(amin)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return amin;
}
|
||||
|
||||
/*
 * Smallest absolute value among the n elements of x read with stride inc_x.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading multiple of 64 elements goes through
 * samin_kernel_64 and the rest is scanned in scalar code.  Any other
 * stride is handled entirely in scalar code, four elements per pass.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;       /* index into x */
    BLASLONG seen = 0;      /* elements visited on the strided path */
    BLASLONG lane;
    BLASLONG blocked;
    FLOAT best = 0.0;
    FLOAT mag;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x != 1) {
        best = ABS(x[0]);

        /* groups of four strided elements */
        blocked = n & -4;
        for (; seen < blocked; seen += 4, pos += inc_x * 4) {
            for (lane = 0; lane < 4; lane++) {
                mag = ABS(x[pos + lane * inc_x]);
                if (mag < best) {
                    best = mag;
                }
            }
        }

        /* up to three leftovers */
        for (; seen < n; seen++, pos += inc_x) {
            mag = ABS(x[pos]);
            if (mag < best) {
                best = mag;
            }
        }
        return best;
    }

    blocked = n & -64;
    if (blocked > 0) {
        best = samin_kernel_64(blocked, x);
        pos = blocked;
    } else {
        /* fewer than 64 elements: seed the scan with the first one */
        best = ABS(x[0]);
        pos = 1;
    }

    for (; pos < n; pos++) {
        mag = ABS(x[pos]);
        if (mag < best) {
            best = mag;
        }
    }
    return best;
}
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* Absolute-value routine matching the precision selected by DOUBLE. */
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
|
||||
|
||||
/*
 * Vector kernel for the unit-stride case: sum of the absolute values of
 * the first n single-precision elements of x.
 *
 * The loop runs n >> 6 times and consumes 256 bytes (64 floats) per pass,
 * in two halves of eight vector loads each.  Four accumulators (v0..v3)
 * are used and combined after the loop.  The caller only passes a positive
 * multiple of 64.
 */
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT total;

    __asm__ (
        /* clear the four partial-sum registers */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v2 \n\t"
        "vzero %%v3 \n\t"
        "srlg %%r0,%1,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* first 128 bytes: load, take magnitudes, accumulate */
        "vl %%v16, 0(%%r1,%2) \n\t"
        "vl %%v17, 16(%%r1,%2) \n\t"
        "vl %%v18, 32(%%r1,%2) \n\t"
        "vl %%v19, 48(%%r1,%2) \n\t"
        "vl %%v20, 64(%%r1,%2) \n\t"
        "vl %%v21, 80(%%r1,%2) \n\t"
        "vl %%v22, 96(%%r1,%2) \n\t"
        "vl %%v23, 112(%%r1,%2) \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        "vfasb %%v0,%%v0,%%v16 \n\t"
        "vfasb %%v1,%%v1,%%v17 \n\t"
        "vfasb %%v2,%%v2,%%v18 \n\t"
        "vfasb %%v3,%%v3,%%v19 \n\t"
        "vfasb %%v0,%%v0,%%v20 \n\t"
        "vfasb %%v1,%%v1,%%v21 \n\t"
        "vfasb %%v2,%%v2,%%v22 \n\t"
        "vfasb %%v3,%%v3,%%v23 \n\t"

        /* second 128 bytes: same sequence */
        "vl %%v16, 128(%%r1,%2) \n\t"
        "vl %%v17, 144(%%r1,%2) \n\t"
        "vl %%v18, 160(%%r1,%2) \n\t"
        "vl %%v19, 176(%%r1,%2) \n\t"
        "vl %%v20, 192(%%r1,%2) \n\t"
        "vl %%v21, 208(%%r1,%2) \n\t"
        "vl %%v22, 224(%%r1,%2) \n\t"
        "vl %%v23, 240(%%r1,%2) \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"

        "vfasb %%v0,%%v0,%%v16 \n\t"
        "vfasb %%v1,%%v1,%%v17 \n\t"
        "vfasb %%v2,%%v2,%%v18 \n\t"
        "vfasb %%v3,%%v3,%%v19 \n\t"
        "vfasb %%v0,%%v0,%%v20 \n\t"
        "vfasb %%v1,%%v1,%%v21 \n\t"
        "vfasb %%v2,%%v2,%%v22 \n\t"
        "vfasb %%v3,%%v3,%%v23 \n\t"

        "agfi %%r1,256 \n\t"
        "brctg %%r0,0b \n\t"

        /* merge the accumulators, then add the four lanes together */
        "vfasb %%v0,%%v0,%%v1 \n\t"
        "vfasb %%v0,%%v0,%%v2 \n\t"
        "vfasb %%v0,%%v0,%%v3 \n\t"
        "veslg %%v1,%%v0,32 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        "vrepf %%v1,%%v0,2 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "ler %0,%%f0 "
        : "=f"(total)
        : "r"(n), "ZR"((const FLOAT (*)[n])x)
        : "memory", "cc", "r0", "r1",
          "v0", "v1", "v2", "v3",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
    );

    return total;
}
|
||||
|
||||
/*
 * Sum of the absolute values of the n elements of x read with stride
 * inc_x.  Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading multiple of 64 elements is summed by
 * sasum_kernel_64 and the tail is added in scalar code.  Other strides use
 * two interleaved scalar accumulators over groups of four elements; the
 * order of the additions is kept as-is because it affects rounding.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;       /* index into x */
    BLASLONG seen = 0;      /* elements visited on the strided path */
    BLASLONG blocked;
    FLOAT result = 0.0;

    if (n <= 0 || inc_x <= 0) {
        return result;
    }

    if (inc_x == 1) {
        blocked = n & -64;
        if (blocked > 0) {
            result = sasum_kernel_64(blocked, x);
            pos = blocked;
        }

        for (; pos < n; pos++) {
            result += ABS(x[pos]);
        }
        return result;
    }

    {
        register FLOAT even_sum = 0.0;
        register FLOAT odd_sum = 0.0;

        blocked = n & -4;
        for (; seen < blocked; seen += 4, pos += inc_x * 4) {
            even_sum += ABS(x[pos]);
            odd_sum += ABS(x[pos + inc_x]);
            even_sum += ABS(x[pos + 2 * inc_x]);
            odd_sum += ABS(x[pos + 3 * inc_x]);
        }
        result = even_sum + odd_sum;
    }

    /* up to three leftovers */
    for (; seen < n; seen++, pos += inc_x) {
        result += ABS(x[pos]);
    }

    return result;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,184 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel for unit-stride y := y + alpha * x on single-precision
 * data.
 *
 * *alpha is replicated into v0.  The loop runs n >> 6 times and processes
 * 256 bytes (64 floats) of x and of y per pass, in two halves of 128
 * bytes.  Results are written back to y in place.  The caller only passes
 * a non-zero multiple of 64.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    __asm__ volatile(
        "vlrepf %%v0,%3 \n\t"
        "srlg %%r0,%0,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%1) \n\t"
        "pfd 2, 1024(%%r1,%2) \n\t"

        /* bytes 0..63: x into v16..v19, y into v20..v23 */
        "vl %%v16,0(%%r1,%1) \n\t"
        "vl %%v17,16(%%r1,%1) \n\t"
        "vl %%v18,32(%%r1,%1) \n\t"
        "vl %%v19,48(%%r1,%1) \n\t"
        "vl %%v20,0(%%r1,%2) \n\t"
        "vl %%v21,16(%%r1,%2) \n\t"
        "vl %%v22,32(%%r1,%2) \n\t"
        "vl %%v23,48(%%r1,%2) \n\t"

        "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
        "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
        "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
        "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"

        /* bytes 64..127: x into v24..v27, y into v28..v31 */
        "vl %%v24,64(%%r1,%1) \n\t"
        "vl %%v25,80(%%r1,%1) \n\t"
        "vl %%v26,96(%%r1,%1) \n\t"
        "vl %%v27,112(%%r1,%1) \n\t"
        "vl %%v28,64(%%r1,%2) \n\t"
        "vl %%v29,80(%%r1,%2) \n\t"
        "vl %%v30,96(%%r1,%2) \n\t"
        "vl %%v31,112(%%r1,%2) \n\t"

        /* v20..v23 are free again here and receive the results */
        "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
        "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
        "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
        "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"

        "vst %%v16,0(%%r1,%2) \n\t"
        "vst %%v17,16(%%r1,%2) \n\t"
        "vst %%v18,32(%%r1,%2) \n\t"
        "vst %%v19,48(%%r1,%2) \n\t"
        "vst %%v20,64(%%r1,%2) \n\t"
        "vst %%v21,80(%%r1,%2) \n\t"
        "vst %%v22,96(%%r1,%2) \n\t"
        "vst %%v23,112(%%r1,%2) \n\t"

        /* bytes 128..191 */
        "vl %%v16,128(%%r1,%1) \n\t"
        "vl %%v17,144(%%r1,%1) \n\t"
        "vl %%v18,160(%%r1,%1) \n\t"
        "vl %%v19,176(%%r1,%1) \n\t"
        "vl %%v20,128(%%r1,%2) \n\t"
        "vl %%v21,144(%%r1,%2) \n\t"
        "vl %%v22,160(%%r1,%2) \n\t"
        "vl %%v23,176(%%r1,%2) \n\t"

        "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
        "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
        "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
        "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"

        /* bytes 192..255 */
        "vl %%v24,192(%%r1,%1) \n\t"
        "vl %%v25,208(%%r1,%1) \n\t"
        "vl %%v26,224(%%r1,%1) \n\t"
        "vl %%v27,240(%%r1,%1) \n\t"
        "vl %%v28,192(%%r1,%2) \n\t"
        "vl %%v29,208(%%r1,%2) \n\t"
        "vl %%v30,224(%%r1,%2) \n\t"
        "vl %%v31,240(%%r1,%2) \n\t"

        "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
        "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
        "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
        "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"

        "vst %%v16,128(%%r1,%2) \n\t"
        "vst %%v17,144(%%r1,%2) \n\t"
        "vst %%v18,160(%%r1,%2) \n\t"
        "vst %%v19,176(%%r1,%2) \n\t"
        "vst %%v20,192(%%r1,%2) \n\t"
        "vst %%v21,208(%%r1,%2) \n\t"
        "vst %%v22,224(%%r1,%2) \n\t"
        "vst %%v23,240(%%r1,%2) \n\t"

        "agfi %%r1,256 \n\t"
        "brctg %%r0,0b "
        : /* no register outputs; y is updated through memory */
        : "r"(n),
          "ZR"((const FLOAT (*)[n])x),
          "ZR"((FLOAT (*)[n])y),
          "m"(*alpha)
        : "memory", "cc", "r0", "r1", "v0",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );
}
|
||||
|
||||
/*
 * y := y + da * x over n elements, reading x with stride inc_x and
 * updating y with stride inc_y.  Always returns 0; nothing is done when
 * n <= 0.  dummy0, dummy1, dummy and dummy2 are not used.
 *
 * When both strides are 1 the leading multiple of 64 elements is handled
 * by saxpy_kernel_64 and the tail in scalar code.  Otherwise the update is
 * done in scalar code, four elements per pass; the four products are
 * formed before any element of y is written, as in the original.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG xpos = 0, ypos = 0;
    BLASLONG blocked;

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        blocked = n & -64;
        if (blocked != 0) {
            saxpy_kernel_64(blocked, x, y, &da);
        }

        for (k = blocked; k < n; k++) {
            y[k] += da * x[k];
        }
        return 0;
    }

    blocked = n & -4;
    for (k = 0; k < blocked; k += 4) {
        FLOAT p0 = da * x[xpos];
        FLOAT p1 = da * x[xpos + inc_x];
        FLOAT p2 = da * x[xpos + 2 * inc_x];
        FLOAT p3 = da * x[xpos + 3 * inc_x];

        y[ypos] += p0;
        y[ypos + inc_y] += p1;
        y[ypos + 2 * inc_y] += p2;
        y[ypos + 3 * inc_y] += p3;

        xpos += inc_x * 4;
        ypos += inc_y * 4;
    }

    /* up to three leftovers */
    for (; k < n; k++) {
        y[ypos] += da * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }
    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unit-stride copy kernel: copies n single-precision elements from x to y
 * in chunks of 256 bytes (64 floats) using MVC.
 *
 * The loop runs n >> 6 times.  r1 and r2 hold working copies of the source
 * and destination addresses and are advanced by 256 after every chunk.
 * The caller only passes a positive multiple of 64.
 */
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        "lgr %%r1,%1 \n\t"
        "lgr %%r2,%2 \n\t"
        "srlg %%r0,%0,6 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1) \n\t"
        "pfd 2, 1024(%%r2) \n\t"
        "mvc 0(256,%%r2),0(%%r1) \n\t"
        "agfi %%r1,256 \n\t"
        "agfi %%r2,256 \n\t"
        "brctg %%r0,0b "
        : /* no register outputs; y is written through memory */
        : "r"(n),
          "a"((const FLOAT (*)[n])x),
          "a"((FLOAT (*)[n])y)
        : "memory", "cc", "r0", "r1", "r2"
    );
}
|
||||
|
||||
/*
 * Copy n elements from x (stride inc_x) to y (stride inc_y).  Always
 * returns 0; nothing is copied when n <= 0.
 *
 * When both strides are 1 the leading multiple of 64 elements is copied by
 * scopy_kernel_64 and the tail element by element; otherwise every element
 * is copied individually.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG k = 0;
    BLASLONG src = 0, dst = 0;

    if (n <= 0) {
        return 0;
    }

    if (inc_x != 1 || inc_y != 1) {
        for (; k < n; k++) {
            y[dst] = x[src];
            src += inc_x;
            dst += inc_y;
        }
        return 0;
    }

    {
        BLASLONG blocked = n & -64;
        if (blocked > 0) {
            scopy_kernel_64(blocked, x, y);
            k = blocked;
        }
    }

    for (; k < n; k++) {
        y[k] = x[k];
    }
    return 0;
}
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018,The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms,with or without
|
||||
modification,are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice,this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice,this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vector kernel for the unit-stride dot product of the first n
 * single-precision elements of x and y.
 *
 * The loop runs n >> 5 times and consumes 128 bytes (32 floats) of each
 * input per pass.  All eight multiply-adds of a pass accumulate into the
 * single register v0, whose four lanes are added together after the loop.
 * The caller only passes a non-zero multiple of 32.
 */
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    FLOAT result;

    __asm__ volatile (
        "vzero %%v0 \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"

        /* eight vectors of x */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        /* load each vector of y and multiply-accumulate right away */
        "vl %%v24,0(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,16(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
        "vl %%v26,32(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
        "vl %%v27,48(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
        "vl %%v28,64(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
        "vl %%v29,80(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
        "vl %%v30,96(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
        "vl %%v31,112(%%r1,%3) \n\t"
        "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* add lanes 1, 2 and 3 onto lane 0 */
        "vrepf %%v1,%%v0,1 \n\t"
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepf %%v3,%%v0,3 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "aebr %%f0,%%f2 \n\t"
        "aebr %%f0,%%f3 \n\t"
        "ler %0,%%f0 "
        : "=f"(result)
        : "r"(n),
          "ZR"((const FLOAT (*)[n])x),
          "ZR"((const FLOAT (*)[n])y)
        : "memory", "cc", "r0", "r1",
          "v0", "v1", "v2", "v3",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
    );

    return result;
}
|
||||
|
||||
/*
 * Dot product of n elements of x (stride inc_x) and y (stride inc_y).
 * Returns 0.0 when n <= 0.
 *
 * When both strides are 1 the leading multiple of 32 elements is handled
 * by sdot_kernel_32 and the tail in scalar code.  Otherwise the products
 * are accumulated in scalar code two at a time; the shape of that
 * expression is kept as-is because it affects rounding.
 */
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG xpos = 0, ypos = 0;
    BLASLONG blocked;
    FLOAT acc = 0.0;

    if (n <= 0) {
        return acc;
    }

    if (inc_x == 1 && inc_y == 1) {
        blocked = n & -32;
        if (blocked != 0) {
            acc = sdot_kernel_32(blocked, x, y);
        }

        for (k = blocked; k < n; k++) {
            acc += y[k] * x[k];
        }
        return acc;
    }

    blocked = n & -2;
    for (k = 0; k < blocked; k += 2) {
        acc += y[ypos] * x[xpos] + y[ypos+inc_y] * x[xpos+inc_x];
        xpos += inc_x * 2;
        ypos += inc_y * 2;
    }

    /* at most one leftover */
    for (; k < n; k++) {
        acc += y[ypos] * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }
    return acc;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,668 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
/*
 * Non-transposed SGEMV micro-kernel for four columns:
 *   y[i] += alpha*xo[0]*ap[0][i] + alpha*xo[1]*ap[1][i]
 *         + alpha*xo[2]*ap[2][i] + alpha*xo[3]*ap[3][i]
 * The main loop handles 32 floats per iteration and the tail loop 4 floats
 * per iteration; the low two bits of n are masked off, so any n & 3
 * remainder is not processed here.
 *
 * Operands: %0 = n, %1..%4 = ap[0..3], %5 = xo, %6 = y, %7 = *alpha.
 * r1 is the running byte offset, r0 the loop counter.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* v0..v3 = xo[0..3], each replicated to all lanes, then scaled by alpha (v4) */
"vlrepf %%v0,0(%5) \n\t"
|
||||
"vlrepf %%v1,4(%5) \n\t"
|
||||
"vlrepf %%v2,8(%5) \n\t"
|
||||
"vlrepf %%v3,12(%5) \n\t"
|
||||
"vlrepf %%v4,%7 \n\t"
|
||||
"vfmsb %%v0,%%v0,%%v4 \n\t"
|
||||
"vfmsb %%v1,%%v1,%%v4 \n\t"
|
||||
"vfmsb %%v2,%%v2,%%v4 \n\t"
|
||||
"vfmsb %%v3,%%v3,%%v4 \n\t"
|
||||
/* r1 = 0: byte offset into the columns and y */
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
/* r0 = n & -32; skip the main loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
/* r0 = number of 32-float (128-byte) iterations */
"srlg %%r0,%%r0,5 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch 1024 bytes ahead: columns with code 1, y with code 2 */
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 1,1024(%%r1,%3) \n\t"
|
||||
"pfd 1,1024(%%r1,%4) \n\t"
|
||||
"pfd 2,1024(%%r1,%6) \n\t"
|
||||
|
||||
/* first 64 bytes of each of the four columns */
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
"vl %%v19,0(%%r1,%4) \n\t"
|
||||
"vl %%v20,16(%%r1,%1) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,16(%%r1,%3) \n\t"
|
||||
"vl %%v23,16(%%r1,%4) \n\t"
|
||||
"vl %%v24,32(%%r1,%1) \n\t"
|
||||
"vl %%v25,32(%%r1,%2) \n\t"
|
||||
"vl %%v26,32(%%r1,%3) \n\t"
|
||||
"vl %%v27,32(%%r1,%4) \n\t"
|
||||
"vl %%v28,48(%%r1,%1) \n\t"
|
||||
"vl %%v29,48(%%r1,%2) \n\t"
|
||||
"vl %%v30,48(%%r1,%3) \n\t"
|
||||
"vl %%v31,48(%%r1,%4) \n\t"
|
||||
|
||||
/* y[0..3] += column0*v0 + column1*v1 + column2*v2 + column3*v3 (load, 4 FMAs, store) */
"vl %%v4,0(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,0(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,16(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,16(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,32(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,32(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,48(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,48(%%r1,%6) \n\t"
|
||||
|
||||
/* second 64 bytes of each column, reusing v16..v31 */
"vl %%v16,64(%%r1,%1) \n\t"
|
||||
"vl %%v17,64(%%r1,%2) \n\t"
|
||||
"vl %%v18,64(%%r1,%3) \n\t"
|
||||
"vl %%v19,64(%%r1,%4) \n\t"
|
||||
"vl %%v20,80(%%r1,%1) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,80(%%r1,%3) \n\t"
|
||||
"vl %%v23,80(%%r1,%4) \n\t"
|
||||
"vl %%v24,96(%%r1,%1) \n\t"
|
||||
"vl %%v25,96(%%r1,%2) \n\t"
|
||||
"vl %%v26,96(%%r1,%3) \n\t"
|
||||
"vl %%v27,96(%%r1,%4) \n\t"
|
||||
"vl %%v28,112(%%r1,%1) \n\t"
|
||||
"vl %%v29,112(%%r1,%2) \n\t"
|
||||
"vl %%v30,112(%%r1,%3) \n\t"
|
||||
"vl %%v31,112(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v4,64(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,64(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,80(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,80(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,96(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,96(%%r1,%6) \n\t"
|
||||
|
||||
"vl %%v4,112(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,112(%%r1,%6) \n\t"
|
||||
|
||||
/* advance 128 bytes and loop while r0 counts down */
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
/* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
"lghi %%r0,28 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,0(%%r1,%3) \n\t"
|
||||
"vl %%v19,0(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v4,0(%%r1,%6) \n\t"
|
||||
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
|
||||
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
|
||||
"vst %%v4,0(%%r1,%6) \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
/* no output operands; the stores to y are covered by the "memory" clobber */
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Non-transposed SGEMV micro-kernel for two columns:
 *   y[i] += alpha*xo[0]*ap[0][i] + alpha*xo[1]*ap[1][i]
 * The main loop handles 32 floats per iteration and the tail loop 4 floats
 * per iteration; any n & 3 remainder is not processed here.
 *
 * Operands: %0 = n, %1 = ap[0], %2 = ap[1], %3 = xo, %4 = y, %5 = *alpha.
 * r1 is the running byte offset, r0 the loop counter.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* v0, v1 = xo[0], xo[1] replicated to all lanes and scaled by alpha (v2) */
"vlrepf %%v0,0(%3) \n\t"
|
||||
"vlrepf %%v1,4(%3) \n\t"
|
||||
"vlrepf %%v2,%5 \n\t"
|
||||
"vfmsb %%v0,%%v0,%%v2 \n\t"
|
||||
"vfmsb %%v1,%%v1,%%v2 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
/* r0 = n & -32; skip the main loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,5 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 2,1024(%%r1,%4) \n\t"
|
||||
|
||||
/* 128 bytes of each column: even registers from ap[0], odd from ap[1] */
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
"vl %%v18,16(%%r1,%1) \n\t"
|
||||
"vl %%v19,16(%%r1,%2) \n\t"
|
||||
"vl %%v20,32(%%r1,%1) \n\t"
|
||||
"vl %%v21,32(%%r1,%2) \n\t"
|
||||
"vl %%v22,48(%%r1,%1) \n\t"
|
||||
"vl %%v23,48(%%r1,%2) \n\t"
|
||||
"vl %%v24,64(%%r1,%1) \n\t"
|
||||
"vl %%v25,64(%%r1,%2) \n\t"
|
||||
"vl %%v26,80(%%r1,%1) \n\t"
|
||||
"vl %%v27,80(%%r1,%2) \n\t"
|
||||
"vl %%v28,96(%%r1,%1) \n\t"
|
||||
"vl %%v29,96(%%r1,%2) \n\t"
|
||||
"vl %%v30,112(%%r1,%1) \n\t"
|
||||
"vl %%v31,112(%%r1,%2) \n\t"
|
||||
|
||||
/* per 16-byte slice of y: load, two FMAs (one per column), store */
"vl %%v2,0(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,0(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,16(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v18,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v19,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,16(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,32(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v20,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v21,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,32(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,48(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v22,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v23,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,48(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,64(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v24,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v25,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,64(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,80(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v26,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v27,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,80(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,96(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v28,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v29,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,96(%%r1,%4) \n\t"
|
||||
|
||||
"vl %%v2,112(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v30,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v31,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,112(%%r1,%4) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
/* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
"lghi %%r0,28 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,0(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v2,0(%%r1,%4) \n\t"
|
||||
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
|
||||
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
|
||||
"vst %%v2,0(%%r1,%4) \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
/* no output operands; the stores to y are covered by the "memory" clobber */
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Non-transposed SGEMV micro-kernel for a single column:
 *   y[i] += alpha*xo[0]*a0[i]
 * The main loop handles 32 floats per iteration and the tail loop 4 floats
 * per iteration; any n & 3 remainder is not processed here.
 *
 * Operands: %0 = n, %1 = a0, %2 = xo, %3 = y, %4 = *alpha.
 * r1 is the running byte offset, r0 the loop counter.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* v0 = xo[0] replicated to all lanes and scaled by alpha (v1) */
"vlrepf %%v0,0(%2) \n\t"
|
||||
"vlrepf %%v1,%4 \n\t"
|
||||
"vfmsb %%v0,%%v0,%%v1 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
/* r0 = n & -32; skip the main loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,5 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%1) \n\t"
|
||||
"pfd 2,1024(%%r1,%3) \n\t"
|
||||
|
||||
/* 128 bytes of the column */
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
"vl %%v18,32(%%r1,%1) \n\t"
|
||||
"vl %%v19,48(%%r1,%1) \n\t"
|
||||
"vl %%v20,64(%%r1,%1) \n\t"
|
||||
"vl %%v21,80(%%r1,%1) \n\t"
|
||||
"vl %%v22,96(%%r1,%1) \n\t"
|
||||
"vl %%v23,112(%%r1,%1) \n\t"
|
||||
|
||||
/* per 16-byte slice of y: load, one FMA, store (v1 is reused as the y temporary) */
"vl %%v1,0(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,0(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,16(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v17,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,16(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,32(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v18,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,32(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,48(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v19,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,48(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,64(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v20,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,64(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,80(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v21,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,80(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,96(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v22,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,96(%%r1,%3) \n\t"
|
||||
|
||||
"vl %%v1,112(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v23,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,112(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
/* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
"lghi %%r0,28 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v1,0(%%r1,%3) \n\t"
|
||||
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
|
||||
"vst %%v1,0(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
/* no output operands; the stores to y are covered by the "memory" clobber */
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Adds the n contiguous values in src onto dest, where consecutive
 * destination elements are inc_dest apart: dest[k * inc_dest] += src[k].
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    FLOAT *in = src;
    FLOAT *out = dest;
    BLASLONG remaining = n;

    while (remaining > 0) {
        *out += *in;
        in++;
        out += inc_dest;
        remaining--;
    }
}
|
||||
|
||||
/*
 * Single-precision GEMV, non-transposed: y += alpha * A * x, where element
 * (i, j) of A is a[i + j * lda].
 *
 *   m, n     rows / columns of A; returns immediately if either is < 1
 *   dummy1   unused
 *   alpha    scalar applied to every product
 *   a, lda   matrix and the distance (in elements) between columns
 *   x, inc_x input vector of n elements and its stride
 *   y, inc_y vector of m elements that is updated in place, and its stride
 *   buffer   scratch space, used to accumulate a row block when inc_y != 1;
 *            it must hold at least min(m, NBMAX) elements
 *
 * Rows are processed in blocks of at most NBMAX using the vector kernels,
 * which only handle row counts that are multiples of 4. The final m & 3
 * rows are handled by the scalar code at the end. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4 = 4 * lda;
    FLOAT xbuffer[8], *ybuffer;

    if (m < 1) return 0;
    if (n < 1) return 0;

    ybuffer = buffer;

    n1 = n >> 2;   /* number of full groups of four columns */
    n2 = n & 3;    /* leftover columns */

    m3 = m & 3;                     /* rows left for the scalar tail */
    m1 = m & -4;                    /* rows handled by the vector kernels */
    m2 = (m & (NBMAX - 1)) - m3;    /* size of the final, partial row block */

    y_ptr = y;

    BLASLONG NB = NBMAX;

    /* One iteration per row block; the loop ends after the first block
       that is smaller than NBMAX (or immediately if there is none). */
    while (NB == NBMAX) {
        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;

        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        /* With a strided y the kernels accumulate into the zeroed scratch
           buffer, which is scattered into y afterwards; otherwise they
           update y directly. */
        if (inc_y != 1)
            memset(ybuffer, 0, NB * sizeof(FLOAT));
        else
            ybuffer = y_ptr;

        if (inc_x == 1) {
            for (i = 0; i < n1; i++) {
                sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
                x_ptr += 4;
            }

            if (n2 & 2) {
                sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
                a_ptr += lda * 2;
                x_ptr += 2;
            }

            if (n2 & 1) {
                sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
            }
        } else {
            /* Strided x: gather the needed elements into xbuffer first. */
            for (i = 0; i < n1; i++) {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[1] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[3] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
            }

            for (i = 0; i < n2; i++) {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
                a_ptr += lda;
            }
        }

        /* Move on to the next row block. */
        a += NB;
        if (inc_y != 1) {
            add_y(NB, ybuffer, y_ptr, inc_y);
            y_ptr += NB * inc_y;
        } else {
            y_ptr += NB;
        }
    }

    if (m3 == 0) return 0;

    /* Scalar tail: the last m3 (1..3) rows. Each branch has a fast path
       for the case where the columns are packed exactly m3 apart and x is
       contiguous, and a generic strided path. */
    if (m3 == 3) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        FLOAT temp2 = 0.0;

        if (lda == 3 && inc_x == 1) {
            for (i = 0; i < (n & -4); i += 4) {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

                temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
                temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
                temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

                a_ptr += 12;
                x_ptr += 4;
            }

            for (; i < n; i++) {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += 3;
                x_ptr++;
            }
        } else {
            for (i = 0; i < n; i++) {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp2;
        return 0;
    }

    if (m3 == 2) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;

        if (lda == 2 && inc_x == 1) {
            for (i = 0; i < (n & -4); i += 4) {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
                temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                a_ptr += 8;
                x_ptr += 4;
            }

            for (; i < n; i++) {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += 2;
                x_ptr++;
            }
        } else {
            for (i = 0; i < n; i++) {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        return 0;
    }

    /* m3 == 1: a single remaining row. */
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp = 0.0;

        if (lda == 1 && inc_x == 1) {
            for (i = 0; i < (n & -4); i += 4) {
                temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
            }

            for (; i < n; i++) {
                temp += a_ptr[i] * x_ptr[i];
            }
        } else {
            for (i = 0; i < n; i++) {
                temp += a_ptr[0] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

        y_ptr[0] += alpha * temp;
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,827 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
/*
 * Transposed SGEMV micro-kernel for four columns: computes the dot products
 *   y[k] = sum over i of ap[k][i] * x[i],   k = 0..3
 * and stores them to y[0..3] (y is overwritten, not accumulated).
 * The main loop consumes 32 floats per iteration and the tail loop 4 floats
 * per iteration; any n & 3 remainder is not processed here.
 *
 * Operands: %0 = n, %1..%4 = ap[0..3], %5 = x, %6 = y.
 * r1 is the running byte offset, r0 the loop counter.
 *
 * The horizontal reduction folds the four lanes of each accumulator with
 * vector instructions and performs only ONE scalar add per accumulator.
 * Per the z/Architecture vector/FPR overlay rules, a scalar floating-point
 * write to an FPR leaves bits 64-127 of the overlaid vector register
 * unpredictable, so lanes 2 and 3 must not be read from the accumulator
 * after a scalar "aebr" has written it. The lane sum is therefore evaluated
 * as (s0 + s1) + (s2 + s3).
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* v0..v3: four-lane partial sums, one accumulator per column */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v2 \n\t"
        "vzero %%v3 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -32; skip the main loop when fewer than 32 elements */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"
        "pfd 1,1024(%%r1,%4) \n\t"
        "pfd 1,1024(%%r1,%5) \n\t"

        /* 32 floats of x */
        "vl %%v16,0(%%r1,%5) \n\t"
        "vl %%v17,16(%%r1,%5) \n\t"
        "vl %%v18,32(%%r1,%5) \n\t"
        "vl %%v19,48(%%r1,%5) \n\t"
        "vl %%v20,64(%%r1,%5) \n\t"
        "vl %%v21,80(%%r1,%5) \n\t"
        "vl %%v22,96(%%r1,%5) \n\t"
        "vl %%v23,112(%%r1,%5) \n\t"

        /* for each 16-byte slice: accumulate x * column into v0..v3 */
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"

        "vl %%v28,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t"
        "vl %%v29,16(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t"
        "vl %%v30,16(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t"
        "vl %%v31,16(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t"

        "vl %%v24,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t"
        "vl %%v25,32(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t"
        "vl %%v26,32(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t"
        "vl %%v27,32(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t"

        "vl %%v28,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t"
        "vl %%v29,48(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t"
        "vl %%v30,48(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t"
        "vl %%v31,48(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t"

        "vl %%v24,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
        "vl %%v25,64(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"
        "vl %%v26,64(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t"
        "vl %%v27,64(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t"

        "vl %%v28,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t"
        "vl %%v29,80(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t"
        "vl %%v30,80(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t"
        "vl %%v31,80(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t"

        "vl %%v24,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t"
        "vl %%v25,96(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t"
        "vl %%v26,96(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t"
        "vl %%v27,96(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t"

        "vl %%v28,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t"
        "vl %%v29,112(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t"
        "vl %%v30,112(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t"
        "vl %%v31,112(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        "1: \n\t"
        /* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%5) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"

        "3: \n\t"
        /* Horizontal sum of each accumulator [s0,s1,s2,s3]:
             veslg: v4 = [s1,0,s3,0]          (shift each doubleword left 32)
             vfasb: acc = [s0+s1, s1, s2+s3, s3]
             vrepg: v4 = doubleword 1 of acc  (lane 0 = s2+s3)
             aebr : f = (s0+s1) + (s2+s3)     (single scalar write, nothing
                                               is read from acc afterwards) */
        "veslg %%v4,%%v0,32 \n\t"
        "vfasb %%v0,%%v0,%%v4 \n\t"
        "vrepg %%v4,%%v0,1 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "ste %%f0,0(%6) \n\t"
        "veslg %%v4,%%v1,32 \n\t"
        "vfasb %%v1,%%v1,%%v4 \n\t"
        "vrepg %%v4,%%v1,1 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "ste %%f1,4(%6) \n\t"
        "veslg %%v4,%%v2,32 \n\t"
        "vfasb %%v2,%%v2,%%v4 \n\t"
        "vrepg %%v4,%%v2,1 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "ste %%f2,8(%6) \n\t"
        "veslg %%v4,%%v3,32 \n\t"
        "vfasb %%v3,%%v3,%%v4 \n\t"
        "vrepg %%v4,%%v3,1 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "ste %%f3,12(%6) "
        /* no output operands; the stores to y are covered by the "memory" clobber */
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Transposed SGEMV micro-kernel for two columns: computes the dot products
 *   y[k] = sum over i of ap[k][i] * x[i],   k = 0..1
 * and stores them to y[0..1] (y is overwritten, not accumulated).
 * The main loop consumes 32 floats per iteration and the tail loop 4 floats
 * per iteration; any n & 3 remainder is not processed here.
 *
 * Operands: %0 = n, %1 = ap[0], %2 = ap[1], %3 = x, %4 = y.
 * r1 is the running byte offset, r0 the loop counter.
 *
 * The horizontal reduction folds the lanes with vector instructions and does
 * a single scalar add per accumulator: a scalar floating-point write to an
 * FPR leaves bits 64-127 of the overlaid vector register unpredictable, so
 * lanes 2 and 3 must not be read from the accumulator after "aebr" has
 * written it. The lane sum is evaluated as (s0 + s1) + (s2 + s3).
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* v0, v1: four-lane partial sums for ap[0] and ap[1] */
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -32; skip the main loop when fewer than 32 elements */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"

        /* 32 floats of x */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"

        /* for each 16-byte slice: accumulate x * column into v0 / v1 */
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"

        "vl %%v26,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t"
        "vl %%v27,16(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t"

        "vl %%v28,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t"
        "vl %%v29,32(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t"

        "vl %%v30,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t"
        "vl %%v31,48(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t"

        "vl %%v24,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
        "vl %%v25,64(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"

        "vl %%v26,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t"
        "vl %%v27,80(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t"

        "vl %%v28,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t"
        "vl %%v29,96(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t"

        "vl %%v30,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t"
        "vl %%v31,112(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        "1: \n\t"
        /* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%3) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"

        "3: \n\t"
        /* Horizontal sum of each accumulator [s0,s1,s2,s3] using v2 as the
           temporary: pairwise add within each doubleword, then add the two
           doubleword results with a single scalar add. */
        "veslg %%v2,%%v0,32 \n\t"
        "vfasb %%v0,%%v0,%%v2 \n\t"
        "vrepg %%v2,%%v0,1 \n\t"
        "aebr %%f0,%%f2 \n\t"
        "ste %%f0,0(%4) \n\t"
        "veslg %%v2,%%v1,32 \n\t"
        "vfasb %%v1,%%v1,%%v2 \n\t"
        "vrepg %%v2,%%v1,1 \n\t"
        "aebr %%f1,%%f2 \n\t"
        "ste %%f1,4(%4) "
        /* no output operands; the stores to y are covered by the "memory" clobber */
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
        :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Transposed SGEMV micro-kernel for one column: computes the dot product
 *   y[0] = sum over i of a0[i] * x[i]
 * and stores it to y[0] (y is overwritten, not accumulated).
 * The main loop consumes 32 floats per iteration and the tail loop 4 floats
 * per iteration; any n & 3 remainder is not processed here.
 *
 * Operands: %0 = n, %1 = a0, %2 = x, %3 = y.
 * r1 is the running byte offset, r0 the loop counter.
 *
 * The horizontal reduction folds the lanes with vector instructions and does
 * a single scalar add: a scalar floating-point write to an FPR leaves bits
 * 64-127 of the overlaid vector register unpredictable, so lanes 2 and 3
 * must not be read from the accumulator after "aebr" has written it. The
 * lane sum is evaluated as (s0 + s1) + (s2 + s3).
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        /* v0: four-lane partial sum */
        "vzero %%v0 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -32; skip the main loop when fewer than 32 elements */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"

        /* 32 floats of x */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        /* for each 16-byte slice: accumulate x * column into v0 */
        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"

        "vl %%v25,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"

        "vl %%v26,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"

        "vl %%v27,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"

        "vl %%v28,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"

        "vl %%v29,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"

        "vl %%v30,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"

        "vl %%v31,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        "1: \n\t"
        /* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%2) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"

        "3: \n\t"
        /* Horizontal sum of v0 = [s0,s1,s2,s3] using v1 as the temporary:
           pairwise add within each doubleword, then add the two doubleword
           results with a single scalar add. */
        "veslg %%v1,%%v0,32 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        "vrepg %%v1,%%v0,1 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "ste %%f0,0(%3) "
        /* no output operands; the store to y is covered by the "memory" clobber */
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
|
||||
|
||||
/*
 * Gathers n elements from src, read inc_src apart, into the contiguous
 * array dest: dest[k] = src[k * inc_src].
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    FLOAT *in = src;
    FLOAT *out = dest;
    BLASLONG remaining;

    for (remaining = n; remaining > 0; remaining--) {
        *out = *in;
        out++;
        in += inc_src;
    }
}
|
||||
|
||||
/*
 * Contiguous scaled accumulate: dest[i] += da * src[i].
 * The main loop handles 32 floats per iteration and the tail loop 4 floats
 * per iteration; any n & 3 remainder is not processed here.
 *
 * Operands: %0 = n, %1 = da, %2 = src, %3 = dest.
 * r1 is the running byte offset, r0 the loop counter.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
|
||||
{
|
||||
__asm__ volatile (
|
||||
/* v0 = da replicated to all lanes */
"vlrepf %%v0,%1 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
/* r0 = n & -32; skip the main loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 1f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,5 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1,1024(%%r1,%2) \n\t"
|
||||
"pfd 2,1024(%%r1,%3) \n\t"
|
||||
|
||||
/* 32 floats of src */
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
/* per 16-byte slice of dest: load, FMA with src * da, store */
"vl %%v24, 0(%%r1,%3) \n\t"
|
||||
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
|
||||
"vst %%v24, 0(%%r1,%3) \n\t"
|
||||
"vl %%v25, 16(%%r1,%3) \n\t"
|
||||
"vfmasb %%v25,%%v17,%%v0,%%v25 \n\t"
|
||||
"vst %%v25, 16(%%r1,%3) \n\t"
|
||||
"vl %%v26, 32(%%r1,%3) \n\t"
|
||||
"vfmasb %%v26,%%v18,%%v0,%%v26 \n\t"
|
||||
"vst %%v26, 32(%%r1,%3) \n\t"
|
||||
"vl %%v27, 48(%%r1,%3) \n\t"
|
||||
"vfmasb %%v27,%%v19,%%v0,%%v27 \n\t"
|
||||
"vst %%v27, 48(%%r1,%3) \n\t"
|
||||
"vl %%v28, 64(%%r1,%3) \n\t"
|
||||
"vfmasb %%v28,%%v20,%%v0,%%v28 \n\t"
|
||||
"vst %%v28, 64(%%r1,%3) \n\t"
|
||||
"vl %%v29, 80(%%r1,%3) \n\t"
|
||||
"vfmasb %%v29,%%v21,%%v0,%%v29 \n\t"
|
||||
"vst %%v29, 80(%%r1,%3) \n\t"
|
||||
"vl %%v30, 96(%%r1,%3) \n\t"
|
||||
"vfmasb %%v30,%%v22,%%v0,%%v30 \n\t"
|
||||
"vst %%v30, 96(%%r1,%3) \n\t"
|
||||
"vl %%v31, 112(%%r1,%3) \n\t"
|
||||
"vfmasb %%v31,%%v23,%%v0,%%v31 \n\t"
|
||||
"vst %%v31, 112(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
/* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
"lghi %%r0,28 \n\t"
|
||||
"ngr %%r0,%0 \n\t"
|
||||
"ltgr %%r0,%%r0 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"srlg %%r0,%%r0,2 \n\t"
|
||||
"2: \n\t"
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%3) \n\t"
|
||||
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
|
||||
"vst %%v24, 0(%%r1,%3) \n\t"
|
||||
|
||||
"agfi %%r1,16 \n\t"
|
||||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"nop "
|
||||
/* no output operands; the stores to dest are covered by the "memory" clobber */
:
|
||||
:"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
/*
 * Accumulate da * src[k] into dest for k in [0, n).
 *
 * src is always read contiguously; dest is walked with stride inc_dest.
 * A unit destination stride is delegated to add_y_kernel_4, every other
 * stride is handled by the scalar loop below.
 */
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;

    if (inc_dest == 1) {
        add_y_kernel_4(n, da, src, dest);
        return;
    }

    for (k = 0; k < n; k++) {
        *dest += src[k] * da;
        dest += inc_dest;
    }
}
|
||||
|
||||
/*
 * Transposed GEMV driver: for every column j of the column-major matrix a
 * (leading dimension lda) it adds alpha * dot(a[:, j], x) to y[j * inc_y].
 *
 * Rows are consumed in blocks of at most NBMAX (a multiple of four rows per
 * block); the up-to-three rows left over are folded in by the scalar code at
 * the end of the function.
 *
 * buffer is scratch space: its head receives a packed copy of x when
 * inc_x != 1, and the area after it collects the per-column results that
 * add_y then scales by alpha and accumulates into y.
 *
 * NOTE(review): the sgemv_kernel_4x4 / 4x2 / 4x1 helpers and copy_x are
 * defined outside this block; they are presumed to store (not accumulate)
 * 4 / 2 / 1 column dot products into their last argument - confirm there.
 *
 * Always returns 0. dummy1 is unused.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG j;
    BLASLONG k;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT ybuffer[2] __attribute__ ((aligned(16)));
    FLOAT *xbuffer;
    FLOAT *ytemp;

    if (m < 1 || n < 1)
        return 0;

    xbuffer = buffer;
    ytemp = buffer + (m < NBMAX ? m : NBMAX);

    /* Column partition: whole NBMAX panels, then groups of 4, then 0..3. */
    const BLASLONG col_panels = n / NBMAX;
    const BLASLONG col_quads = (n % NBMAX) >> 2;
    const BLASLONG col_rest = n & 3;

    /* Row partition: full NBMAX blocks, one shorter block, then 0..3 rows. */
    const BLASLONG row_rest = m & 3;
    const BLASLONG row_short_block = (m & (NBMAX - 1)) - row_rest;
    BLASLONG rows_left = m & -4;

    BLASLONG nb = NBMAX;

    /* The loop ends after the first block that is shorter than NBMAX. */
    while (nb == NBMAX) {
        rows_left -= nb;
        if (rows_left < 0) {
            if (row_short_block == 0)
                break;
            nb = row_short_block;
        }

        y_ptr = y;
        a_ptr = a;
        x_ptr = x;

        if (inc_x == 1)
            xbuffer = x_ptr;
        else
            copy_x(nb, x_ptr, xbuffer, inc_x);

        const BLASLONG lda4 = 4 * lda;
        FLOAT *ap[4];
        FLOAT *yp;

        /* Four consecutive column pointers handed to the kernels. */
        ap[0] = a_ptr;
        for (k = 1; k < 4; k++)
            ap[k] = ap[k - 1] + lda;

        /* Whole panels of NBMAX columns. */
        for (j = 0; j < col_panels; j++) {
            const BLASLONG quads = NBMAX / 4;

            yp = ytemp;
            for (i = 0; i < quads; i++) {
                sgemv_kernel_4x4(nb, ap, xbuffer, yp);
                for (k = 0; k < 4; k++)
                    ap[k] += lda4;
                yp += 4;
            }
            add_y(quads * 4, alpha, ytemp, y_ptr, inc_y);
            y_ptr += quads * inc_y * 4;
            a_ptr += quads * lda4;
        }

        /* Remaining groups of four columns. */
        yp = ytemp;
        for (i = 0; i < col_quads; i++) {
            sgemv_kernel_4x4(nb, ap, xbuffer, yp);
            for (k = 0; k < 4; k++)
                ap[k] += lda4;
            yp += 4;
        }
        if (col_quads > 0) {
            add_y(col_quads * 4, alpha, ytemp, y_ptr, inc_y);
            y_ptr += col_quads * inc_y * 4;
            a_ptr += col_quads * lda4;
        }

        /* Two leftover columns. */
        if (col_rest & 2) {
            sgemv_kernel_4x2(nb, ap, xbuffer, ybuffer);
            a_ptr += lda * 2;
            *y_ptr += ybuffer[0] * alpha;
            y_ptr += inc_y;
            *y_ptr += ybuffer[1] * alpha;
            y_ptr += inc_y;
        }

        /* One leftover column. */
        if (col_rest & 1) {
            sgemv_kernel_4x1(nb, a_ptr, xbuffer, ybuffer);
            *y_ptr += ybuffer[0] * alpha;
        }

        /* Move on to the next block of rows. */
        a += nb;
        x += nb * inc_x;
    }

    if (row_rest == 0)
        return 0;

    /* Scalar handling of the last 1..3 rows; a and x now point at them. */
    x_ptr = x;
    a_ptr = a;

    if (row_rest == 3) {
        const FLOAT xs0 = x_ptr[0] * alpha;
        const FLOAT xs1 = x_ptr[inc_x] * alpha;
        const FLOAT xs2 = x_ptr[2 * inc_x] * alpha;
        const BLASLONG n4 = n & -4;
        FLOAT *aj = a_ptr;

        y_ptr = y;

        if (lda == 3 && inc_y == 1) {
            /* Columns are densely packed: 12 floats per four columns. */
            for (j = 0; j < n4; j += 4) {
                y_ptr[j]     += aj[0] * xs0 + aj[1]  * xs1 + aj[2]  * xs2;
                y_ptr[j + 1] += aj[3] * xs0 + aj[4]  * xs1 + aj[5]  * xs2;
                y_ptr[j + 2] += aj[6] * xs0 + aj[7]  * xs1 + aj[8]  * xs2;
                y_ptr[j + 3] += aj[9] * xs0 + aj[10] * xs1 + aj[11] * xs2;
                aj += 12;
            }
            for (; j < n; j++) {
                y_ptr[j] += aj[0] * xs0 + aj[1] * xs1 + aj[2] * xs2;
                aj += 3;
            }
        } else if (inc_y == 1) {
            const BLASLONG lda2 = 2 * lda;
            const BLASLONG lda3 = lda2 + lda;
            const BLASLONG step4 = 4 * lda;

            for (j = 0; j < n4; j += 4) {
                y_ptr[j]     += aj[0]    * xs0 + aj[1]        * xs1 + aj[2]        * xs2;
                y_ptr[j + 1] += aj[lda]  * xs0 + aj[lda + 1]  * xs1 + aj[lda + 2]  * xs2;
                y_ptr[j + 2] += aj[lda2] * xs0 + aj[lda2 + 1] * xs1 + aj[lda2 + 2] * xs2;
                y_ptr[j + 3] += aj[lda3] * xs0 + aj[lda3 + 1] * xs1 + aj[lda3 + 2] * xs2;
                aj += step4;
            }
            for (; j < n; j++) {
                y_ptr[j] += aj[0] * xs0 + aj[1] * xs1 + aj[2] * xs2;
                aj += lda;
            }
        } else {
            for (j = 0; j < n; j++) {
                *y_ptr += aj[0] * xs0 + aj[1] * xs1 + aj[2] * xs2;
                y_ptr += inc_y;
                aj += lda;
            }
        }
        return 0;
    }

    if (row_rest == 2) {
        const FLOAT xs0 = x_ptr[0] * alpha;
        const FLOAT xs1 = x_ptr[inc_x] * alpha;
        const BLASLONG n4 = n & -4;
        FLOAT *aj = a_ptr;

        y_ptr = y;

        if (lda == 2 && inc_y == 1) {
            /* Columns are densely packed: 8 floats per four columns. */
            for (j = 0; j < n4; j += 4) {
                y_ptr[j]     += aj[0] * xs0 + aj[1] * xs1;
                y_ptr[j + 1] += aj[2] * xs0 + aj[3] * xs1;
                y_ptr[j + 2] += aj[4] * xs0 + aj[5] * xs1;
                y_ptr[j + 3] += aj[6] * xs0 + aj[7] * xs1;
                aj += 8;
            }
            for (; j < n; j++) {
                y_ptr[j] += aj[0] * xs0 + aj[1] * xs1;
                aj += 2;
            }
        } else if (inc_y == 1) {
            const BLASLONG lda2 = 2 * lda;
            const BLASLONG lda3 = lda2 + lda;
            const BLASLONG step4 = 4 * lda;

            for (j = 0; j < n4; j += 4) {
                y_ptr[j]     += aj[0]    * xs0 + aj[1]        * xs1;
                y_ptr[j + 1] += aj[lda]  * xs0 + aj[lda + 1]  * xs1;
                y_ptr[j + 2] += aj[lda2] * xs0 + aj[lda2 + 1] * xs1;
                y_ptr[j + 3] += aj[lda3] * xs0 + aj[lda3 + 1] * xs1;
                aj += step4;
            }
            for (; j < n; j++) {
                y_ptr[j] += aj[0] * xs0 + aj[1] * xs1;
                aj += lda;
            }
        } else {
            for (j = 0; j < n; j++) {
                *y_ptr += aj[0] * xs0 + aj[1] * xs1;
                y_ptr += inc_y;
                aj += lda;
            }
        }
        return 0;
    }

    /* Exactly one row is left. */
    const FLOAT xs = *x_ptr * alpha;
    const BLASLONG n4 = n & -4;
    FLOAT *aj = a_ptr;

    y_ptr = y;

    if (lda == 1 && inc_y == 1) {
        /* The row is contiguous, so a and y are indexed in lock-step. */
        for (j = 0; j < n4; j += 4) {
            y_ptr[j]     += aj[j]     * xs;
            y_ptr[j + 1] += aj[j + 1] * xs;
            y_ptr[j + 2] += aj[j + 2] * xs;
            y_ptr[j + 3] += aj[j + 3] * xs;
        }
        for (; j < n; j++)
            y_ptr[j] += aj[j] * xs;
    } else if (inc_y == 1) {
        const BLASLONG lda2 = 2 * lda;
        const BLASLONG lda3 = lda2 + lda;
        const BLASLONG step4 = 4 * lda;

        for (j = 0; j < n4; j += 4) {
            y_ptr[j]     += aj[0]    * xs;
            y_ptr[j + 1] += aj[lda]  * xs;
            y_ptr[j + 2] += aj[lda2] * xs;
            y_ptr[j + 3] += aj[lda3] * xs;
            aj += step4;
        }
        for (; j < n; j++) {
            y_ptr[j] += aj[0] * xs;
            aj += lda;
        }
    } else {
        for (j = 0; j < n; j++) {
            *y_ptr += aj[0] * xs;
            y_ptr += inc_y;
            aj += lda;
        }
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,162 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vectorised maximum of x[0 .. n-1].
 *
 * Each loop iteration loads sixteen 16-byte vectors (256 bytes, i.e. 64
 * floats), reduces them pairwise with vfmaxsb and folds the result into the
 * running maximum kept in v0. The trip count is n >> 6 and the loop is
 * bottom-tested with brctg, so the caller must pass a positive multiple of 64
 * (CNAME does: it passes n & -64 and only when that is > 0).
 *
 * After the loop the four lanes of v0 are reduced to lane 0 and returned.
 *
 * Fix: the accumulate step used "%%16" as its third operand; it now names
 * the vector register "%%v16" like every other operand in the block.
 */
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT max;

    __asm__ volatile (
        "vl      %%v0,0(%2)                 \n\t"   /* seed the running max with x[0..3] */
        "srlg    %%r0,%1,6                  \n\t"   /* r0 = n / 64 iterations            */
        "xgr     %%r1,%%r1                  \n\t"   /* r1 = byte offset into x           */
        "0:                                 \n\t"
        "pfd     1, 1024(%%r1,%2)           \n\t"

        "vl      %%v16,0(%%r1,%2)           \n\t"
        "vl      %%v17,16(%%r1,%2)          \n\t"
        "vl      %%v18,32(%%r1,%2)          \n\t"
        "vl      %%v19,48(%%r1,%2)          \n\t"
        "vl      %%v20,64(%%r1,%2)          \n\t"
        "vl      %%v21,80(%%r1,%2)          \n\t"
        "vl      %%v22,96(%%r1,%2)          \n\t"
        "vl      %%v23,112(%%r1,%2)         \n\t"
        "vl      %%v24,128(%%r1,%2)         \n\t"
        "vl      %%v25,144(%%r1,%2)         \n\t"
        "vl      %%v26,160(%%r1,%2)         \n\t"
        "vl      %%v27,176(%%r1,%2)         \n\t"
        "vl      %%v28,192(%%r1,%2)         \n\t"
        "vl      %%v29,208(%%r1,%2)         \n\t"
        "vl      %%v30,224(%%r1,%2)         \n\t"
        "vl      %%v31,240(%%r1,%2)         \n\t"

        /* 16 -> 8 vectors */
        "vfmaxsb %%v16,%%v16,%%v24,0        \n\t"
        "vfmaxsb %%v17,%%v17,%%v25,0        \n\t"
        "vfmaxsb %%v18,%%v18,%%v26,0        \n\t"
        "vfmaxsb %%v19,%%v19,%%v27,0        \n\t"
        "vfmaxsb %%v20,%%v20,%%v28,0        \n\t"
        "vfmaxsb %%v21,%%v21,%%v29,0        \n\t"
        "vfmaxsb %%v22,%%v22,%%v30,0        \n\t"
        "vfmaxsb %%v23,%%v23,%%v31,0        \n\t"

        /* 8 -> 4 vectors */
        "vfmaxsb %%v16,%%v16,%%v20,0        \n\t"
        "vfmaxsb %%v17,%%v17,%%v21,0        \n\t"
        "vfmaxsb %%v18,%%v18,%%v22,0        \n\t"
        "vfmaxsb %%v19,%%v19,%%v23,0        \n\t"

        /* 4 -> 2 vectors */
        "vfmaxsb %%v16,%%v16,%%v18,0        \n\t"
        "vfmaxsb %%v17,%%v17,%%v19,0        \n\t"

        /* 2 -> 1 vector */
        "vfmaxsb %%v16,%%v16,%%v17,0        \n\t"

        /* fold this iteration into the running maximum */
        "vfmaxsb %%v0,%%v0,%%v16,0          \n\t"

        "agfi    %%r1, 256                  \n\t"
        "brctg   %%r0, 0b                   \n\t"

        /* horizontal reduction of the four lanes of v0 into lane 0 */
        "veslg   %%v16,%%v0,32              \n\t"   /* move lanes 1 and 3 onto lanes 0 and 2 */
        "vfmaxsb %%v0,%%v0,%%v16,0          \n\t"

        "vrepf   %%v16,%%v0,2               \n\t"   /* broadcast lane 2 */
        "wfmaxsb %%v0,%%v0,%%v16,0          \n\t"
        "ler     %0,%%f0                        "
        :"=f"(max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return max;
}
|
||||
|
||||
/*
 * Returns the largest of the n elements x[0], x[inc_x], x[2*inc_x], ...
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. For a unit stride the leading
 * multiple-of-64 part is handed to smax_kernel_64 and the rest is scanned
 * here; any other stride is scanned here entirely, four elements per step.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG idx;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = smax_kernel_64(bulk, x);
            idx = bulk;
        } else {
            best = x[0];
            idx = 1;
        }

        for (; idx < n; idx++) {
            if (x[idx] > best)
                best = x[idx];
        }
        return best;
    }

    /* Strided input: idx is the element offset, seen counts elements. */
    best = x[0];
    idx = 0;

    for (seen = 0; seen + 4 <= n; seen += 4) {
        if (x[idx] > best)
            best = x[idx];
        if (x[idx + inc_x] > best)
            best = x[idx + inc_x];
        if (x[idx + 2 * inc_x] > best)
            best = x[idx + 2 * inc_x];
        if (x[idx + 3 * inc_x] > best)
            best = x[idx + 3 * inc_x];
        idx += inc_x * 4;
    }

    for (; seen < n; seen++) {
        if (x[idx] > best)
            best = x[idx];
        idx += inc_x;
    }

    return best;
}
|
||||
|
|
@ -0,0 +1,162 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Vectorised minimum of x[0 .. n-1].
 *
 * Each loop iteration loads sixteen 16-byte vectors (256 bytes, i.e. 64
 * floats), reduces them pairwise with vfminsb and folds the result into the
 * running minimum kept in v0. The trip count is n >> 6 and the loop is
 * bottom-tested with brctg, so the caller must pass a positive multiple of 64
 * (CNAME does: it passes n & -64 and only when that is > 0).
 *
 * After the loop the four lanes of v0 are reduced to lane 0 and returned.
 *
 * Fix: the accumulate step used "%%16" as its third operand; it now names
 * the vector register "%%v16" like every other operand in the block.
 */
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
{
    FLOAT min;

    __asm__ volatile (
        "vl      %%v0,0(%2)                 \n\t"   /* seed the running min with x[0..3] */
        "srlg    %%r0,%1,6                  \n\t"   /* r0 = n / 64 iterations            */
        "xgr     %%r1,%%r1                  \n\t"   /* r1 = byte offset into x           */
        "0:                                 \n\t"
        "pfd     1, 1024(%%r1,%2)           \n\t"

        "vl      %%v16,0(%%r1,%2)           \n\t"
        "vl      %%v17,16(%%r1,%2)          \n\t"
        "vl      %%v18,32(%%r1,%2)          \n\t"
        "vl      %%v19,48(%%r1,%2)          \n\t"
        "vl      %%v20,64(%%r1,%2)          \n\t"
        "vl      %%v21,80(%%r1,%2)          \n\t"
        "vl      %%v22,96(%%r1,%2)          \n\t"
        "vl      %%v23,112(%%r1,%2)         \n\t"
        "vl      %%v24,128(%%r1,%2)         \n\t"
        "vl      %%v25,144(%%r1,%2)         \n\t"
        "vl      %%v26,160(%%r1,%2)         \n\t"
        "vl      %%v27,176(%%r1,%2)         \n\t"
        "vl      %%v28,192(%%r1,%2)         \n\t"
        "vl      %%v29,208(%%r1,%2)         \n\t"
        "vl      %%v30,224(%%r1,%2)         \n\t"
        "vl      %%v31,240(%%r1,%2)         \n\t"

        /* 16 -> 8 vectors */
        "vfminsb %%v16,%%v16,%%v24,0        \n\t"
        "vfminsb %%v17,%%v17,%%v25,0        \n\t"
        "vfminsb %%v18,%%v18,%%v26,0        \n\t"
        "vfminsb %%v19,%%v19,%%v27,0        \n\t"
        "vfminsb %%v20,%%v20,%%v28,0        \n\t"
        "vfminsb %%v21,%%v21,%%v29,0        \n\t"
        "vfminsb %%v22,%%v22,%%v30,0        \n\t"
        "vfminsb %%v23,%%v23,%%v31,0        \n\t"

        /* 8 -> 4 vectors */
        "vfminsb %%v16,%%v16,%%v20,0        \n\t"
        "vfminsb %%v17,%%v17,%%v21,0        \n\t"
        "vfminsb %%v18,%%v18,%%v22,0        \n\t"
        "vfminsb %%v19,%%v19,%%v23,0        \n\t"

        /* 4 -> 2 vectors */
        "vfminsb %%v16,%%v16,%%v18,0        \n\t"
        "vfminsb %%v17,%%v17,%%v19,0        \n\t"

        /* 2 -> 1 vector */
        "vfminsb %%v16,%%v16,%%v17,0        \n\t"

        /* fold this iteration into the running minimum */
        "vfminsb %%v0,%%v0,%%v16,0          \n\t"

        "agfi    %%r1, 256                  \n\t"
        "brctg   %%r0, 0b                   \n\t"

        /* horizontal reduction of the four lanes of v0 into lane 0 */
        "veslg   %%v16,%%v0,32              \n\t"   /* move lanes 1 and 3 onto lanes 0 and 2 */
        "vfminsb %%v0,%%v0,%%v16,0          \n\t"

        "vrepf   %%v16,%%v0,2               \n\t"   /* broadcast lane 2 */
        "wfminsb %%v0,%%v0,%%v16,0          \n\t"
        "ler     %0,%%f0                        "
        :"=f"(min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return min;
}
|
||||
|
||||
/*
 * Returns the smallest of the n elements x[0], x[inc_x], x[2*inc_x], ...
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. For a unit stride the leading
 * multiple-of-64 part is handed to smin_kernel_64 and the rest is scanned
 * here; any other stride is scanned here entirely, four elements per step.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG idx;
    BLASLONG seen;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x == 1) {
        const BLASLONG bulk = n & -64;

        if (bulk > 0) {
            best = smin_kernel_64(bulk, x);
            idx = bulk;
        } else {
            best = x[0];
            idx = 1;
        }

        for (; idx < n; idx++) {
            if (x[idx] < best)
                best = x[idx];
        }
        return best;
    }

    /* Strided input: idx is the element offset, seen counts elements. */
    best = x[0];
    idx = 0;

    for (seen = 0; seen + 4 <= n; seen += 4) {
        if (x[idx] < best)
            best = x[idx];
        if (x[idx + inc_x] < best)
            best = x[idx + inc_x];
        if (x[idx + 2 * inc_x] < best)
            best = x[idx + 2 * inc_x];
        if (x[idx + 3 * inc_x] < best)
            best = x[idx + 3 * inc_x];
        idx += inc_x * 4;
    }

    for (; seen < n; seen++) {
        if (x[idx] < best)
            best = x[idx];
        idx += inc_x;
    }

    return best;
}
|
||||
|
|
@ -0,0 +1,246 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * One 64-byte stanza of the rotation: four vectors of x and four of y at
 * byte offsets o0..o3 (relative to r1) are replaced by
 *     x' = x*c + y*s
 *     y' = y*c - x*s
 * with c broadcast in v0 and s broadcast in v1.
 */
#define SROT_STANZA(o0, o1, o2, o3)                                           \
    "vl     %%v24," #o0 "(%%r1,%[x])    \n\t"                                 \
    "vl     %%v25," #o1 "(%%r1,%[x])    \n\t"                                 \
    "vl     %%v26," #o2 "(%%r1,%[x])    \n\t"                                 \
    "vl     %%v27," #o3 "(%%r1,%[x])    \n\t"                                 \
    "vl     %%v16," #o0 "(%%r1,%[y])    \n\t"                                 \
    "vl     %%v17," #o1 "(%%r1,%[y])    \n\t"                                 \
    "vl     %%v18," #o2 "(%%r1,%[y])    \n\t"                                 \
    "vl     %%v19," #o3 "(%%r1,%[y])    \n\t"                                 \
                                                                              \
    "vfmsb  %%v28,%%v24,%%v0            \n\t" /* x*c */                       \
    "vfmsb  %%v29,%%v25,%%v0            \n\t"                                 \
    "vfmsb  %%v20,%%v24,%%v1            \n\t" /* x*s */                       \
    "vfmsb  %%v21,%%v25,%%v1            \n\t"                                 \
    "vfmsb  %%v30,%%v26,%%v0            \n\t"                                 \
    "vfmsb  %%v22,%%v26,%%v1            \n\t"                                 \
    "vfmsb  %%v31,%%v27,%%v0            \n\t"                                 \
    "vfmsb  %%v23,%%v27,%%v1            \n\t"                                 \
                                                                              \
    "vfmasb %%v28,%%v16,%%v1,%%v28      \n\t" /* y*s + x*c */                 \
    "vfmssb %%v20,%%v16,%%v0,%%v20      \n\t" /* y*c - x*s */                 \
    "vfmasb %%v29,%%v17,%%v1,%%v29      \n\t"                                 \
    "vfmssb %%v21,%%v17,%%v0,%%v21      \n\t"                                 \
    "vfmasb %%v30,%%v18,%%v1,%%v30      \n\t"                                 \
    "vfmssb %%v22,%%v18,%%v0,%%v22      \n\t"                                 \
    "vfmasb %%v31,%%v19,%%v1,%%v31      \n\t"                                 \
    "vfmssb %%v23,%%v19,%%v0,%%v23      \n\t"                                 \
                                                                              \
    "vst    %%v28," #o0 "(%%r1,%[x])    \n\t"                                 \
    "vst    %%v29," #o1 "(%%r1,%[x])    \n\t"                                 \
    "vst    %%v30," #o2 "(%%r1,%[x])    \n\t"                                 \
    "vst    %%v31," #o3 "(%%r1,%[x])    \n\t"                                 \
    "vst    %%v20," #o0 "(%%r1,%[y])    \n\t"                                 \
    "vst    %%v21," #o1 "(%%r1,%[y])    \n\t"                                 \
    "vst    %%v22," #o2 "(%%r1,%[y])    \n\t"                                 \
    "vst    %%v23," #o3 "(%%r1,%[y])    \n\t"

/*
 * Applies the plane rotation (c, s) in place to the first n elements of the
 * contiguous vectors x and y, 64 floats (256 bytes of each vector) per loop
 * iteration. The trip count is n >> 6 and the loop is bottom-tested, so n
 * must be a positive multiple of 64. c and s are read through pointers
 * because they are loaded with vlrepf from memory.
 */
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
    __asm__ (
        "vlrepf %%v0,%[c]                   \n\t"   /* v0 = c in every lane */
        "vlrepf %%v1,%[s]                   \n\t"   /* v1 = s in every lane */
        "srlg   %%r0,%[n],6                 \n\t"   /* r0 = n / 64          */
        "xgr    %%r1,%%r1                   \n\t"   /* r1 = byte offset     */
        "0:                                 \n\t"
        "pfd    2, 1024(%%r1,%[x])          \n\t"
        "pfd    2, 1024(%%r1,%[y])          \n\t"

        SROT_STANZA(0, 16, 32, 48)
        SROT_STANZA(64, 80, 96, 112)
        SROT_STANZA(128, 144, 160, 176)
        SROT_STANZA(192, 208, 224, 240)

        "agfi   %%r1,256                    \n\t"
        "brctg  %%r0,0b                         "
        :
        :[n] "r"(n),[x] "ZR"((FLOAT (*)[n])x),[y] "ZR"((FLOAT (*)[n])y),[c] "m"(*c),[s] "m"(*s)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}

#undef SROT_STANZA
|
||||
|
||||
/*
 * Applies the plane rotation
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]
 * to n element pairs taken with strides inc_x and inc_y.
 *
 * When both strides are 1 the leading multiple-of-64 part goes through
 * srot_kernel_64; everything else is done by the scalar loops below.
 * Does nothing for n <= 0. Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k = 0;
    FLOAT rotated_x;

    if (n <= 0)
        return 0;

    if (inc_x != 1 || inc_y != 1) {
        BLASLONG px = 0;
        BLASLONG py = 0;

        for (; k < n; k++) {
            rotated_x = c * x[px] + s * y[py];
            /* y is stored first, from the still-unmodified x. */
            y[py] = c * y[py] - s * x[px];
            x[px] = rotated_x;
            px += inc_x;
            py += inc_y;
        }
        return 0;
    }

    const BLASLONG bulk = n & -64;

    if (bulk > 0) {
        /* The kernel reads c and s from memory, so pass private copies. */
        FLOAT cos_copy = c;
        FLOAT sin_copy = s;

        srot_kernel_64(bulk, x, y, &cos_copy, &sin_copy);
        k = bulk;
    }

    for (; k < n; k++) {
        rotated_x = c * x[k] + s * y[k];
        y[k] = c * y[k] - s * x[k];
        x[k] = rotated_x;
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,201 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Load one vector of x at byte offset off, multiply it by v0, store it back. */
#define SSCAL_SCALE_VEC(vr, off)                                              \
    "vl     %%v" #vr "," #off "(%%r1,%[x])   \n\t"                            \
    "vfmsb  %%v" #vr ",%%v" #vr ",%%v0       \n\t"                            \
    "vst    %%v" #vr "," #off "(%%r1,%[x])   \n\t"

/*
 * Scales the first n elements of the contiguous vector x by da in place,
 * 32 floats (128 bytes) per loop iteration. The trip count is n >> 5 and the
 * loop is bottom-tested, so n must be a positive multiple of 32.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x)
{
    __asm__ volatile (
        "vlrepf %%v0,%[da]                  \n\t"   /* v0 = da in every lane */
        "srlg   %%r0,%[n],5                 \n\t"   /* r0 = n / 32           */
        "xgr    %%r1,%%r1                   \n\t"   /* r1 = byte offset      */
        "0:                                 \n\t"
        "pfd    2, 1024(%%r1,%[x])          \n\t"

        SSCAL_SCALE_VEC(24, 0)
        SSCAL_SCALE_VEC(25, 16)
        SSCAL_SCALE_VEC(26, 32)
        SSCAL_SCALE_VEC(27, 48)
        SSCAL_SCALE_VEC(24, 64)
        SSCAL_SCALE_VEC(25, 80)
        SSCAL_SCALE_VEC(26, 96)
        SSCAL_SCALE_VEC(27, 112)

        "agfi   %%r1,128                    \n\t"
        "brctg  %%r0,0b                         "
        :
        :[n] "r"(n),[da] "m"(da),[x] "ZR"((FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v24","v25","v26","v27"
    );
}

#undef SSCAL_SCALE_VEC
|
||||
|
||||
/*
 * Stores zeros over the first n elements of the contiguous vector x,
 * 32 floats (128 bytes) per loop iteration. The trip count is n >> 5 and the
 * loop is bottom-tested, so n must be a positive multiple of 32.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x)
{
    __asm__ volatile(
        /* four zeroed source registers, reused for both halves of a block */
        "vzero  %%v24                       \n\t"
        "vzero  %%v25                       \n\t"
        "vzero  %%v26                       \n\t"
        "vzero  %%v27                       \n\t"
        "srlg   %%r0,%[n],5                 \n\t"   /* r0 = n / 32      */
        "xgr    %%r1,%%r1                   \n\t"   /* r1 = byte offset */
        "0:                                 \n\t"
        "pfd    2, 1024(%%r1,%[x])          \n\t"

        "vst    %%v24,0(%%r1,%[x])          \n\t"
        "vst    %%v25,16(%%r1,%[x])         \n\t"
        "vst    %%v26,32(%%r1,%[x])         \n\t"
        "vst    %%v27,48(%%r1,%[x])         \n\t"
        "vst    %%v24,64(%%r1,%[x])         \n\t"
        "vst    %%v25,80(%%r1,%[x])         \n\t"
        "vst    %%v26,96(%%r1,%[x])         \n\t"
        "vst    %%v27,112(%%r1,%[x])        \n\t"

        "agfi   %%r1,128                    \n\t"
        "brctg  %%r0,0b                         "
        :
        :[n] "r"(n),[x] "ZR"((FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v24","v25","v26","v27"
    );
}
|
||||
|
||||
/*
 * Scales the n elements x[0], x[inc_x], x[2*inc_x], ... by da in place.
 *
 * da == 0.0 is special-cased: the elements are overwritten with 0.0 rather
 * than multiplied. For a unit stride the leading multiple-of-32 part goes
 * through the vector kernels; other strides are handled two elements per
 * step by the scalar loops.
 *
 * Does nothing when n <= 0 or inc_x <= 0. Always returns 0.
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are unused.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG pos = 0;
    BLASLONG done = 0;

    if (n <= 0 || inc_x <= 0)
        return 0;

    if (inc_x == 1) {
        const BLASLONG bulk = n & -32;

        if (da == 0.0) {
            if (bulk > 0) {
                sscal_kernel_32_zero(bulk, x);
                done = bulk;
            }
            for (; done < n; done++)
                x[done] = 0.0;
        } else {
            if (bulk > 0) {
                sscal_kernel_32(bulk, da, x);
                done = bulk;
            }
            for (; done < n; done++)
                x[done] = da * x[done];
        }
        return 0;
    }

    /* Strided input: pos is the element offset, done counts elements. */
    const BLASLONG pairs_end = n & -2;

    if (da == 0.0) {
        for (; done < pairs_end; done += 2) {
            x[pos] = 0.0;
            x[pos + inc_x] = 0.0;
            pos += inc_x * 2;
        }
        for (; done < n; done++) {
            x[pos] = 0.0;
            pos += inc_x;
        }
    } else {
        for (; done < pairs_end; done += 2) {
            x[pos] = da * x[pos];
            x[pos + inc_x] = da * x[pos + inc_x];
            pos += inc_x * 2;
        }
        for (; done < n; done++) {
            x[pos] = da * x[pos];
            pos += inc_x;
        }
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,164 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Swap x and y 256 bytes at a time with vector loads and stores.
 *
 * n    : element count. The loop counter is n >> 6, so only whole groups of
 *        64 elements are exchanged.
 *        NOTE(review): the loop body runs before the counter is tested, so
 *        n is assumed to be a positive multiple of 64 - confirm at call sites.
 * x, y : the two buffers; both are read and written.
 *
 * Presumably FLOAT is a 4-byte type in this file (64 elements == 256 bytes
 * per iteration) - TODO confirm.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
/* r0 = n >> 6 (iteration count), r1 = 0 (running byte offset) */
"srlg %%r0,%0,6 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
/* prefetch hints 1024 bytes ahead of the current position in x and y */
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* v16..v31 <- the 256 bytes of x at the current offset */
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
|
||||
/* first 128 bytes: y -> x, staged through v0..v7 */
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
||||
|
||||
/* second 128 bytes: y -> x, reusing v0..v7 */
"vl %%v0, 128(%%r1,%2) \n\t"
|
||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
||||
|
||||
/* the saved x data (v16..v31) -> y */
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
|
||||
/* advance the byte offset by one block; count r0 down and loop */
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Exchange the n elements of x (stride inc_x) with those of y (stride inc_y).
 *
 * Always returns 0; n <= 0 is a no-op. When both strides are 1 the largest
 * multiple of 64 elements goes through sswap_kernel_64 and the remainder is
 * swapped in C. The parameters dummy0, dummy1, dummy3, dummy and dummy2 are
 * not used here.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k = 0;      /* elements already swapped */
    BLASLONG xpos = 0;   /* current index into x (strided path) */
    BLASLONG ypos = 0;   /* current index into y (strided path) */
    FLOAT held;

    if (n <= 0) {
        return 0;
    }

    if (inc_x != 1 || inc_y != 1) {
        /* General strides: element-by-element exchange. */
        for (; k < n; k++, xpos += inc_x, ypos += inc_y) {
            held = y[ypos];
            y[ypos] = x[xpos];
            x[xpos] = held;
        }
        return 0;
    }

    /* Unit strides: vector kernel for whole blocks of 64, scalar tail after. */
    if ((n & -64) > 0) {
        k = n & -64;
        sswap_kernel_64(k, x, y);
    }

    for (; k < n; k++) {
        held = y[k];
        y[k] = x[k];
        x[k] = held;
    }

    return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,211 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vector kernel returning the largest |re| + |im| among the leading complex
 * elements of x, which are read as interleaved 8-byte re/im values.
 *
 * n : number of complex elements. The loop counter is n >> 4 and each
 *     iteration consumes 256 bytes (16 elements).
 *     NOTE(review): the loop body runs before the counter is tested, so n is
 *     assumed to be a positive multiple of 16 - confirm at call sites.
 * x : input data; the instructions below only load from it.
 *
 * This variant reduces with the vfmaxdb / wfmaxdb instructions.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT amax;
|
||||
|
||||
__asm__ volatile (
|
||||
/* seed v0 with |re| + |im| of elements 0 and 1 */
"vleg %%v0,0(%2),0 \n\t"
|
||||
"vleg %%v16,8(%2),0 \n\t"
|
||||
"vleg %%v0,16(%2),1 \n\t"
|
||||
"vleg %%v16,24(%2),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v16,%%v16 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
/* r0 = n >> 4 (iteration count), r1 = 0 (running byte offset) */
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* de-interleave 16 elements: in each 32-byte group the values at offsets
   0 and 16 go to the even register, those at 8 and 24 to the odd one */
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
||||
"vleg %%v24,128(%%r1,%2),0 \n\t"
|
||||
"vleg %%v25,136(%%r1,%2),0 \n\t"
|
||||
"vleg %%v24,144(%%r1,%2),1 \n\t"
|
||||
"vleg %%v25,152(%%r1,%2),1 \n\t"
|
||||
"vleg %%v26,160(%%r1,%2),0 \n\t"
|
||||
"vleg %%v27,168(%%r1,%2),0 \n\t"
|
||||
"vleg %%v26,176(%%r1,%2),1 \n\t"
|
||||
"vleg %%v27,184(%%r1,%2),1 \n\t"
|
||||
"vleg %%v28,192(%%r1,%2),0 \n\t"
|
||||
"vleg %%v29,200(%%r1,%2),0 \n\t"
|
||||
"vleg %%v28,208(%%r1,%2),1 \n\t"
|
||||
"vleg %%v29,216(%%r1,%2),1 \n\t"
|
||||
"vleg %%v30,224(%%r1,%2),0 \n\t"
|
||||
"vleg %%v31,232(%%r1,%2),0 \n\t"
|
||||
"vleg %%v30,240(%%r1,%2),1 \n\t"
|
||||
"vleg %%v31,248(%%r1,%2),1 \n\t"
|
||||
|
||||
/* absolute values of all loaded parts */
"vflpdb %%v16,%%v16 \n\t"
|
||||
"vflpdb %%v17,%%v17 \n\t"
|
||||
"vflpdb %%v18,%%v18 \n\t"
|
||||
"vflpdb %%v19,%%v19 \n\t"
|
||||
"vflpdb %%v20,%%v20 \n\t"
|
||||
"vflpdb %%v21,%%v21 \n\t"
|
||||
"vflpdb %%v22,%%v22 \n\t"
|
||||
"vflpdb %%v23,%%v23 \n\t"
|
||||
"vflpdb %%v24,%%v24 \n\t"
|
||||
"vflpdb %%v25,%%v25 \n\t"
|
||||
"vflpdb %%v26,%%v26 \n\t"
|
||||
"vflpdb %%v27,%%v27 \n\t"
|
||||
"vflpdb %%v28,%%v28 \n\t"
|
||||
"vflpdb %%v29,%%v29 \n\t"
|
||||
"vflpdb %%v30,%%v30 \n\t"
|
||||
"vflpdb %%v31,%%v31 \n\t"
|
||||
|
||||
/* even register += odd register: |re| + |im| per element */
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v18,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v20,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v23 \n\t"
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
|
||||
/* pairwise max tree over the eight sums, folded into the running max v0 */
"vfmaxdb %%v16,%%v16,%%v24,0 \n\t"
|
||||
"vfmaxdb %%v18,%%v18,%%v26,0 \n\t"
|
||||
"vfmaxdb %%v20,%%v20,%%v28,0 \n\t"
|
||||
"vfmaxdb %%v22,%%v22,%%v30,0 \n\t"
|
||||
|
||||
"vfmaxdb %%v16,%%v16,%%v20,0 \n\t"
|
||||
"vfmaxdb %%v18,%%v18,%%v22,0 \n\t"
|
||||
|
||||
"vfmaxdb %%v16,%%v16,%%v18,0 \n\t"
|
||||
|
||||
"vfmaxdb %%v0,%%v0,%%v16,0 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* combine the two lanes of v0 and hand the scalar back through f0.
   NOTE(review): relies on f0 aliasing the first lane of v0 - confirm
   against the z/Architecture vector register description. */
"vrepg %%v16,%%v0,1 \n\t"
|
||||
"wfmaxdb %%v0,%%v0,%%v16,0 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(amax)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return amax;
|
||||
}
|
||||
|
||||
/*
 * Largest |re| + |im| over n complex elements of x.
 *
 * x holds interleaved (re, im) values; inc_x is the stride in complex
 * elements. Returns 0.0 when n <= 0 or inc_x <= 0. With unit stride the
 * largest multiple of 16 elements is handled by zamax_kernel_16 and the
 * remainder is scanned in C; any other stride is scanned entirely in C.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG seen = 0;   /* complex elements examined so far */
    BLASLONG pos = 0;    /* index of the real part of the next element */
    BLASLONG step;       /* distance between real parts, strided path */
    BLASLONG blocked;    /* element count covered by the unrolled part */
    BLASLONG lane;
    FLOAT mag;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x == 1) {
        blocked = n & -16;
        if (blocked > 0) {
            best = zamax_kernel_16(blocked, x);
            seen = blocked;
        } else {
            /* too short for the kernel: start from the first element */
            best = CABS1(x, 0);
            seen = 1;
        }

        for (pos = seen * 2; seen < n; seen++, pos += 2) {
            mag = CABS1(x, pos);
            if (mag > best) {
                best = mag;
            }
        }
        return best;
    }

    /* Non-unit stride: groups of four elements first, then the leftovers. */
    best = CABS1(x, 0);
    step = 2 * inc_x;
    blocked = n & -4;

    for (; seen < blocked; seen += 4, pos += step * 4) {
        for (lane = 0; lane < 4; lane++) {
            mag = CABS1(x, pos + step * lane);
            if (mag > best) {
                best = mag;
            }
        }
    }

    for (; seen < n; seen++, pos += step) {
        mag = CABS1(x, pos);
        if (mag > best) {
            best = mag;
        }
    }

    return best;
}
|
||||
|
|
@ -0,0 +1,221 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vector kernel returning the largest |re| + |im| among the leading complex
 * elements of x, which are read as interleaved 8-byte re/im values.
 *
 * n : number of complex elements. The loop counter is n >> 4 and each
 *     iteration consumes 256 bytes (16 elements) in two 128-byte halves.
 *     NOTE(review): the loop body runs before the counter is tested, so n is
 *     assumed to be a positive multiple of 16 - confirm at call sites.
 * x : input data; the instructions below only load from it.
 *
 * This variant builds each maximum from a compare (vfchdb) followed by a
 * select (vsel).
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT amax;
|
||||
|
||||
__asm__ volatile (
|
||||
/* seed v0 with |re| + |im| of elements 0 and 1 */
"vleg %%v0,0(%2),0 \n\t"
|
||||
"vleg %%v16,8(%2),0 \n\t"
|
||||
"vleg %%v0,16(%2),1 \n\t"
|
||||
"vleg %%v16,24(%2),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v16,%%v16 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
/* r0 = n >> 4 (iteration count), r1 = 0 (running byte offset) */
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* first half: de-interleave the 8 elements at byte offsets 0..127 */
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
/* mask = (first > second); vsel then keeps the larger operand */
"vfchdb %%v24,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v25,%%v18,%%v19 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v26,%%v24,%%v25 \n\t"
|
||||
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
|
||||
|
||||
/* fold the half's maximum into the running maximum v0 */
"vfchdb %%v27,%%v26,%%v0 \n\t"
|
||||
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
|
||||
|
||||
/* second half: the same steps for byte offsets 128..255 */
"vleg %%v16,128(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,136(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,144(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,152(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,160(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,168(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,176(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,184(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,192(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,200(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,208(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,216(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,224(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,232(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,240(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,248(%%r1,%2),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vfchdb %%v24,%%v16,%%v17 \n\t"
|
||||
"vfchdb %%v25,%%v18,%%v19 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v26,%%v24,%%v25 \n\t"
|
||||
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v26,%%v0 \n\t"
|
||||
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* combine the two lanes of v0 and hand the scalar back through f0.
   NOTE(review): relies on f0 aliasing the first lane of v0 - confirm
   against the z/Architecture vector register description. */
"vrepg %%v16,%%v0,1 \n\t"
|
||||
"wfchdb %%v17,%%v0,%%v16 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(amax)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
return amax;
|
||||
}
|
||||
|
||||
/*
 * Largest |re| + |im| over n complex elements of x.
 *
 * x holds interleaved (re, im) values; inc_x is the stride in complex
 * elements. Returns 0.0 when n <= 0 or inc_x <= 0. With unit stride the
 * largest multiple of 16 elements is handled by zamax_kernel_16 and the
 * remainder is scanned in C; any other stride is scanned entirely in C.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG seen = 0;   /* complex elements examined so far */
    BLASLONG pos = 0;    /* index of the real part of the next element */
    BLASLONG step;       /* distance between real parts, strided path */
    BLASLONG blocked;    /* element count covered by the unrolled part */
    BLASLONG lane;
    FLOAT mag;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x == 1) {
        blocked = n & -16;
        if (blocked > 0) {
            best = zamax_kernel_16(blocked, x);
            seen = blocked;
        } else {
            /* too short for the kernel: start from the first element */
            best = CABS1(x, 0);
            seen = 1;
        }

        for (pos = seen * 2; seen < n; seen++, pos += 2) {
            mag = CABS1(x, pos);
            if (mag > best) {
                best = mag;
            }
        }
        return best;
    }

    /* Non-unit stride: groups of four elements first, then the leftovers. */
    best = CABS1(x, 0);
    step = 2 * inc_x;
    blocked = n & -4;

    for (; seen < blocked; seen += 4, pos += step * 4) {
        for (lane = 0; lane < 4; lane++) {
            mag = CABS1(x, pos + step * lane);
            if (mag > best) {
                best = mag;
            }
        }
    }

    for (; seen < n; seen++, pos += step) {
        mag = CABS1(x, pos);
        if (mag > best) {
            best = mag;
        }
    }

    return best;
}
|
||||
|
|
@ -0,0 +1,211 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vector kernel returning the smallest |re| + |im| among the leading complex
 * elements of x, which are read as interleaved 8-byte re/im values.
 *
 * n : number of complex elements. The loop counter is n >> 4 and each
 *     iteration consumes 256 bytes (16 elements).
 *     NOTE(review): the loop body runs before the counter is tested, so n is
 *     assumed to be a positive multiple of 16 - confirm at call sites.
 * x : input data; the instructions below only load from it.
 *
 * This variant reduces with the vfmindb / wfmindb instructions.
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT amin;
|
||||
|
||||
__asm__ volatile (
|
||||
/* seed v0 with |re| + |im| of elements 0 and 1 */
"vleg %%v0,0(%2),0 \n\t"
|
||||
"vleg %%v16,8(%2),0 \n\t"
|
||||
"vleg %%v0,16(%2),1 \n\t"
|
||||
"vleg %%v16,24(%2),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v16,%%v16 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
/* r0 = n >> 4 (iteration count), r1 = 0 (running byte offset) */
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* de-interleave 16 elements: in each 32-byte group the values at offsets
   0 and 16 go to the even register, those at 8 and 24 to the odd one */
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
||||
"vleg %%v24,128(%%r1,%2),0 \n\t"
|
||||
"vleg %%v25,136(%%r1,%2),0 \n\t"
|
||||
"vleg %%v24,144(%%r1,%2),1 \n\t"
|
||||
"vleg %%v25,152(%%r1,%2),1 \n\t"
|
||||
"vleg %%v26,160(%%r1,%2),0 \n\t"
|
||||
"vleg %%v27,168(%%r1,%2),0 \n\t"
|
||||
"vleg %%v26,176(%%r1,%2),1 \n\t"
|
||||
"vleg %%v27,184(%%r1,%2),1 \n\t"
|
||||
"vleg %%v28,192(%%r1,%2),0 \n\t"
|
||||
"vleg %%v29,200(%%r1,%2),0 \n\t"
|
||||
"vleg %%v28,208(%%r1,%2),1 \n\t"
|
||||
"vleg %%v29,216(%%r1,%2),1 \n\t"
|
||||
"vleg %%v30,224(%%r1,%2),0 \n\t"
|
||||
"vleg %%v31,232(%%r1,%2),0 \n\t"
|
||||
"vleg %%v30,240(%%r1,%2),1 \n\t"
|
||||
"vleg %%v31,248(%%r1,%2),1 \n\t"
|
||||
|
||||
/* absolute values of all loaded parts */
"vflpdb %%v16,%%v16 \n\t"
|
||||
"vflpdb %%v17,%%v17 \n\t"
|
||||
"vflpdb %%v18,%%v18 \n\t"
|
||||
"vflpdb %%v19,%%v19 \n\t"
|
||||
"vflpdb %%v20,%%v20 \n\t"
|
||||
"vflpdb %%v21,%%v21 \n\t"
|
||||
"vflpdb %%v22,%%v22 \n\t"
|
||||
"vflpdb %%v23,%%v23 \n\t"
|
||||
"vflpdb %%v24,%%v24 \n\t"
|
||||
"vflpdb %%v25,%%v25 \n\t"
|
||||
"vflpdb %%v26,%%v26 \n\t"
|
||||
"vflpdb %%v27,%%v27 \n\t"
|
||||
"vflpdb %%v28,%%v28 \n\t"
|
||||
"vflpdb %%v29,%%v29 \n\t"
|
||||
"vflpdb %%v30,%%v30 \n\t"
|
||||
"vflpdb %%v31,%%v31 \n\t"
|
||||
|
||||
/* even register += odd register: |re| + |im| per element */
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v18,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v20,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v23 \n\t"
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
|
||||
/* pairwise min tree over the eight sums, folded into the running min v0 */
"vfmindb %%v16,%%v16,%%v24,0 \n\t"
|
||||
"vfmindb %%v18,%%v18,%%v26,0 \n\t"
|
||||
"vfmindb %%v20,%%v20,%%v28,0 \n\t"
|
||||
"vfmindb %%v22,%%v22,%%v30,0 \n\t"
|
||||
|
||||
"vfmindb %%v16,%%v16,%%v20,0 \n\t"
|
||||
"vfmindb %%v18,%%v18,%%v22,0 \n\t"
|
||||
|
||||
"vfmindb %%v16,%%v16,%%v18,0 \n\t"
|
||||
|
||||
"vfmindb %%v0,%%v0,%%v16,0 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* combine the two lanes of v0 and hand the scalar back through f0.
   NOTE(review): relies on f0 aliasing the first lane of v0 - confirm
   against the z/Architecture vector register description. */
"vrepg %%v16,%%v0,1 \n\t"
|
||||
"wfmindb %%v0,%%v0,%%v16,0 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(amin)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
return amin;
|
||||
}
|
||||
|
||||
/*
 * Smallest |re| + |im| over n complex elements of x.
 *
 * x holds interleaved (re, im) values; inc_x is the stride in complex
 * elements. Returns 0.0 when n <= 0 or inc_x <= 0. With unit stride the
 * largest multiple of 16 elements is handled by zamin_kernel_16 and the
 * remainder is scanned in C; any other stride is scanned entirely in C.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT least = 0.0;
    BLASLONG seen = 0;   /* complex elements examined so far */
    BLASLONG pos = 0;    /* index of the real part of the next element */
    BLASLONG step;       /* distance between real parts, strided path */
    BLASLONG blocked;    /* element count covered by the unrolled part */
    BLASLONG lane;
    FLOAT mag;

    if (n <= 0 || inc_x <= 0) {
        return least;
    }

    if (inc_x == 1) {
        blocked = n & -16;
        if (blocked > 0) {
            least = zamin_kernel_16(blocked, x);
            seen = blocked;
        } else {
            /* too short for the kernel: start from the first element */
            least = CABS1(x, 0);
            seen = 1;
        }

        for (pos = seen * 2; seen < n; seen++, pos += 2) {
            mag = CABS1(x, pos);
            if (mag < least) {
                least = mag;
            }
        }
        return least;
    }

    /* Non-unit stride: groups of four elements first, then the leftovers. */
    least = CABS1(x, 0);
    step = 2 * inc_x;
    blocked = n & -4;

    for (; seen < blocked; seen += 4, pos += step * 4) {
        for (lane = 0; lane < 4; lane++) {
            mag = CABS1(x, pos + step * lane);
            if (mag < least) {
                least = mag;
            }
        }
    }

    for (; seen < n; seen++, pos += step) {
        mag = CABS1(x, pos);
        if (mag < least) {
            least = mag;
        }
    }

    return least;
}
|
||||
|
|
@ -0,0 +1,221 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
||||
|
||||
/*
 * Vector kernel returning the smallest |re| + |im| among the leading complex
 * elements of x, which are read as interleaved 8-byte re/im values.
 *
 * n : number of complex elements. The loop counter is n >> 4 and each
 *     iteration consumes 256 bytes (16 elements) in two 128-byte halves.
 *     NOTE(review): the loop body runs before the counter is tested, so n is
 *     assumed to be a positive multiple of 16 - confirm at call sites.
 * x : input data; the instructions below only load from it.
 *
 * This variant builds each minimum from a compare (vfchdb) followed by a
 * select (vsel).
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT amin;
|
||||
|
||||
__asm__ volatile (
|
||||
/* seed v0 with |re| + |im| of elements 0 and 1 */
"vleg %%v0,0(%2),0 \n\t"
|
||||
"vleg %%v16,8(%2),0 \n\t"
|
||||
"vleg %%v0,16(%2),1 \n\t"
|
||||
"vleg %%v16,24(%2),1 \n\t"
|
||||
"vflpdb %%v0,%%v0 \n\t"
|
||||
"vflpdb %%v16,%%v16 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
/* r0 = n >> 4 (iteration count), r1 = 0 (running byte offset) */
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
/* first half: de-interleave the 8 elements at byte offsets 0..127 */
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
/* mask = (second > first); vsel then keeps the smaller operand */
"vfchdb %%v24,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v25,%%v19,%%v18 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v26,%%v25,%%v24 \n\t"
|
||||
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
|
||||
|
||||
/* fold the half's minimum into the running minimum v0 */
"vfchdb %%v27,%%v0,%%v26 \n\t"
|
||||
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
|
||||
|
||||
/* second half: the same steps for byte offsets 128..255 */
"vleg %%v16,128(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17,136(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16,144(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17,152(%%r1,%2),1 \n\t"
|
||||
"vleg %%v18,160(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19,168(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18,176(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19,184(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20,192(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21,200(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20,208(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21,216(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22,224(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23,232(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22,240(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23,248(%%r1,%2),1 \n\t"
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
"vfadb %%v16,%%v16,%%v17 \n\t"
|
||||
"vfadb %%v17,%%v18,%%v19 \n\t"
|
||||
"vfadb %%v18,%%v20,%%v21 \n\t"
|
||||
"vfadb %%v19,%%v22,%%v23 \n\t"
|
||||
|
||||
"vfchdb %%v24,%%v17,%%v16 \n\t"
|
||||
"vfchdb %%v25,%%v19,%%v18 \n\t"
|
||||
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
|
||||
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v26,%%v25,%%v24 \n\t"
|
||||
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v0,%%v26 \n\t"
|
||||
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
||||
/* combine the two lanes of v0 and hand the scalar back through f0.
   NOTE(review): relies on f0 aliasing the first lane of v0 - confirm
   against the z/Architecture vector register description. */
"vrepg %%v16,%%v0,1 \n\t"
|
||||
"wfchdb %%v17,%%v16,%%v0 \n\t"
|
||||
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(amin)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
return amin;
|
||||
}
|
||||
|
||||
/*
 * Smallest |re| + |im| over n complex elements of x.
 *
 * x holds interleaved (re, im) values; inc_x is the stride in complex
 * elements. Returns 0.0 when n <= 0 or inc_x <= 0. With unit stride the
 * largest multiple of 16 elements is handled by zamin_kernel_16 and the
 * remainder is scanned in C; any other stride is scanned entirely in C.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT least = 0.0;
    BLASLONG seen = 0;   /* complex elements examined so far */
    BLASLONG pos = 0;    /* index of the real part of the next element */
    BLASLONG step;       /* distance between real parts, strided path */
    BLASLONG blocked;    /* element count covered by the unrolled part */
    BLASLONG lane;
    FLOAT mag;

    if (n <= 0 || inc_x <= 0) {
        return least;
    }

    if (inc_x == 1) {
        blocked = n & -16;
        if (blocked > 0) {
            least = zamin_kernel_16(blocked, x);
            seen = blocked;
        } else {
            /* too short for the kernel: start from the first element */
            least = CABS1(x, 0);
            seen = 1;
        }

        for (pos = seen * 2; seen < n; seen++, pos += 2) {
            mag = CABS1(x, pos);
            if (mag < least) {
                least = mag;
            }
        }
        return least;
    }

    /* Non-unit stride: groups of four elements first, then the leftovers. */
    least = CABS1(x, 0);
    step = 2 * inc_x;
    blocked = n & -4;

    for (; seen < blocked; seen += 4, pos += step * 4) {
        for (lane = 0; lane < 4; lane++) {
            mag = CABS1(x, pos + step * lane);
            if (mag < least) {
                least = mag;
            }
        }
    }

    for (; seen < n; seen++, pos += step) {
        mag = CABS1(x, pos);
        if (mag < least) {
            least = mag;
        }
    }

    return least;
}
|
||||
|
|
@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
|
||||
|
||||
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
FLOAT asum;
|
||||
|
||||
__asm__ (
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v22 \n\t"
|
||||
"vzero %%v23 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v26 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v30 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v31 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v26 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v30 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v31 \n\t"
|
||||
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vfadb %%v24,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v25,%%v23,%%v22 \n\t"
|
||||
"vfadb %%v0,%%v25,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %[asum] ,%%f0"
|
||||
: [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
|
||||
: "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v2 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
"srlg %%r0,%1,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
||||
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vflpdb %%v16, %%v16 \n\t"
|
||||
"vflpdb %%v17, %%v17 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vflpdb %%v19, %%v19 \n\t"
|
||||
"vflpdb %%v20, %%v20 \n\t"
|
||||
"vflpdb %%v21, %%v21 \n\t"
|
||||
"vflpdb %%v22, %%v22 \n\t"
|
||||
"vflpdb %%v23, %%v23 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v2 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v3 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0,%%f0 "
|
||||
:"=f"(asum)
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
|
||||
return asum;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
if ( n1 > 0 )
|
||||
{
|
||||
|
||||
sumf=zasum_kernel_16(n1, x );
|
||||
sumf = zasum_kernel_16(n1, x);
|
||||
i=n1;
|
||||
ip=2*n1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) {
|
||||
|
||||
BLASLONG tempR1 ;
|
||||
__asm__ ("pfd 1, 0(%[x_tmp]) \n\t"
|
||||
"pfd 2, 0(%[y_tmp]) \n\t"
|
||||
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
__asm__ volatile(
|
||||
#if !defined(CONJ)
|
||||
"lgdr %[t1],%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"lgdr %[t1],%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"vflcdb %%v29,%%v29 \n\t" //complement both
|
||||
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i}
|
||||
"vlrepg %%v0,0(%3) \n\t"
|
||||
"vleg %%v1,8(%3),0 \n\t"
|
||||
"wflcdb %%v1,%%v1 \n\t"
|
||||
"vleg %%v1,8(%3),1 \n\t"
|
||||
#else
|
||||
"vleg %%v0,0(%3),1 \n\t"
|
||||
"vflcdb %%v0,%%v0 \n\t"
|
||||
"vleg %%v0,0(%3),0 \n\t"
|
||||
"vlrepg %%v1,8(%3) \n\t"
|
||||
#endif
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
#else
|
||||
"lgdr %[t1],%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"lgdr %[t1],%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"vflcdb %%v28,%%v28 \n\t" //complement both
|
||||
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r}
|
||||
#endif
|
||||
|
||||
"xgr %[t1],%[t1] \n\t"
|
||||
"sllg %[tmp],%[tmp],4 \n\t"
|
||||
"vl %%v30 , 0(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v31 , 16(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v6 , 32(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v7 , 48(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
|
||||
"j 2f \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
|
||||
"vl %%v30, 64(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v31, 80(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v6 , 96(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v7 , 112(%[t1],%[y_tmp]) \n\t"
|
||||
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
|
||||
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v16,0(%%r1,%1) \n\t"
|
||||
"vl %%v17,16(%%r1,%1) \n\t"
|
||||
"vl %%v18,32(%%r1,%1) \n\t"
|
||||
"vl %%v19,48(%%r1,%1) \n\t"
|
||||
"vl %%v20,0(%%r1,%2) \n\t"
|
||||
"vl %%v21,16(%%r1,%2) \n\t"
|
||||
"vl %%v22,32(%%r1,%2) \n\t"
|
||||
"vl %%v23,48(%%r1,%2) \n\t"
|
||||
"vpdi %%v24,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v25,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v26,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v27,%%v19,%%v19,4 \n\t"
|
||||
|
||||
"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t"
|
||||
|
||||
"la %[t1],64(%[t1] ) \n\t"
|
||||
"2: \n\t"
|
||||
"pfd 1, 256(%[t1],%[x_tmp]) \n\t"
|
||||
"pfd 2, 256(%[t1],%[y_tmp]) \n\t"
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
|
||||
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
|
||||
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
|
||||
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
|
||||
|
||||
"vfmadb %%v30, %%v20, %%v28, %%v30 \n\t"
|
||||
"vfmadb %%v31, %%v21, %%v28, %%v31 \n\t"
|
||||
"vfmadb %%v6, %%v22, %%v28, %%v6 \n\t"
|
||||
"vfmadb %%v7, %%v23, %%v28, %%v7 \n\t"
|
||||
"vl %%v16, 64(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v17, 80(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v18, 96(%[t1],%[y_tmp]) \n\t"
|
||||
"vl %%v19, 112(%[t1],%[y_tmp]) \n\t"
|
||||
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
|
||||
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
|
||||
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
|
||||
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"
|
||||
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
|
||||
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
|
||||
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
|
||||
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
|
||||
|
||||
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v28,0(%%r1,%2) \n\t"
|
||||
"vst %%v29,16(%%r1,%2) \n\t"
|
||||
"vst %%v30,32(%%r1,%2) \n\t"
|
||||
"vst %%v31,48(%%r1,%2) \n\t"
|
||||
|
||||
"vst %%v30 , 0(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v31 , 16(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v6 , 32(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v7 , 48(%[t1],%[y_tmp]) \n\t"
|
||||
|
||||
"la %[t1],64(%[t1] ) \n\t"
|
||||
|
||||
"vl %%v16,64(%%r1,%1) \n\t"
|
||||
"vl %%v17,80(%%r1,%1) \n\t"
|
||||
"vl %%v18,96(%%r1,%1) \n\t"
|
||||
"vl %%v19,112(%%r1,%1) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
"vpdi %%v24,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v25,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v26,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v27,%%v19,%%v19,4 \n\t"
|
||||
|
||||
"clgrjl %[t1],%[tmp],1b \n\t"
|
||||
//----------------------------------------------------------------------
|
||||
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
|
||||
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
|
||||
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
|
||||
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
|
||||
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
|
||||
|
||||
"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t"
|
||||
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t"
|
||||
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
|
||||
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
|
||||
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
|
||||
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
|
||||
|
||||
: [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
|
||||
: [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
|
||||
: "cc", "v6","v7", "v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
"vst %%v28,64(%%r1,%2) \n\t"
|
||||
"vst %%v29,80(%%r1,%2) \n\t"
|
||||
"vst %%v30,96(%%r1,%2) \n\t"
|
||||
"vst %%v31,112(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
FLOAT da[2] __attribute__ ((aligned(16)));
|
||||
|
||||
if (n <= 0) return (0);
|
||||
|
||||
|
|
@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
|
||||
BLASLONG n1 = n & -8;
|
||||
|
||||
if (n1) {
|
||||
zaxpy_kernel_8(n1, x, y, da_r,da_i);
|
||||
if (n1) {
|
||||
da[0] = da_r;
|
||||
da[1] = da_i;
|
||||
zaxpy_kernel_8(n1, x, y, da);
|
||||
ix = 2 * n1;
|
||||
}
|
||||
i = n1;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n)
|
||||
: [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"lgr %%r1,%1 \n\t"
|
||||
"lgr %%r2,%2 \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1) \n\t"
|
||||
"pfd 2, 1024(%%r2) \n\t"
|
||||
"mvc 0(256,%%r2),0(%%r1) \n\t"
|
||||
"agfi %%r1,256 \n\t"
|
||||
"agfi %%r2,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
|
||||
:"memory","cc","r0","r1","r2"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
}
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(Z13)
|
||||
|
||||
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
|
||||
|
||||
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"vzero %%v28 \n\t"
|
||||
"vzero %%v29 \n\t"
|
||||
"vzero %%v30 \n\t"
|
||||
"vzero %%v31 \n\t"
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
|
||||
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
|
||||
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
|
||||
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
|
||||
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"
|
||||
|
||||
"vl %%v16, 64(%%r1,%1) \n\t"
|
||||
"vl %%v17, 80(%%r1,%1) \n\t"
|
||||
"vl %%v18, 96(%%r1,%1) \n\t"
|
||||
"vl %%v19, 112(%%r1,%1) \n\t"
|
||||
"vl %%v0, 64(%%r1,%2) \n\t"
|
||||
"vl %%v1, 80(%%r1,%2) \n\t"
|
||||
"vl %%v2, 96(%%r1,%2) \n\t"
|
||||
"vl %%v3, 112(%%r1,%2) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
|
||||
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
|
||||
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
|
||||
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
|
||||
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
|
||||
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"
|
||||
|
||||
"vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
|
||||
|
||||
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b \n\t"
|
||||
"vfadb %%v24,%%v26,%%v24 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v27 \n\t"
|
||||
"vsteg %%v24, 0(%[ptr_d]),0 \n\t"
|
||||
"vsteg %%v24, 8(%[ptr_d]),1 \n\t"
|
||||
"vsteg %%v25,16(%[ptr_d]),1 \n\t"
|
||||
"vsteg %%v25,24(%[ptr_d]),0 \n\t"
|
||||
: [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n)
|
||||
: [mem_x] "m"( *(const double (*)[2*n])x),
|
||||
[mem_y] "m"( *(const double (*)[2*n])y),
|
||||
[ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d)
|
||||
: "cc", "r1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
"vfadb %%v24,%%v24,%%v26 \n\t"
|
||||
"vfadb %%v24,%%v24,%%v28 \n\t"
|
||||
"vfadb %%v24,%%v24,%%v30 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v27 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v29 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v31 \n\t"
|
||||
"vsteg %%v24,0(%3),0 \n\t"
|
||||
"vsteg %%v24,8(%3),1 \n\t"
|
||||
"vsteg %%v25,16(%3),1 \n\t"
|
||||
"vsteg %%v25,24(%3),0 "
|
||||
:
|
||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Portable C variant of zdot_kernel_8.
 *
 * Walks x and y two FLOAT slots at a time (presumably interleaved
 * re/im pairs -- confirm against the caller) and builds four running sums:
 *
 *   d[0] = sum of x[2k]   * y[2k]
 *   d[1] = sum of x[2k+1] * y[2k+1]
 *   d[2] = sum of x[2k]   * y[2k+1]
 *   d[3] = sum of x[2k+1] * y[2k]
 *
 * Each outer pass consumes four pairs (8 FLOAT slots from each array)
 * while the counter advances by 4, so a count that is not a multiple of 4
 * is effectively rounded up.  d is written only once, after the loop.
 */
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
    FLOAT acc_ee = 0.0;   /* even slot of x times even slot of y */
    FLOAT acc_oo = 0.0;   /* odd  slot of x times odd  slot of y */
    FLOAT acc_eo = 0.0;   /* even slot of x times odd  slot of y */
    FLOAT acc_oe = 0.0;   /* odd  slot of x times even slot of y */
    BLASLONG p = 0;       /* slot index of the current pair */
    BLASLONG done;

    for (done = 0; done < n; done += 4) {
        BLASLONG lane;

        /* Four pairs per pass, visited in ascending order. */
        for (lane = 0; lane < 4; lane++) {
            acc_ee += x[p] * y[p];
            acc_oo += x[p + 1] * y[p + 1];
            acc_eo += x[p] * y[p + 1];
            acc_oe += x[p + 1] * y[p];
            p += 2;
        }
    }

    d[0] = acc_ee;
    d[1] = acc_oo;
    d[2] = acc_eo;
    d[3] = acc_oe;
}
|
||||
|
||||
#endif
|
||||
|
||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix=0, iy=0;
|
||||
BLASLONG i;
|
||||
BLASLONG ix, iy;
|
||||
OPENBLAS_COMPLEX_FLOAT result;
|
||||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
|
|
@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
|
||||
BLASLONG n1 = n & -8;
|
||||
BLASLONG j=0;
|
||||
|
||||
if (n1){
|
||||
if (n1)
|
||||
zdot_kernel_8(n1, x, y, dot);
|
||||
i = n1;
|
||||
j = n1 <<1;
|
||||
}
|
||||
|
||||
|
||||
i = n1;
|
||||
BLASLONG j = i * 2;
|
||||
|
||||
while (i < n) {
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
|
@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
|
||||
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||
{
|
||||
__asm__ (
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"lgdr %%r1,%[cos] \n\t"
|
||||
"vlvgp %%v0,%%r1,%%r1 \n\t"
|
||||
"lgdr %%r1,%[sin] \n\t"
|
||||
"vlvgp %%v1,%%r1,%%r1 \n\t"
|
||||
"sllg %[tmp],%[tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27,112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19,112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"clgrjl %%r1,%[tmp],1b \n\t"
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
|
||||
: "cc","r1" ,"v0","v1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
__asm__ (
|
||||
"vlrepg %%v0,%3 \n\t"
|
||||
"vlrepg %%v1,%4 \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
|
||||
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
|
||||
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
|
||||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
|
|
@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
zrot_kernel_16(n1, x, y, c, s);
|
||||
{
|
||||
FLOAT cosa,sina;
|
||||
cosa=c;
|
||||
sina=s;
|
||||
zrot_kernel_16(n1, x, y, &cosa, &sina);
|
||||
i=n1;
|
||||
ix=2*n1;
|
||||
}
|
||||
|
|
@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vlrepg %%v0,0(%1) \n\t"
|
||||
"vleg %%v1,8(%1),0 \n\t"
|
||||
"wflcdb %%v1,%%v1 \n\t"
|
||||
"vleg %%v1,8(%1),1 \n\t"
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
"vpdi %%v24,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v25,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v26,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v27,%%v19,%%v19,4 \n\t"
|
||||
"vpdi %%v28,%%v20,%%v20,4 \n\t"
|
||||
"vpdi %%v29,%%v21,%%v21,4 \n\t"
|
||||
"vpdi %%v30,%%v22,%%v22,4 \n\t"
|
||||
"vpdi %%v31,%%v23,%%v23,4 \n\t"
|
||||
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t"
|
||||
"vfmdb %%v17,%%v17,%%v0 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v0 \n\t"
|
||||
"vfmadb %%v16,%%v24,%%v1,%%v16 \n\t"
|
||||
"vfmadb %%v17,%%v25,%%v1,%%v17 \n\t"
|
||||
"vfmadb %%v18,%%v26,%%v1,%%v18 \n\t"
|
||||
"vfmadb %%v19,%%v27,%%v1,%%v19 \n\t"
|
||||
"vfmadb %%v20,%%v28,%%v1,%%v20 \n\t"
|
||||
"vfmadb %%v21,%%v29,%%v1,%%v21 \n\t"
|
||||
"vfmadb %%v22,%%v30,%%v1,%%v22 \n\t"
|
||||
"vfmadb %%v23,%%v31,%%v1,%%v23 \n\t"
|
||||
|
||||
"vst %%v16,0(%%r1,%2) \n\t"
|
||||
"vst %%v17,16(%%r1,%2) \n\t"
|
||||
"vst %%v18,32(%%r1,%2) \n\t"
|
||||
"vst %%v19,48(%%r1,%2) \n\t"
|
||||
"vst %%v20,64(%%r1,%2) \n\t"
|
||||
"vst %%v21,80(%%r1,%2) \n\t"
|
||||
"vst %%v22,96(%%r1,%2) \n\t"
|
||||
"vst %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vleg %%v0,8(%1),0 \n\t"
|
||||
"wflcdb %%v0,%%v0 \n\t"
|
||||
"vleg %%v0,8(%1),1 \n\t"
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) {
|
||||
BLASLONG tempR1 ;
|
||||
__asm__ (
|
||||
"pfd 2, 0(%[x_tmp]) \n\t"
|
||||
#if !defined(CONJ)
|
||||
"lgdr %[t1],%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"lgdr %[t1],%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"vflcdb %%v29,%%v29 \n\t" //complement both
|
||||
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i}
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
"vpdi %%v16,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v17,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v18,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v19,%%v19,%%v19,4 \n\t"
|
||||
"vpdi %%v20,%%v20,%%v20,4 \n\t"
|
||||
"vpdi %%v21,%%v21,%%v21,4 \n\t"
|
||||
"vpdi %%v22,%%v22,%%v22,4 \n\t"
|
||||
"vpdi %%v23,%%v23,%%v23,4 \n\t"
|
||||
|
||||
#else
|
||||
"lgdr %[t1],%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"lgdr %[t1],%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
|
||||
"vflcdb %%v28,%%v28 \n\t" //complement both
|
||||
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r}
|
||||
#endif
|
||||
|
||||
"xgr %[t1],%[t1] \n\t"
|
||||
"sllg %[tmp],%[tmp],4 \n\t"
|
||||
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
|
||||
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
|
||||
"j 2f \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmdb %%v16, %%v20, %%v28 \n\t"
|
||||
"vfmdb %%v17, %%v21, %%v28 \n\t"
|
||||
"vfmdb %%v18, %%v22, %%v28 \n\t"
|
||||
"vfmdb %%v19, %%v23, %%v28 \n\t"
|
||||
"vl %%v20, 64(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21, 80(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22, 96(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23, 112(%[t1],%[x_tmp]) \n\t"
|
||||
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t"
|
||||
"vfmdb %%v17,%%v17,%%v0 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v0 \n\t"
|
||||
|
||||
"vst %%v16,0(%%r1,%2) \n\t"
|
||||
"vst %%v17,16(%%r1,%2) \n\t"
|
||||
"vst %%v18,32(%%r1,%2) \n\t"
|
||||
"vst %%v19,48(%%r1,%2) \n\t"
|
||||
"vst %%v20,64(%%r1,%2) \n\t"
|
||||
"vst %%v21,80(%%r1,%2) \n\t"
|
||||
"vst %%v22,96(%%r1,%2) \n\t"
|
||||
"vst %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vlrepg %%v0,0(%1) \n\t"
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16,0(%%r1,%2) \n\t"
|
||||
"vl %%v17,16(%%r1,%2) \n\t"
|
||||
"vl %%v18,32(%%r1,%2) \n\t"
|
||||
"vl %%v19,48(%%r1,%2) \n\t"
|
||||
"vl %%v20,64(%%r1,%2) \n\t"
|
||||
"vl %%v21,80(%%r1,%2) \n\t"
|
||||
"vl %%v22,96(%%r1,%2) \n\t"
|
||||
"vl %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t"
|
||||
"vfmdb %%v17,%%v17,%%v0 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v0 \n\t"
|
||||
|
||||
"vst %%v16,0(%%r1,%2) \n\t"
|
||||
"vst %%v17,16(%%r1,%2) \n\t"
|
||||
"vst %%v18,32(%%r1,%2) \n\t"
|
||||
"vst %%v19,48(%%r1,%2) \n\t"
|
||||
"vst %%v20,64(%%r1,%2) \n\t"
|
||||
"vst %%v21,80(%%r1,%2) \n\t"
|
||||
"vst %%v22,96(%%r1,%2) \n\t"
|
||||
"vst %%v23,112(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
);
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
|
||||
"vst %%v24,0(%%r1,%1) \n\t"
|
||||
"vst %%v25,16(%%r1,%1) \n\t"
|
||||
"vst %%v26,32(%%r1,%1) \n\t"
|
||||
"vst %%v27,48(%%r1,%1) \n\t"
|
||||
"vst %%v24,64(%%r1,%1) \n\t"
|
||||
"vst %%v25,80(%%r1,%1) \n\t"
|
||||
"vst %%v26,96(%%r1,%1) \n\t"
|
||||
"vst %%v27,112(%%r1,%1) \n\t"
|
||||
|
||||
"la %[t1],64(%[t1] ) \n\t"
|
||||
"2: \n\t"
|
||||
"pfd 2, 256(%[t1],%[x_tmp]) \n\t"
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
|
||||
"vfmdb %%v30, %%v20, %%v28 \n\t"
|
||||
"vfmdb %%v31, %%v21, %%v28 \n\t"
|
||||
"vfmdb %%v6, %%v22, %%v28 \n\t"
|
||||
"vfmdb %%v7, %%v23, %%v28 \n\t"
|
||||
|
||||
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
|
||||
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"
|
||||
|
||||
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
|
||||
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
|
||||
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
|
||||
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"
|
||||
|
||||
|
||||
"vst %%v30 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v31 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v6 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v7 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
|
||||
"la %[t1],64(%[t1] ) \n\t"
|
||||
|
||||
|
||||
"clgrjl %[t1],%[tmp],1b \n\t"
|
||||
//----------------------------------------------------------------------
|
||||
"vfmdb %%v16, %%v20, %%v28 \n\t"
|
||||
"vfmdb %%v17, %%v21, %%v28 \n\t"
|
||||
"vfmdb %%v18, %%v22, %%v28 \n\t"
|
||||
"vfmdb %%v19, %%v23, %%v28 \n\t"
|
||||
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
|
||||
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
|
||||
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
|
||||
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
|
||||
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
|
||||
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
|
||||
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
|
||||
|
||||
"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
|
||||
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"
|
||||
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
|
||||
: [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
|
||||
: "cc", "v6","v7", "v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) {
|
||||
|
||||
__asm__ ( "pfd 2, 0(%1) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint
|
||||
"vflcdb %%v16,%%v16 \n\t" //complement both
|
||||
"vlvgg %%v16,%%r0,0 \n\t" //restore 1st
|
||||
"vlr %%v17 ,%%v16 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vl %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v16 \n\t"
|
||||
"vsteg %%v24, 0(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v24, 8(%[x_ptr]),0 \n\t"
|
||||
"vl %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v17 \n\t"
|
||||
"vsteg %%v25, 16(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v25, 24(%[x_ptr]),0 \n\t"
|
||||
"vl %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v16 \n\t"
|
||||
"vsteg %%v26, 32(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v26, 40(%[x_ptr]),0 \n\t"
|
||||
"vl %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v17 \n\t"
|
||||
"vsteg %%v27, 48(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v27, 56(%[x_ptr]),0 \n\t"
|
||||
"vl %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v16 \n\t"
|
||||
"vsteg %%v28, 64(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v28, 72(%[x_ptr]),0 \n\t"
|
||||
"vl %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v17 \n\t"
|
||||
"vsteg %%v29, 80(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v29, 88(%[x_ptr]),0 \n\t"
|
||||
"vl %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v16 \n\t"
|
||||
"vsteg %%v30, 96(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v30, 104(%[x_ptr]),0 \n\t"
|
||||
"vl %%v31, 112(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v17 \n\t"
|
||||
"vsteg %%v31, 112(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v31, 120(%[x_ptr]),0 \n\t"
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n),[alpha] "f"(da_i)
|
||||
:"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
|
||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) {
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v18,%%r0,%%r0 \n\t"
|
||||
"vlr %%v19,%%v18 \n\t"
|
||||
"vlr %%v16,%%v18 \n\t"
|
||||
"vlr %%v17,%%v18 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vl %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v18 \n\t"
|
||||
"vst %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vl %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v19 \n\t"
|
||||
"vst %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vl %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v16 \n\t"
|
||||
"vst %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vl %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v17 \n\t"
|
||||
"vst %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vl %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v18 \n\t"
|
||||
"vst %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vl %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v19 \n\t"
|
||||
"vst %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vl %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v16 \n\t"
|
||||
"vst %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vl %%v31,112(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v17 \n\t"
|
||||
"vst %%v31,112(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n),[alpha] "f"(da_r)
|
||||
: "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
|
||||
|
||||
__asm__ ( "pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256( %[x_ptr]) \n\t"
|
||||
"vst %%v24, 0( %[x_ptr]) \n\t"
|
||||
"vst %%v25, 16( %[x_ptr]) \n\t"
|
||||
"vst %%v26, 32( %[x_ptr]) \n\t"
|
||||
"vst %%v27, 48( %[x_ptr]) \n\t"
|
||||
"vst %%v24, 64( %[x_ptr]) \n\t"
|
||||
"vst %%v25, 80( %[x_ptr]) \n\t"
|
||||
"vst %%v26, 96( %[x_ptr]) \n\t"
|
||||
"vst %%v27,112( %[x_ptr]) \n\t"
|
||||
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n)
|
||||
:"cc" ,"r0","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) {
|
||||
|
||||
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i;
|
||||
BLASLONG inc_x2 = 2 * inc_x;
|
||||
BLASLONG inc_x3 = inc_x2 + inc_x;
|
||||
FLOAT t0, t1, t2, t3;
|
||||
FLOAT t0, t1, t2, t3;
|
||||
FLOAT da_r = alpha[0];
|
||||
FLOAT da_i = alpha[1];
|
||||
|
||||
for (i = 0; i < n; i += 4) {
|
||||
for (i = 0; i < n; i += 4)
|
||||
{
|
||||
t0 = da_r * x[0] - da_i * x[1];
|
||||
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
|
||||
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
|
||||
|
|
@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS
|
|||
x[inc_x3] = t3;
|
||||
|
||||
x += 4 * inc_x;
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
||||
BLASLONG i = 0, j = 0;
|
||||
FLOAT temp0;
|
||||
FLOAT temp1;
|
||||
|
||||
FLOAT alpha[2] __attribute__ ((aligned(16)));
|
||||
|
||||
if (inc_x != 1) {
|
||||
inc_x <<= 1;
|
||||
|
|
@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
} else {
|
||||
|
||||
BLASLONG n1 = n & -8;
|
||||
if (n1 > 0) {
|
||||
zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x);
|
||||
if (n1 > 0) {
|
||||
alpha[0] = da_r;
|
||||
alpha[1] = da_i;
|
||||
zscal_kernel_inc_8(n1, alpha, x, inc_x);
|
||||
j = n1;
|
||||
i = n1 * inc_x;
|
||||
}
|
||||
|
|
@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
BLASLONG n1 = n & -8;
|
||||
if (n1 > 0) {
|
||||
|
||||
alpha[0] = da_r;
|
||||
alpha[1] = da_i;
|
||||
|
||||
if (da_r == 0.0)
|
||||
if (da_i == 0)
|
||||
zscal_kernel_8_zero(n1, x);
|
||||
else
|
||||
zscal_kernel_8_zero_r(n1, da_i, x);
|
||||
zscal_kernel_8_zero_r(n1, alpha, x);
|
||||
else
|
||||
if (da_i == 0)
|
||||
zscal_kernel_8_zero_i(n1, da_r, x);
|
||||
zscal_kernel_8_zero_i(n1, alpha, x);
|
||||
else
|
||||
zscal_kernel_8(n1, da_r,da_i, x);
|
||||
zscal_kernel_8(n1, alpha, x);
|
||||
|
||||
i = n1 << 1;
|
||||
j = n1;
|
||||
|
|
@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(Z13_SWAP_A)
|
||||
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
__asm__ volatile(
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"0: \n\t"
|
||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
|
||||
"agfi %%r1,256 \n\t"
|
||||
"brctg %%r0,0b "
|
||||
:
|
||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
|
||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
|
||||
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
|
|
|||
Loading…
Reference in New Issue